llama : fix not enough space in buffer with Qwen (#5086)

author slaren <redacted>
Mon, 22 Jan 2024 22:42:41 +0000 (23:42 +0100)
committer GitHub <redacted>
Mon, 22 Jan 2024 22:42:41 +0000 (23:42 +0100)
diff --git a/llama.cpp b/llama.cpp

index 8c906a22f0ba90092a9477dea8b86a8b8ad16e17..f6f1ec0f403a89eaebe33e612ebac033b213601e 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -4440,9 +4440,9 @@ static struct ggml_tensor * llm_build_kv(
  
      // these nodes are added to the graph together so that they are not reordered
      // by doing so, the number of splits in the graph is reduced
+    ggml_build_forward_expand(graph, q_cur);
      ggml_build_forward_expand(graph, k_cur);
      ggml_build_forward_expand(graph, v_cur);
-    ggml_build_forward_expand(graph, q_cur);
  
      llm_build_kv_store(ctx, hparams, kv, graph, k_cur, v_cur, n_ctx, n_tokens, kv_head, cb, il);