llama : fix command-r inference when omitting outputs (#6367)

author compilade <redacted>

Thu, 28 Mar 2024 12:05:54 +0000 (08:05 -0400)

committer GitHub <redacted>

Thu, 28 Mar 2024 12:05:54 +0000 (14:05 +0200)
author compilade <redacted>
Thu, 28 Mar 2024 12:05:54 +0000 (08:05 -0400)
committer GitHub <redacted>
Thu, 28 Mar 2024 12:05:54 +0000 (14:05 +0200)
diff --git a/llama.cpp b/llama.cpp

index 892d46fbcfcecf7752c5bd4794357ba23bf02cd8..77ec9b7a1935d6ba3b6d422132b13ddc0fd946af 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -9152,8 +9152,9 @@ struct llm_build_context {
              if (il == n_layer - 1) {
                  // skip computing output for unused tokens
                  struct ggml_tensor * inp_out_ids = build_inp_out_ids();
-                cur  = ggml_get_rows(ctx0,  cur, inp_out_ids);
-                inpL = ggml_get_rows(ctx0, inpL, inp_out_ids);
+                cur     = ggml_get_rows(ctx0,     cur, inp_out_ids);
+                inpL    = ggml_get_rows(ctx0,    inpL, inp_out_ids);
+                ffn_inp = ggml_get_rows(ctx0, ffn_inp, inp_out_ids);
              }
  
              struct ggml_tensor * attn_out = cur;
author	compilade <redacted>
	Thu, 28 Mar 2024 12:05:54 +0000 (08:05 -0400)
committer	GitHub <redacted>
	Thu, 28 Mar 2024 12:05:54 +0000 (14:05 +0200)