llama : fix Baichuan2 13B (#6092)

author slaren <redacted>
Fri, 15 Mar 2024 21:14:16 +0000 (22:14 +0100)
committer GitHub <redacted>
Fri, 15 Mar 2024 21:14:16 +0000 (23:14 +0200)
diff --git a/llama.cpp b/llama.cpp

index 52bd718ba89a5769dd6e9581689f1341abc2ed0d..e4db288ddd907d9700785c1bfd5572bd9eb17a25 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -6000,7 +6000,7 @@ struct llm_build_context {
          inpL = llm_build_inp_embd(ctx0, lctx, hparams, batch, model.tok_embd, cb);
  
          // inp_pos - contains the positions
-        struct ggml_tensor * inp_pos = build_inp_pos();
+        struct ggml_tensor * inp_pos = model.type == MODEL_7B ? build_inp_pos() : nullptr;
  
          // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
          struct ggml_tensor * KQ_mask = build_inp_KQ_mask();
@@ -6050,7 +6050,6 @@ struct llm_build_context {
                  cb(Qcur, "Qcur", il);
                  cb(Kcur, "Kcur", il);
  
-
                  cur = llm_build_kv(ctx0, model, hparams, kv_self, gf,
                          model.layers[il].wo, NULL,
                          Kcur, Vcur, Qcur, KQ_mask, KQ_pos, n_ctx, n_tokens, kv_head, n_kv, 1.0f/sqrtf(float(n_embd_head)), cb, il);