llama : make Qwen2MoE QKV bias optional (#12477)
author    Sigbjørn Skjæret <redacted>
          Thu, 20 Mar 2025 11:49:59 +0000 (12:49 +0100)
committer GitHub <redacted>
          Thu, 20 Mar 2025 11:49:59 +0000 (12:49 +0100)
src/llama-model.cpp

index 17af8cc30b0cbfb378c151203b58f32ea2eb6455..cd7e0a0c4dbf8b7f83337561b5fa2152b68c84f4 100644
@@ -2210,9 +2210,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                         layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
 
                         // optional bias tensors
-                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, 0);
-                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, 0);
-                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, 0);
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
 
                         layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
 
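The hunk above replaces the required flag 0 with TENSOR_NOT_REQUIRED, so checkpoints exported without QKV bias tensors load cleanly instead of failing with a missing-tensor error. A minimal sketch of that pattern, assuming a map-backed toy loader (Tensor and this create_tensor are simplified stand-ins, not the real llama_model_loader API):

#include <cstdio>
#include <map>
#include <stdexcept>
#include <string>

// hypothetical stand-in for a loaded weight; a sketch of the
// optional-tensor pattern, not the actual llama.cpp types
struct Tensor { std::string name; };

enum tensor_flags { TENSOR_REQUIRED = 0, TENSOR_NOT_REQUIRED = 1 };

// simplified create_tensor: with TENSOR_NOT_REQUIRED, a missing entry
// yields nullptr instead of a hard load error
Tensor * create_tensor(std::map<std::string, Tensor> & weights,
                       const std::string & name, int flags) {
    auto it = weights.find(name);
    if (it == weights.end()) {
        if (flags & TENSOR_NOT_REQUIRED) {
            return nullptr; // caller must null-check before use
        }
        throw std::runtime_error("missing required tensor: " + name);
    }
    return &it->second;
}

int main() {
    std::map<std::string, Tensor> weights = {
        {"blk.0.attn_q.weight", {"blk.0.attn_q.weight"}},
        // no "blk.0.attn_q.bias" entry, as in bias-free Qwen2MoE exports
    };

    Tensor * wq = create_tensor(weights, "blk.0.attn_q.weight", TENSOR_REQUIRED);
    Tensor * bq = create_tensor(weights, "blk.0.attn_q.bias",   TENSOR_NOT_REQUIRED);

    std::printf("wq: %s\n", wq->name.c_str());
    std::printf("bq: %s\n", bq ? bq->name.c_str() : "(absent, skipped)");
    return 0;
}
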
@@ -6193,16 +6193,25 @@ struct llm_build_qwen2moe : public llm_graph_context {
             {
                 // compute Q and K and RoPE them
                 ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
                 cb(Qcur, "Qcur", il);
+                if (model.layers[il].bq) {
+                    Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                    cb(Qcur, "Qcur", il);
+                }
 
                 ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
                 cb(Kcur, "Kcur", il);
+                if (model.layers[il].bk) {
+                    Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                    cb(Kcur, "Kcur", il);
+                }
 
                 ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
                 cb(Vcur, "Vcur", il);
+                if (model.layers[il].bv) {
+                    Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                    cb(Vcur, "Vcur", il);
+                }
 
                 Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
                 Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
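
The graph-build hunk mirrors the loader change: each bias add is emitted only when the tensor was actually loaded, so a null bq/bk/bv simply skips the ggml_add and Qcur/Kcur/Vcur pass through unchanged. A hedged sketch of the same guard outside ggml (plain float vectors stand in for ggml tensors; maybe_add_bias is a hypothetical helper, not a llama.cpp function):

#include <cstdio>
#include <vector>

// plain vectors stand in for ggml tensors in this sketch
using Vec = std::vector<float>;

// hypothetical helper: add the bias only if it exists; mirrors the
// `if (model.layers[il].bq) { Qcur = ggml_add(...); }` guard in the patch
Vec maybe_add_bias(Vec x, const Vec * bias) {
    if (bias) {
        for (size_t i = 0; i < x.size(); ++i) {
            x[i] += (*bias)[i];
        }
    }
    return x; // unchanged when the model was exported without biases
}

int main() {
    Vec q  = {1.0f, 2.0f};
    Vec bq = {0.5f, 0.5f};

    Vec with_bias    = maybe_add_bias(q, &bq);     // bias present
    Vec without_bias = maybe_add_bias(q, nullptr); // bias absent: no-op

    std::printf("with bias:    %.1f %.1f\n", with_bias[0],    with_bias[1]);
    std::printf("without bias: %.1f %.1f\n", without_bias[0], without_bias[1]);
    return 0;
}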