Signed-off-by: Jie Fu <redacted>
// attention layers have a non-zero number of kv heads
int32_t n_attn_layer = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
if (llama_model_has_encoder(&model)) {
- n_attn_layer *= 3;
+ // at this point n_attn_layer counts only the encoder's attention layers
+ // each decoder block adds 2 attention layers (self-attention + cross-attention)
+ n_attn_layer += 2 * model.hparams.dec_n_layer;
}
GGML_ASSERT((qs.n_attention_wv == n_attn_layer - pruned_attention_w) && "n_attention_wv is unexpected");
}
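
As a quick sanity check on the new arithmetic (a minimal sketch with hypothetical layer counts, not part of the patch): the encoder contributes one self-attention layer per block with kv heads, and each decoder block contributes a self-attention plus a cross-attention layer, so the old `n_attn_layer *= 3` only matches when encoder and decoder depths are equal.

// sketch: expected attention-layer count under the new formula
// (hypothetical values; expected_attn_layers is not a llama.cpp function)
#include <cassert>
#include <cstdint>

static int32_t expected_attn_layers(int32_t enc_attn_layers, int32_t dec_n_layer) {
    // encoder: one self-attention layer per encoder block with kv heads
    // decoder: self-attention + cross-attention per block
    return enc_attn_layers + 2 * dec_n_layer;
}

int main() {
    // e.g. 12 encoder blocks, 6 decoder blocks -> 12 + 2*6 = 24
    assert(expected_attn_layers(12, 6) == 24);
    // the old `*= 3` formula agrees only when depths are equal:
    // 12 encoder blocks, 12 decoder blocks -> 12 * 3 == 12 + 2*12 == 36
    assert(expected_attn_layers(12, 12) == 12 * 3);
    return 0;
}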