llama : fix sanity checks during quantization (#17721)

author Georgi Gerganov <redacted>

Thu, 4 Dec 2025 08:33:42 +0000 (10:33 +0200)

committer GitHub <redacted>

Thu, 4 Dec 2025 08:33:42 +0000 (10:33 +0200)
author Georgi Gerganov <redacted>
Thu, 4 Dec 2025 08:33:42 +0000 (10:33 +0200)
committer GitHub <redacted>
Thu, 4 Dec 2025 08:33:42 +0000 (10:33 +0200)
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp

index 0b23eaef3a8bfaab7c4208610949c655af582d23..764833749ece0b586b8d2805cd7fcddb6c87b00b 100644 (file)
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -726,21 +726,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
      // sanity checks for models that have attention layers
      if (qs.n_attention_wv != 0 && !is_clip_model)
      {
-        const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
-        // attention layers have a non-zero number of kv heads
-        int32_t n_layer_attn = model.hparams.n_layer - std::count(n_head_kv_iter, n_head_kv_iter + model.hparams.n_layer, 0);
+        int32_t n_layer_all = model.hparams.n_layer;
          if (llama_model_has_encoder(&model)) {
-            // now n_layer_attn is the number of attention layers in the encoder
+            // now n_layer_all is the number of attention layers in the encoder
              // for each decoder block, there are 2 attention layers
-            n_layer_attn += 2 * model.hparams.dec_n_layer;
+            n_layer_all += 2 * model.hparams.dec_n_layer;
          }
  
          // note: for linear-attention models (such as Qwen3 Next) this is the number of linear layers
          const int32_t n_layer_recr = std::count(model.hparams.recurrent_layer_arr.begin(), model.hparams.recurrent_layer_arr.end(), true);
  
-        LLAMA_LOG_INFO("%s: n_layer_attn = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_attn, n_layer_recr, pruned_attention_w);
+        LLAMA_LOG_INFO("%s: n_layer_all = %d, n_layer_recr = %d, pruned_attention_w = %d\n", __func__, n_layer_all, n_layer_recr, pruned_attention_w);
  
-        GGML_ASSERT((qs.n_attention_wv == n_layer_attn - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
+        GGML_ASSERT((qs.n_attention_wv == n_layer_all - pruned_attention_w - n_layer_recr) && "n_attention_wv is unexpected");
      }
  
      size_t total_size_org = 0;
author	Georgi Gerganov <redacted>
	Thu, 4 Dec 2025 08:33:42 +0000 (10:33 +0200)
committer	GitHub <redacted>
	Thu, 4 Dec 2025 08:33:42 +0000 (10:33 +0200)