quantize : skip tensor override when in fallback mode (#14995)

author Ed Addario <redacted>

Thu, 31 Jul 2025 19:32:18 +0000 (20:32 +0100)

committer GitHub <redacted>

Thu, 31 Jul 2025 19:32:18 +0000 (21:32 +0200)
author Ed Addario <redacted>
Thu, 31 Jul 2025 19:32:18 +0000 (20:32 +0100)
committer GitHub <redacted>
Thu, 31 Jul 2025 19:32:18 +0000 (21:32 +0200)
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp

index a00af7a1d1758855ec5f8febba2c4a0015cb7710..0756bf09b8841ba7d85f67d7f4e8fdd277b777d1 100644 (file)
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -875,9 +875,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  
              // get more optimal quantization type based on the tensor shape, layer, etc.
              if (!params->pure && ggml_is_quantized(default_type)) {
+                int fallback = qs.n_fallback;
                  new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
-                // unless the user specifies a type
-                if (params->tensor_types) {
+                // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
+                if (params->tensor_types && qs.n_fallback - fallback == 0) {
                      const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
                      const std::string tensor_name(tensor->name);
                      for (const auto & [tname, qtype] : tensor_types) {
@@ -890,7 +891,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                      }
                  }
              }
-
              if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                  new_type = params->token_embedding_type;
              }
author	Ed Addario <redacted>
	Thu, 31 Jul 2025 19:32:18 +0000 (20:32 +0100)
committer	GitHub <redacted>
	Thu, 31 Jul 2025 19:32:18 +0000 (21:32 +0200)