From: Ed Addario
Date: Thu, 31 Jul 2025 19:32:18 +0000 (+0100)
Subject: quantize : skip tensor override when in fallback mode (#14995)
X-Git-Tag: upstream/0.0.6073~21
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=daf2dd788066b8b239cb7f68210e090c2124c199;p=pkg%2Fggml%2Fsources%2Fllama.cpp

quantize : skip tensor override when in fallback mode (#14995)
---

diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp
index a00af7a1..0756bf09 100644
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -875,9 +875,10 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
 
             // get more optimal quantization type based on the tensor shape, layer, etc.
             if (!params->pure && ggml_is_quantized(default_type)) {
+                int fallback = qs.n_fallback;
                 new_type = llama_tensor_get_type(qs, new_type, tensor, ftype);
-                // unless the user specifies a type
-                if (params->tensor_types) {
+                // unless the user specifies a type, and the tensor geometry will not require fallback quantisation
+                if (params->tensor_types && qs.n_fallback - fallback == 0) {
                     const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
                     const std::string tensor_name(tensor->name);
                     for (const auto & [tname, qtype] : tensor_types) {
@@ -890,7 +891,6 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                     }
                 }
             }
-
             if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                 new_type = params->token_embedding_type;
             }
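
Note: a minimal sketch of the fallback condition this patch guards against,
assuming the usual block-packing constraint (k-quant types such as Q4_K can
only quantize rows whose length is a multiple of the type's block size);
needs_fallback below is a hypothetical helper for illustration, not part of
the patch or of llama.cpp:

    #include "ggml.h"

    // Hypothetical helper: true when `type` cannot quantize this tensor
    // as-is because the row length ne[0] is not a multiple of the type's
    // block size (e.g. 256 for Q4_K). This is the situation in which
    // llama_tensor_get_type() substitutes a compatible type and
    // increments qs.n_fallback.
    static bool needs_fallback(const struct ggml_tensor * tensor, enum ggml_type type) {
        return tensor->ne[0] % ggml_blck_size(type) != 0;
    }

The patch snapshots qs.n_fallback before calling llama_tensor_get_type() and
applies a user-specified override (--tensor-type in llama-quantize) only when
the counter is unchanged, i.e. when the tensor's geometry did not force a
fallback to a different quantization type.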