From: Georgi Gerganov
Date: Thu, 22 Feb 2024 21:23:46 +0000 (+0200)
Subject: gemma : use more bits for the token_embd.weight tensor (#5650)
X-Git-Tag: upstream/0.0.4488~2239
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=96633eeca1265ed03e57230de54032041c58f9cd;p=pkg%2Fggml%2Fsources%2Fllama.cpp

gemma : use more bits for the token_embd.weight tensor (#5650)

* gemma : use Q8_0 for the token_embd.weight tensor

* llama : quantize token_embd.weight using output type
---

diff --git a/llama.cpp b/llama.cpp
index 7770fa0e..2ebd40df 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -10498,7 +10498,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         return std::make_pair(i_layer, n_layer);
     };
 
-    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+    // for arches that share the same tensor between the token embeddings and the output, we quantize the token embeddings
+    // with the quantization of the output tensor
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
+        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
         int nx = tensor->ne[0];
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
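
For context, the sketch below illustrates the rule this commit introduces, in isolation from llama.cpp: when an architecture defines no separate output tensor, its token embedding matrix doubles as the output head, so it is promoted to the higher-precision quantization type normally reserved for the output tensor (Q8_0 here, as for Gemma). This is a minimal standalone sketch, not the llama.cpp implementation; the Arch struct, pick_type helper, and plain-string tensor names are illustrative assumptions.

    // Minimal sketch (assumptions, not llama.cpp code) of quantizing a shared
    // token-embedding/output tensor with the output tensor's type.
    #include <cstdio>
    #include <set>
    #include <string>

    enum class QuantType { Q4_K, Q8_0 };

    static const char * type_name(QuantType t) {
        switch (t) {
            case QuantType::Q4_K: return "Q4_K";
            case QuantType::Q8_0: return "Q8_0";
        }
        return "?";
    }

    struct Arch {
        std::string name;
        std::set<std::string> tensors; // tensor names this architecture defines
    };

    // Hypothetical helper: choose a quantization type for one tensor.
    static QuantType pick_type(const Arch & arch, const std::string & tensor_name) {
        const bool has_separate_output = arch.tensors.count("output.weight") > 0;

        // The output tensor, or token embeddings that double as the output
        // tensor, get more bits (mirroring the Gemma change in this commit).
        if (tensor_name == "output.weight" ||
            (!has_separate_output && tensor_name == "token_embd.weight")) {
            return QuantType::Q8_0;
        }
        return QuantType::Q4_K; // default for other weights in this sketch
    }

    int main() {
        Arch gemma { "gemma", { "token_embd.weight", "blk.0.attn_q.weight" } };
        Arch llama { "llama", { "token_embd.weight", "output.weight" } };

        // Gemma shares token_embd with the output -> promoted to Q8_0.
        std::printf("gemma token_embd -> %s\n", type_name(pick_type(gemma, "token_embd.weight")));
        // LLaMA has a separate output tensor -> token_embd keeps the default.
        std::printf("llama token_embd -> %s\n", type_name(pick_type(llama, "token_embd.weight")));
        return 0;
    }

In the real diff above, the same "no separate output tensor" check is expressed as LLM_TENSOR_NAMES.at(arch) lacking an LLM_TENSOR_OUTPUT entry, which is how llama.cpp records which tensors an architecture defines.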