gemma : use more bits for the token_embd.weight tensor (#5650)
author    Georgi Gerganov <redacted>
          Thu, 22 Feb 2024 21:23:46 +0000 (23:23 +0200)
committer GitHub <redacted>
          Thu, 22 Feb 2024 21:23:46 +0000 (23:23 +0200)
* gemma : use Q8_0 for the token_embd.weight tensor

* llama : quantize token_embd.weight using output type
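
The check added below keys off whether the architecture defines a separate output tensor at all: if LLM_TENSOR_NAMES for the arch has no LLM_TENSOR_OUTPUT entry, token_embd.weight doubles as the output matrix (tied embeddings, as in Gemma) and should receive the output tensor's quantization type. A minimal standalone sketch of that decision, with simplified stand-in tables; the real LLM_TENSOR_NAMES in llama.cpp covers many more arches and tensors, and use_output_quant is a hypothetical helper, not llama.cpp API:

#include <map>
#include <string>

// simplified stand-ins for llama.cpp's arch/tensor enums (hypothetical subset)
enum llm_arch   { LLM_ARCH_GEMMA, LLM_ARCH_LLAMA };
enum llm_tensor { LLM_TENSOR_TOKEN_EMBD, LLM_TENSOR_OUTPUT };

// per-arch tensor-name tables; Gemma defines no separate output tensor,
// so its output head reuses token_embd
static const std::map<llm_arch, std::map<llm_tensor, std::string>> LLM_TENSOR_NAMES = {
    { LLM_ARCH_GEMMA, { { LLM_TENSOR_TOKEN_EMBD, "token_embd" } } },
    { LLM_ARCH_LLAMA, { { LLM_TENSOR_TOKEN_EMBD, "token_embd" },
                        { LLM_TENSOR_OUTPUT,     "output"     } } },
};

// hypothetical helper: true when `name` should take the output tensor's
// quantization type, either because it is the output tensor itself or
// because the arch has no separate output tensor and token_embd doubles as it
static bool use_output_quant(llm_arch arch, const std::string & name) {
    const auto & names = LLM_TENSOR_NAMES.at(arch);
    const bool has_output = names.find(LLM_TENSOR_OUTPUT) != names.end();
    return name == "output.weight" || (!has_output && name == "token_embd.weight");
}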

llama.cpp

index 7770fa0e8f6fa2fbc23397f2a579a6e0b0f8838b..2ebd40df234f0aacc6c126c18df979a2a138ad06 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -10498,7 +10498,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         return std::make_pair(i_layer, n_layer);
     };
 
-    if (name == tn(LLM_TENSOR_OUTPUT, "weight")) {
+    // for arches that share the same tensor between the token embeddings and the output, quantize
+    // the token embeddings with the output tensor's quantization type
+    if (name == tn(LLM_TENSOR_OUTPUT, "weight") ||
+        (LLM_TENSOR_NAMES.at(arch).find(LLM_TENSOR_OUTPUT) == LLM_TENSOR_NAMES.at(arch).end() && name == "token_embd.weight")) {
         int nx = tensor->ne[0];
         if (arch == LLM_ARCH_FALCON || nx % QK_K != 0) {
             new_type = GGML_TYPE_Q8_0;
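
For context on the Q8_0 branch in the hunk: k-quants pack each row into super-blocks of QK_K elements (256 in the default build), so a row width nx that is not a multiple of QK_K cannot be k-quantized and falls back to Q8_0, whose 32-element blocks divide far more widths; Falcon is special-cased for the same reason (Falcon-7B's row width of 4544 is not a multiple of 256). A hedged sketch of that condition; must_fall_back_to_q8_0 is an illustrative helper, not llama.cpp code:

#include <cstdint>
#include <cstdio>

// assumed value: QK_K = 256 is the default k-quant super-block size in ggml
static constexpr int64_t QK_K = 256;

// k-quants need the row width nx to be a multiple of QK_K; otherwise
// fall back to Q8_0, whose 32-element blocks divide far more widths
static bool must_fall_back_to_q8_0(int64_t nx) {
    return nx % QK_K != 0;
}

int main() {
    printf("nx=4096 -> %d\n", (int) must_fall_back_to_q8_0(4096)); // 0: fine for k-quants
    printf("nx=4544 -> %d\n", (int) must_fall_back_to_q8_0(4544)); // 1: Falcon-7B width needs Q8_0
}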