llama : correction of the attn.v.weight quantization for IQ3_XS (#6209)
author    Nexesenex <redacted>
          Fri, 22 Mar 2024 13:32:02 +0000 (14:32 +0100)
committer GitHub <redacted>
          Fri, 22 Mar 2024 13:32:02 +0000 (15:32 +0200)
IQ3_XS was not mentioned, while IQ3_S and IQ3_M were each present twice.

This PR corrects that in the manner which was probably intended initially.
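For context, below is a minimal, self-contained sketch of the corrected branch logic for attn.v.weight. The helper name pick_attn_v_type and the reduced enums are stand-ins for the real llama_tensor_get_type() internals, cut down to just the cases this diff touches; this is an illustration, not the actual llama.cpp code.

    // Sketch (assumed names): the corrected ftype dispatch after this commit.
    #include <cstdio>

    enum ggml_type { GGML_TYPE_IQ3_XXS, GGML_TYPE_IQ3_S, GGML_TYPE_Q4_K };
    enum llama_ftype {
        LLAMA_FTYPE_MOSTLY_IQ3_XXS,
        LLAMA_FTYPE_MOSTLY_IQ3_XS,
        LLAMA_FTYPE_MOSTLY_IQ3_S,
        LLAMA_FTYPE_MOSTLY_IQ3_M,
    };

    // n_gqa is the grouped-query-attention factor; has_imatrix says whether
    // an importance matrix was supplied during quantization.
    static ggml_type pick_attn_v_type(llama_ftype ftype, ggml_type new_type,
                                      int n_gqa, bool has_imatrix) {
        if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
            new_type = n_gqa >= 4 ? GGML_TYPE_Q4_K
                     : !has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
        }
        // After the fix: IQ3_XS is covered, and the IQ3_S case appears once
        // instead of twice (the old second copy was unreachable dead code).
        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS ||
                  ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && n_gqa >= 4) {
            new_type = GGML_TYPE_Q4_K;
        }
        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
            new_type = GGML_TYPE_Q4_K;
        }
        return new_type;
    }

    int main() {
        // With n_gqa >= 4, IQ3_XS now upgrades attn.v.weight to Q4_K as intended.
        printf("%d\n", pick_attn_v_type(LLAMA_FTYPE_MOSTLY_IQ3_XS,
                                        GGML_TYPE_IQ3_S, 4, false) == GGML_TYPE_Q4_K);
        return 0;
    }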

llama.cpp

index 9de4a86022b1a9a63e225d8007290ffc5dcb10a3..91bd6b8d070e9f7de313c6e99fd7ea892af06ba7 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -12027,13 +12027,7 @@ static ggml_type llama_tensor_get_type(quantize_state_internal & qs, ggml_type n
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_XXS) {
             new_type = qs.model.hparams.n_gqa() >= 4 ? GGML_TYPE_Q4_K : !qs.has_imatrix ? GGML_TYPE_IQ3_S : GGML_TYPE_IQ3_XXS;
         }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
-            new_type = GGML_TYPE_Q4_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {
-            new_type = GGML_TYPE_Q4_K;
-        }
-        else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_S && qs.model.hparams.n_gqa() >= 4) {
+        else if ((ftype == LLAMA_FTYPE_MOSTLY_IQ3_XS || ftype == LLAMA_FTYPE_MOSTLY_IQ3_S) && qs.model.hparams.n_gqa() >= 4) {
             new_type = GGML_TYPE_Q4_K;
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_IQ3_M) {