Make Q3_K_S be the same as old Q3_K_L for Mixtral-8x7B (#4906)

author Kawrakow <redacted>

Sun, 14 Jan 2024 07:44:30 +0000 (09:44 +0200)

committer GitHub <redacted>

Sun, 14 Jan 2024 07:44:30 +0000 (09:44 +0200)
author Kawrakow <redacted>
Sun, 14 Jan 2024 07:44:30 +0000 (09:44 +0200)
committer GitHub <redacted>
Sun, 14 Jan 2024 07:44:30 +0000 (09:44 +0200)
diff --git a/llama.cpp b/llama.cpp

index 66494974abb6fc8d1d7de3d55e19a1c7225fb6e0..8e20e72a232145a0baaa6ac196a17e62397c59d7 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -8489,9 +8489,16 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
          ++qs.i_feed_forward_w2;
      } else if (name.find("attn_output.weight") != std::string::npos) {
          if (arch != LLM_ARCH_FALCON) {
-            if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
-            else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+            if (qs.model.hparams.n_expert == 8) {
+                if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K   || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M ||
+                    ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S || ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
+                    new_type = GGML_TYPE_Q5_K;
+                }
+            } else {
+                if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K  ) new_type = GGML_TYPE_Q3_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) new_type = GGML_TYPE_Q4_K;
+                else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q5_K;
+            }
          } else {
              if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_L) new_type = GGML_TYPE_Q4_K;
          }
author	Kawrakow <redacted>
	Sun, 14 Jan 2024 07:44:30 +0000 (09:44 +0200)
committer	GitHub <redacted>
	Sun, 14 Jan 2024 07:44:30 +0000 (09:44 +0200)