llama : restore intended k-quants mixes for MoE models (#4872)
author    Kawrakow <redacted>
          Thu, 11 Jan 2024 19:43:15 +0000 (20:43 +0100)
committer GitHub <redacted>
          Thu, 11 Jan 2024 19:43:15 +0000 (21:43 +0200)
* Restore intended k-quants quantization mixes for MoE models

* Update Q2_K_S values in the quantize tool

Still using LLaMA-v1 PPL values in the quant descriptions
does not make much sense today, but let's leave that
update for another PR.

---------

Co-authored-by: Iwan Kawrakow <redacted>
Co-authored-by: Georgi Gerganov <redacted>
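
Background for the llama.cpp changes below: at the time of this commit, MoE
models such as Mixtral store one ffn_down tensor per expert, so the old
substring match on "ffn_down.weight" never fired for them (assuming the
per-expert naming of this era, e.g. "blk.0.ffn_down.3.weight") and the
intended k-quants mixes were silently skipped. A minimal standalone sketch of
the matching behavior, not part of the patch:

    // sketch only: "blk.0.ffn_down.3.weight" is an assumed example of the
    // per-expert MoE tensor naming; the dense name is "blk.0.ffn_down.weight"
    #include <cassert>
    #include <string>

    static bool match_old(const std::string & name) {
        return name.find("ffn_down.weight") != std::string::npos;
    }

    static bool match_new(const std::string & name) {
        return name.find("ffn_down") != std::string::npos;
    }

    int main() {
        assert( match_old("blk.0.ffn_down.weight"));   // dense: both match
        assert( match_new("blk.0.ffn_down.weight"));
        assert(!match_old("blk.0.ffn_down.3.weight")); // MoE expert: old match misses,
        assert( match_new("blk.0.ffn_down.3.weight")); // widened match catches it
        return 0;
    }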
examples/quantize/quantize.cpp
llama.cpp
llama.h

diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp
index d27ea5e9132fdce8fdb31266ba6b0fc2192d87c6..f878f6911420ac6ce12c647bc5a336fbf22607a5 100644
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -18,6 +18,7 @@ static const std::vector<struct quant_option> QUANT_OPTIONS = {
     { "Q5_0",   LLAMA_FTYPE_MOSTLY_Q5_0,   " 4.33G, +0.0683 ppl @ LLaMA-v1-7B", },
     { "Q5_1",   LLAMA_FTYPE_MOSTLY_Q5_1,   " 4.70G, +0.0349 ppl @ LLaMA-v1-7B", },
     { "Q2_K",   LLAMA_FTYPE_MOSTLY_Q2_K,   " 2.63G, +0.6717 ppl @ LLaMA-v1-7B", },
+    { "Q2_K_S", LLAMA_FTYPE_MOSTLY_Q2_K_S, " 2.16G, +9.0634 ppl @ LLaMA-v1-7B", },
     { "Q3_K",   LLAMA_FTYPE_MOSTLY_Q3_K_M, "alias for Q3_K_M" },
     { "Q3_K_S", LLAMA_FTYPE_MOSTLY_Q3_K_S, " 2.75G, +0.5551 ppl @ LLaMA-v1-7B", },
     { "Q3_K_M", LLAMA_FTYPE_MOSTLY_Q3_K_M, " 3.07G, +0.2496 ppl @ LLaMA-v1-7B", },
diff --git a/llama.cpp b/llama.cpp
index bd219d49c7ac72ca28eacdf579cd59220b2719b4..d39ff94c7fae696ccba29c25cc5e16b09588fd77 100644
--- a/llama.cpp
+++ b/llama.cpp
@@ -2586,7 +2586,8 @@ static std::string llama_model_ftype_name(llama_ftype ftype) {
         case LLAMA_FTYPE_MOSTLY_Q8_0: return "Q8_0";
 
         // K-quants
-        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "Q2_K";
+        case LLAMA_FTYPE_MOSTLY_Q2_K:   return "Q2_K - Medium";
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: return "Q2_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_S: return "Q3_K - Small";
         case LLAMA_FTYPE_MOSTLY_Q3_K_M: return "Q3_K - Medium";
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: return "Q3_K - Large";
@@ -8955,10 +8956,13 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
             // TODO: explore better strategies
             new_type = GGML_TYPE_Q8_0;
         }
-    } else if (name.find("ffn_down.weight") != std::string::npos) {
+    } else if (name.find("ffn_down") != std::string::npos) {
         if      (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S) {
+            if (qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) new_type = GGML_TYPE_Q4_K;
+        }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q3_K_M) {
-            new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q5_K
+            new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q5_K
                      : arch != LLM_ARCH_FALCON || use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q4_K
                      : GGML_TYPE_Q3_K;
         }
@@ -8967,14 +8971,14 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) {
             if (arch == LLM_ARCH_FALCON) {
-                new_type = qs.i_feed_forward_w2 < 2 ? GGML_TYPE_Q6_K :
+                new_type = qs.i_feed_forward_w2 < qs.n_feed_forward_w2/16 ? GGML_TYPE_Q6_K :
                            use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2) ? GGML_TYPE_Q5_K : GGML_TYPE_Q4_K;
             } else {
                 if (use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
             }
         }
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M && use_more_bits(qs.i_feed_forward_w2, qs.n_feed_forward_w2)) new_type = GGML_TYPE_Q6_K;
-        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < 4) {
+        else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_S && arch != LLM_ARCH_FALCON && qs.i_feed_forward_w2 < qs.n_feed_forward_w2/8) {
             new_type = GGML_TYPE_Q5_K;
         }
         ++qs.i_feed_forward_w2;
@@ -8992,9 +8996,10 @@ static ggml_type get_k_quant_type(quantize_state_internal & qs, ggml_type new_ty
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q4_K_M) new_type = GGML_TYPE_Q5_K;
         else if (ftype == LLAMA_FTYPE_MOSTLY_Q5_K_M) new_type = GGML_TYPE_Q6_K;
     }
-    else if (name.find("ffn_gate.weight") != std::string::npos || name.find("ffn_up.weight") != std::string::npos) {
-        if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
-    }
+    // IK: let's remove this, else Q2_K is almost the same as Q3_K_S
+    //else if (name.find("ffn_gate") != std::string::npos || name.find("ffn_up") != std::string::npos) {
+    //    if (ftype == LLAMA_FTYPE_MOSTLY_Q2_K) new_type = GGML_TYPE_Q3_K;
+    //}
     // This can be used to reduce the size of the Q5_K_S model.
     // The associated PPL increase is fully in line with the size reduction
     //else {
@@ -9043,6 +9048,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
 
         // K-quants
         case LLAMA_FTYPE_MOSTLY_Q2_K:   quantized_type = GGML_TYPE_Q2_K; break;
+        case LLAMA_FTYPE_MOSTLY_Q2_K_S: quantized_type = GGML_TYPE_Q2_K; break;
         case LLAMA_FTYPE_MOSTLY_Q3_K_S:
         case LLAMA_FTYPE_MOSTLY_Q3_K_M:
         case LLAMA_FTYPE_MOSTLY_Q3_K_L: quantized_type = GGML_TYPE_Q3_K; break;
@@ -9101,7 +9107,7 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
         if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
             ++qs.n_attention_wv;
         }
-        else if (name.find("ffn_down.weight") != std::string::npos) {
+        else if (name.find("ffn_down") != std::string::npos) {
             ++qs.n_feed_forward_w2;
         }
     }
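
The replacement of hardcoded tensor counts with fractions above keeps the
dense behavior intact while scaling to MoE. A small arithmetic sketch (the
model shapes are illustrative, not from the patch):

    #include <cstdio>

    int main() {
        const int n_layer  = 32;                 // e.g. a 7B-class dense model
        const int n_expert = 8;                  // e.g. a Mixtral-style MoE model
        const int n_dense  = n_layer;            // one ffn_down tensor per layer
        const int n_moe    = n_layer * n_expert; // one per layer per expert

        // old hardcoded "i < 2" vs the restored fractional "i < n/16"
        std::printf("dense: old upgrades 2, fraction upgrades %d of %d\n", n_dense/16, n_dense);
        std::printf("moe:   old upgrades 2, fraction upgrades %d of %d\n", n_moe/16,  n_moe);
        return 0;
    }

For the 32-layer dense model n/16 = 2, i.e. exactly the old behavior; for the
8-expert MoE model the fraction upgrades 16 of 256 tensors instead of only 2.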
diff --git a/llama.h b/llama.h
index 6fde113ffd4589369e95cf276e170601af2994de..43d41b8f642b57f42e9fcb63b2d962654ea374be 100644
--- a/llama.h
+++ b/llama.h
@@ -105,6 +105,7 @@ extern "C" {
         LLAMA_FTYPE_MOSTLY_Q6_K          = 18, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_XXS       = 19, // except 1d tensors
         LLAMA_FTYPE_MOSTLY_IQ2_XS        = 20, // except 1d tensors
+        LLAMA_FTYPE_MOSTLY_Q2_K_S        = 21, // except 1d tensors
 
         LLAMA_FTYPE_GUESSED = 1024, // not specified in the model file
     };
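
For completeness, a hedged sketch of selecting the new file type through the
public API; the llama_model_quantize_params fields and the exact signature
are assumed from the llama.h of this era:

    #include "llama.h"

    int main() {
        // default params, then request the new Q2_K_S mix (enum value 21 above)
        llama_model_quantize_params params = llama_model_quantize_default_params();
        params.ftype = LLAMA_FTYPE_MOSTLY_Q2_K_S;

        // returns 0 on success
        return (int) llama_model_quantize("ggml-model-f16.gguf",
                                          "ggml-model-Q2_K_S.gguf", &params);
    }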