git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
quantize : improve tensor-type pattern matching (#13033)
authorEd Addario <redacted>
Tue, 13 May 2025 17:12:31 +0000 (18:12 +0100)
committerGitHub <redacted>
Tue, 13 May 2025 17:12:31 +0000 (19:12 +0200)
src/llama-quant.cpp
tools/quantize/quantize.cpp

index 820d5128e29ba700702ef31447be9ddeb7c52d53..159b1307a4c5d70ba49d476743350feaf0f7a231 100644 (file)
 #include <thread>
 #include <unordered_map>
 
+// Quantization types. Changes to this struct must be replicated in quantize.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
 static void zeros(std::ofstream & file, size_t n) {
     char zero = 0;
     for (size_t i = 0; i < n; ++i) {
@@ -48,12 +54,6 @@ struct quantize_state_impl {
         {}
 };
 
-// changes to this struct must be replicated in quantize.cpp
-struct tensor_quantization {
-    std::string name;
-    ggml_type quant = GGML_TYPE_COUNT;
-};
-
 static void llama_tensor_dequantize_impl(
     ggml_tensor * tensor, std::vector<no_init<float>> & output, std::vector<std::thread> & workers,
     const size_t nelements, const int nthread
@@ -796,17 +796,19 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
                 // unless the user specifies a type
                 if (params->tensor_types) {
                     const std::vector<tensor_quantization> & tensor_types = *static_cast<const std::vector<tensor_quantization> *>(params->tensor_types);
+                    const std::string tensor_name(tensor->name);
                     for (const auto & [tname, qtype] : tensor_types) {
-                        if (std::regex pattern(tname); std::regex_search(tensor->name, pattern)) {
-                            if (qtype != new_type) {
-                                LLAMA_LOG_DEBUG("(overriding %s -> %s), ", ggml_type_name(new_type), ggml_type_name(qtype));
+                        if (std::regex pattern(tname); std::regex_search(tensor_name, pattern)) {
+                            if  (qtype != new_type) {
+                                LLAMA_LOG_DEBUG("(overriding %s) ", ggml_type_name(new_type));
+                                new_type = qtype;
+                                break; // if two or more types are specified for the tensor, first match wins
                             }
-                            new_type = qtype;
-                            break;
                         }
                     }
                 }
             }
+
             if (params->token_embedding_type < GGML_TYPE_COUNT && strcmp(tensor->name, "token_embd.weight") == 0) {
                 new_type = params->token_embedding_type;
             }
index 0355311dc5c06ecbc6f7fc56fa72627a33434461..3f54af7c581588a6fc684fe8b47f9a6e3e512e6a 100644 (file)
@@ -57,6 +57,12 @@ static const std::vector<quant_option> QUANT_OPTIONS = {
     { "COPY",     LLAMA_FTYPE_ALL_F32,         "only copy tensors, no quantizing",  },
 };
 
+// Quantization types. Changes to this struct must be replicated in llama-quantize.cpp
+struct tensor_quantization {
+    std::string name;
+    ggml_type quant = GGML_TYPE_COUNT;
+};
+
 static const char * const LLM_KV_QUANTIZE_IMATRIX_FILE       = "quantize.imatrix.file";
 static const char * const LLM_KV_QUANTIZE_IMATRIX_DATASET    = "quantize.imatrix.dataset";
 static const char * const LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES  = "quantize.imatrix.entries_count";
@@ -244,56 +250,10 @@ static ggml_type parse_ggml_type(const char * arg) {
             return type;
         }
     }
-    fprintf(stderr, "%s: invalid ggml_type '%s'\n", __func__, arg);
+    fprintf(stderr, "\n%s: invalid ggml_type '%s'\n\n", __func__, arg);
     return GGML_TYPE_COUNT;
 }
 
-// Allowed tensors for arbitrary quantization with --tensor-type option
-static const std::vector<std::string> ALLOWED_TENSOR_TYPE = {
-    "attn_k",
-    "attn_kv_a_mqa",
-    "attn_kv_b",
-    "attn_o",
-    "attn_output",
-    "attn_q",
-    "attn_q_a",
-    "attn_q_b",
-    "attn_qkv",
-    "attn_v",
-    "channel_mix_key",
-    "channel_mix_receptance",
-    "channel_mix_value",
-    "cls",
-    "cls.output",
-    "cross_attn_k",
-    "cross_attn_o",
-    "cross_attn_q",
-    "cross_attn_v",
-    "ffn_act",
-    "ffn_down",
-    "ffn_down_exps",
-    "ffn_down_shexp",
-    "ffn_gate",
-    "ffn_gate_exps",
-    "ffn_gate_shexp",
-    "ffn_up",
-    "ffn_up_exps",
-    "ffn_up_shexp",
-    "ssm_in",
-    "ssm_out",
-    "time_mix_gate",
-    "time_mix_key",
-    "time_mix_output",
-    "time_mix_receptance",
-    "time_mix_value",
-};
-
-// changes to this struct must be replicated in llama-quant.cpp
-struct tensor_quantization {
-    std::string name;
-    ggml_type quant = GGML_TYPE_COUNT;
-};
-
 static bool parse_tensor_type(const char * data, std::vector<tensor_quantization> & tensor_type) {
     const char * sep = strchr(data, '=');
     if (sep == nullptr) {
@@ -306,7 +266,6 @@ static bool parse_tensor_type(const char * data, std::vector<tensor_quantization
         printf("\n%s: missing tensor name\n\n", __func__);
         return false;
     }
-
     if (const size_t qt_len = strlen(sep); qt_len == 1) {
         printf("\n%s: missing quantization type\n\n", __func__);
         return false;
@@ -315,37 +274,15 @@ static bool parse_tensor_type(const char * data, std::vector<tensor_quantization
     std::string tn(data, tn_len);
     std::transform(tn.begin(), tn.end(), tn.begin(), tolower);
     sep++;
-    const std::string qt(sep);
-
-    bool found = false;
-    for (const auto & allowed : ALLOWED_TENSOR_TYPE) {
-        std::string tensor;
-        tensor = tn.rfind('.') != std::string::npos ? tn.substr(tn.rfind('.') + 1) : tn;
-        // handle special case of cls.output
-        std::string cls_output = "cls.output";
-        if (tn.find(cls_output) != std::string::npos) {
-            tensor = "cls.output";
-        }
-        // check if an allowed tensor exists and it's at the end of the kv string
-        if (tensor == allowed) {
-            found = true;
-            break;
-        }
-    }
-    if (!found) {
-        printf("\n%s: invalid tensor name '%s'\n\n", __func__, tn.c_str());
-        return false;
-    }
-
-    if (parse_ggml_type(qt.c_str()) == GGML_TYPE_COUNT) {
-        printf("\n%s: invalid quantization type '%s'\n\n", __func__, qt.c_str());
-        return false;
-    }
-
     tensor_quantization tqz;
     tqz.name = tn;
-    tqz.quant = parse_ggml_type(qt.c_str());
+    tqz.quant = parse_ggml_type(sep);
     tensor_type.emplace_back(std::move(tqz));
+    if (tqz.quant == GGML_TYPE_COUNT) {
+        printf("\n%s: invalid quantization type '%s'\n\n", __func__, sep);
+        return false;
+    }
+
     return true;
 }