llama : refactor llama_model_quantize_params to expose a pure C interface (#20346)

author Ed Addario <redacted>

Wed, 1 Apr 2026 05:43:00 +0000 (06:43 +0100)

committer GitHub <redacted>

Wed, 1 Apr 2026 05:43:00 +0000 (08:43 +0300)
author Ed Addario <redacted>
Wed, 1 Apr 2026 05:43:00 +0000 (06:43 +0100)
committer GitHub <redacted>
Wed, 1 Apr 2026 05:43:00 +0000 (08:43 +0300)
diff --git a/include/llama.h b/include/llama.h

index 60e4b6b2ef7ec615e36116479382bf9ffd30ce25..a940f9d648a071b81d8f784004b1440b2fa9e573 100644 (file)
--- a/include/llama.h
+++ b/include/llama.h
@@ -380,22 +380,33 @@ extern "C" {
          size_t                            n_samplers;
      };
  
+    struct llama_model_tensor_override {
+        const char * pattern;
+        enum ggml_type type;
+    };
+
+    struct llama_model_imatrix_data {
+        const char * name;
+        const float * data;
+        size_t size;
+    };
+
      // model quantization parameters
      typedef struct llama_model_quantize_params {
-        int32_t nthread;                      // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
-        enum llama_ftype ftype;               // quantize to this llama_ftype
-        enum ggml_type output_tensor_type;    // output tensor type
-        enum ggml_type token_embedding_type;  // token embeddings tensor type
-        bool allow_requantize;                // allow quantizing non-f32/f16 tensors
-        bool quantize_output_tensor;          // quantize output.weight
-        bool only_copy;                       // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
-        bool pure;                            // quantize all tensors to the default type
-        bool keep_split;                      // quantize to the same number of shards
-        bool dry_run;                         // calculate and show the final quantization size without performing quantization
-        void * imatrix;                       // pointer to importance matrix data
-        void * kv_overrides;                  // pointer to vector containing overrides
-        void * tensor_types;                  // pointer to vector containing tensor types
-        void * prune_layers;                  // pointer to vector containing layer indices to prune
+        int32_t nthread;                                            // number of threads to use for quantizing, if <=0 will use std::thread::hardware_concurrency()
+        enum llama_ftype ftype;                                     // quantize to this llama_ftype
+        enum ggml_type output_tensor_type;                          // output tensor type
+        enum ggml_type token_embedding_type;                        // token embeddings tensor type
+        bool allow_requantize;                                      // allow quantizing non-f32/f16 tensors
+        bool quantize_output_tensor;                                // quantize output.weight
+        bool only_copy;                                             // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
+        bool pure;                                                  // quantize all tensors to the default type
+        bool keep_split;                                            // quantize to the same number of shards
+        bool dry_run;                                               // calculate and show the final quantization size without performing quantization
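+        const struct llama_model_imatrix_data * imatrix;            // pointer to array of importance matrix entries, terminated by an entry with name == NULL
+        const struct llama_model_kv_override * kv_overrides;        // pointer to array of kv overrides, terminated by an entry with key[0] == 0
+        const struct llama_model_tensor_override * tt_overrides;    // pointer to array of tensor overrides, terminated by an entry with pattern == NULL
+        const int32_t * prune_layers;                               // pointer to array of layer indices to prune, terminated by -1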
      } llama_model_quantize_params;
  
      typedef struct llama_logit_bias {
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp

index 3c8b32be08487215f00979679a01b9412eba6fe8..67e1056c53f895f8dff969fcb84b89b0d0d6e936 100644 (file)
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -84,7 +84,6 @@ static std::string remap_imatrix(const std::string & orig_name, const std::map<i
  
          for (const auto & p : mapped) {
              if (p.second == blk) {
-                LLAMA_LOG_DEBUG("(blk.%d imatrix) ", p.first);
                  return new_name.replace(match.position(1), match.length(1), std::to_string(p.first));
              }
          }
@@ -188,10 +187,9 @@ struct quantize_state_impl {
          model(model), params(params)
      {
          // compile regex patterns once - they are expensive
-        if (params->tensor_types) {
-            const auto & tensor_types = *static_cast<const std::vector<tensor_type_option> *>(params->tensor_types);
-            for (const auto & [tname, qtype] : tensor_types) {
-                tensor_type_patterns.emplace_back(std::regex(tname), qtype);
+        if (params->tt_overrides) {
+            for (const auto * p = params->tt_overrides; p->pattern != nullptr; p++) {
+                tensor_type_patterns.emplace_back(std::regex(p->pattern), p->type);
              }
          }
      }
@@ -857,12 +855,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
      constexpr bool use_mmap = false;
  #endif
  
-    llama_model_kv_override * kv_overrides = nullptr;
-    if (params->kv_overrides) {
-        auto * v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
-        kv_overrides = v->data();
-    }
-
+    const llama_model_kv_override * kv_overrides = params->kv_overrides;
      std::vector<std::string> splits = {};
      llama_model_loader ml(/*metadata*/ nullptr, /*set_tensor_data*/ nullptr, /*set_tensor_data_ud*/ nullptr,
          fname_inp, splits, /*file*/ nullptr, use_mmap, /*use_direct_io*/ false, /*check_tensors*/ true, /*no_alloc*/ false, kv_overrides, nullptr);
@@ -879,9 +872,13 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
      if (params->only_copy) {
          ftype = ml.ftype;
      }
+    std::unordered_map<std::string, std::vector<float>> i_data;
      const std::unordered_map<std::string, std::vector<float>> * imatrix_data = nullptr;
      if (params->imatrix) {
-        imatrix_data = static_cast<const std::unordered_map<std::string, std::vector<float>>*>(params->imatrix);
+        for (const llama_model_imatrix_data * p = params->imatrix; p->name != nullptr; p++) {
+            i_data.emplace(p->name, std::vector<float>(p->data, p->data + p->size));
+        }
+        imatrix_data = & i_data;
          if (imatrix_data) {
              LLAMA_LOG_INFO("\n%s: have importance matrix data with %d entries\n",
                             __func__, (int)imatrix_data->size());
@@ -902,7 +899,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
  
      std::vector<int> prune_list = {};
      if (params->prune_layers) {
-        prune_list = *static_cast<const std::vector<int> *>(params->prune_layers);
+        for (const int32_t * p = params->prune_layers; * p != -1; p++) {
+            prune_list.push_back(* p);
+        }
      }
  
      // copy the KV pairs from the input file
@@ -916,20 +915,18 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
      gguf_remove_key(ctx_out.get(), ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str());
  
      if (params->kv_overrides) {
-        const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
-        for (const auto & o : overrides) {
-            if (o.key[0] == 0) break;
-            if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
-                gguf_set_val_f32(ctx_out.get(), o.key, o.val_f64);
-            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
+        for (const llama_model_kv_override * o = params->kv_overrides; o->key[0] != 0; ++o) {
+            if (o->tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
+                gguf_set_val_f32(ctx_out.get(), o->key, o->val_f64);
+            } else if (o->tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
                  // Setting type to UINT32. See https://github.com/ggml-org/llama.cpp/pull/14182 for context
-                gguf_set_val_u32(ctx_out.get(), o.key, (uint32_t)std::abs(o.val_i64));
-            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
-                gguf_set_val_bool(ctx_out.get(), o.key, o.val_bool);
-            } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
-                gguf_set_val_str(ctx_out.get(), o.key, o.val_str);
+                gguf_set_val_u32(ctx_out.get(), o->key, (uint32_t)std::abs(o->val_i64));
+            } else if (o->tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
+                gguf_set_val_bool(ctx_out.get(), o->key, o->val_bool);
+            } else if (o->tag == LLAMA_KV_OVERRIDE_TYPE_STR) {
+                gguf_set_val_str(ctx_out.get(), o->key, o->val_str);
              } else {
-                LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
+                LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o->key);
              }
          }
      }
diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp

index 24e0a4662aefd6e6d5415c2485e687b985b95dba..b727c9dd39f35f4f78bd195a6c8200ecbffbda6e 100644 (file)
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -13,13 +13,10 @@
  #include <unordered_map>
  #include <map>
  #include <fstream>
-#include <cmath>
-#include <cctype>
-#include <algorithm>
  #include <filesystem>
  
  // result of parsing --tensor-type option
-// (changes to this struct must be reflected in src/llama-quant.cpp)
+// changes to this struct must also be reflected in src/llama-quant.cpp
  struct tensor_type_option {
      std::string name;
      ggml_type type = GGML_TYPE_COUNT;
@@ -491,7 +488,6 @@ static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers
  
  int main(int argc, char ** argv) {
      std::setlocale(LC_NUMERIC, "C");
-
      if (argc < 3) {
          usage(argv[0]);
      }
@@ -584,8 +580,16 @@ int main(int argc, char ** argv) {
      std::vector<std::string> imatrix_datasets;
      std::unordered_map<std::string, std::vector<float>> imatrix_data;
      int m_last_call = prepare_imatrix(imatrix_file, imatrix_datasets, included_weights, excluded_weights, imatrix_data);
+
+    std::vector<llama_model_imatrix_data> i_data;
+    std::vector<llama_model_tensor_override> t_override;
      if (!imatrix_data.empty()) {
-        params.imatrix = &imatrix_data;
+        i_data.reserve(imatrix_data.size() + 1);
+        for (const auto & kv : imatrix_data) {
+            i_data.push_back({kv.first.c_str(), kv.second.data(), kv.second.size()});
+        }
+        i_data.push_back({nullptr, nullptr, 0});  // array terminator
+        params.imatrix = i_data.data();
          {
              llama_model_kv_override kvo;
              std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_FILE);
@@ -603,7 +607,6 @@ int main(int argc, char ** argv) {
              kvo.val_str[127] = '\0';
              kv_overrides.emplace_back(std::move(kvo));
          }
-
          {
              llama_model_kv_override kvo;
              std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_ENTRIES);
@@ -611,7 +614,6 @@ int main(int argc, char ** argv) {
              kvo.val_i64 = imatrix_data.size();
              kv_overrides.emplace_back(std::move(kvo));
          }
-
          if (m_last_call > 0) {
              llama_model_kv_override kvo;
              std::strcpy(kvo.key, LLM_KV_QUANTIZE_IMATRIX_N_CHUNKS);
@@ -623,13 +625,19 @@ int main(int argc, char ** argv) {
      if (!kv_overrides.empty()) {
          kv_overrides.emplace_back();
          kv_overrides.back().key[0] = 0;
-        params.kv_overrides = &kv_overrides;
+        params.kv_overrides = kv_overrides.data();
      }
      if (!tensor_type_opts.empty()) {
-        params.tensor_types = &tensor_type_opts;
+        t_override.reserve(tensor_type_opts.size() + 1);
+        for (const auto & tt : tensor_type_opts) {
+            t_override.push_back({tt.name.c_str(), tt.type});
+        }
+        t_override.push_back({nullptr, GGML_TYPE_COUNT});  // array terminator
+        params.tt_overrides = t_override.data();
      }
      if (!prune_layers.empty()) {
-        params.prune_layers = &prune_layers;
+        prune_layers.push_back(-1);  // array terminator
+        params.prune_layers = prune_layers.data();
      }
  
      llama_backend_init();
author	Ed Addario <redacted>
	Wed, 1 Apr 2026 05:43:00 +0000 (06:43 +0100)
committer	GitHub <redacted>
	Wed, 1 Apr 2026 05:43:00 +0000 (08:43 +0300)
include/llama.h		patch \| blob \| history
src/llama-quant.cpp		patch \| blob \| history
tools/quantize/quantize.cpp		patch \| blob \| history