quantize: add option --tensor-type-file to llama-quantize (#18572)

author EugeoSynthesisThirtyTwo <redacted>

Sat, 31 Jan 2026 03:39:21 +0000 (04:39 +0100)

committer GitHub <redacted>

Sat, 31 Jan 2026 03:39:21 +0000 (11:39 +0800)
author EugeoSynthesisThirtyTwo <redacted>
Sat, 31 Jan 2026 03:39:21 +0000 (04:39 +0100)
committer GitHub <redacted>
Sat, 31 Jan 2026 03:39:21 +0000 (11:39 +0800)
diff --git a/tools/quantize/quantize.cpp b/tools/quantize/quantize.cpp

index 881f4b3dd9e250f475b4f581e61c53e7004716df..0709e0bda0279464237dd654d5b1271d183406a0 100644 (file)
--- a/tools/quantize/quantize.cpp
+++ b/tools/quantize/quantize.cpp
@@ -119,7 +119,7 @@ static bool try_parse_ftype(const std::string & ftype_str_in, llama_ftype & ftyp
  [[noreturn]]
  static void usage(const char * executable) {
      printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable);
-    printf("       [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--prune-layers] [--keep-split] [--override-kv]\n");
+    printf("       [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file] [--prune-layers] [--keep-split] [--override-kv]\n");
      printf("       model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
      printf("  --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
      printf("  --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
@@ -131,6 +131,8 @@ static void usage(const char * executable) {
      printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
      printf("  --tensor-type TENSOR=TYPE: quantize this tensor to this ggml_type. example: --tensor-type attn_q=q8_0\n");
      printf("      Advanced option to selectively quantize tensors. May be specified multiple times.\n");
+    printf("  --tensor-type-file tensor_type.txt: list of tensors to quantize to specific ggml_type. example: --tensor-type-file tensor_type_list.txt\n");
+    printf("      Advanced option to selectively quantize a long list of tensors. Format to be tensor_name=ggml_type, separated by spaces/newline.\n");
      printf("  --prune-layers L0,L1,L2...comma-separated list of layer numbers to prune from the model\n");
      printf("      Advanced option to remove all tensors from the given layers\n");
      printf("  --keep-split: will generate quantized model in the same shards as input\n");
@@ -415,6 +417,23 @@ static bool parse_tensor_type(const char * data, std::vector<tensor_quantization
      return true;
  }
  
+static bool parse_tensor_type_file(const char * filename, std::vector<tensor_quantization> & tensor_type) { // reads TENSOR=TYPE entries from the text file `filename` and hands each one to parse_tensor_type; returns false if the file cannot be opened or any entry is rejected
+    std::ifstream file(filename);
+    if (!file) { // the stream is in a failed state, i.e. the file could not be opened
+        printf("\n%s: failed to open file '%s': %s\n\n", __func__, filename, std::strerror(errno)); // NOTE(review): the C++ standard does not guarantee that std::ifstream sets errno - confirm the reported reason is reliable on all target platforms
+        return false;
+    }
+
+    std::string arg;
+    while (file >> arg) { // operator>> skips leading whitespace, so entries may be separated by spaces, tabs or newlines
+        if (!parse_tensor_type(arg.c_str(), tensor_type)) { // same parser that main() applies to a --tensor-type argument; presumably it records the entry in tensor_type - its body is not visible here
+            return false; // stop at the first entry the parser rejects
+        }
+    }
+
+    return true; // reached once extraction stops (end of file); also the result for a file that contains no entries
+}
+
  static bool parse_layer_prune(const char * data, std::vector<int> & prune_layers) {
      if (!data) {
          printf("\n%s: no layer pruning ids provided\n\n", __func__);
@@ -480,6 +499,10 @@ int main(int argc, char ** argv) {
              if (arg_idx == argc-1 || !parse_tensor_type(argv[++arg_idx], tensor_types)) {
                  usage(argv[0]);
              }
+        } else if (strcmp(argv[arg_idx], "--tensor-type-file") == 0) {
+            if (arg_idx == argc-1 || !parse_tensor_type_file(argv[++arg_idx], tensor_types)) {
+                usage(argv[0]);
+            }
          } else if (strcmp(argv[arg_idx], "--prune-layers") == 0) {
              if (arg_idx == argc-1 || !parse_layer_prune(argv[++arg_idx], prune_layers)) {
                  usage(argv[0]);
@@ -686,3 +709,4 @@ int main(int argc, char ** argv) {
  
      return 0;
  }
+
author	EugeoSynthesisThirtyTwo <redacted>
	Sat, 31 Jan 2026 03:39:21 +0000 (04:39 +0100)
committer	GitHub <redacted>
	Sat, 31 Jan 2026 03:39:21 +0000 (11:39 +0800)