quantize : add '--keep-split' to quantize model into shards (#6688)

author jiez <redacted>

Thu, 25 Apr 2024 10:29:35 +0000 (18:29 +0800)

committer GitHub <redacted>

Thu, 25 Apr 2024 10:29:35 +0000 (13:29 +0300)
author jiez <redacted>
Thu, 25 Apr 2024 10:29:35 +0000 (18:29 +0800)
committer GitHub <redacted>
Thu, 25 Apr 2024 10:29:35 +0000 (13:29 +0300)
diff --git a/examples/quantize/quantize.cpp b/examples/quantize/quantize.cpp

index 64cb6db19d0040ea6171d0c1039fc0731927370a..da1850dfdf517318ff0d99c8e4cba42389c501b7 100644 (file)
--- a/examples/quantize/quantize.cpp
+++ b/examples/quantize/quantize.cpp
@@ -97,6 +97,7 @@ static void usage(const char * executable) {
      printf("  --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
      printf("  --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
      printf("  --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+    printf("  --keep-split: will generate quantized model in the same shards as input\n");
      printf("  --override-kv KEY=TYPE:VALUE\n");
      printf("      Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
      printf("Note: --include-weights and --exclude-weights cannot be used together\n");
@@ -300,6 +301,8 @@ int main(int argc, char ** argv) {
              } else {
                  usage(argv[0]);
              }
+        } else if (strcmp(argv[arg_idx], "--keep-split")) {
+            params.keep_split = true;
          } else {
              usage(argv[0]);
          }
@@ -332,20 +335,28 @@ int main(int argc, char ** argv) {
      std::string fname_out;
  
      std::string ftype_str;
+    std::string suffix = ".gguf";
      if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
          std::string fpath;
          const size_t pos = fname_inp.find_last_of("/\\");
          if (pos != std::string::npos) {
              fpath = fname_inp.substr(0, pos + 1);
          }
-        // export as [inp path]/ggml-model-[ftype].gguf
-        fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
+
+        // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
+        fname_out = fpath + "ggml-model-" + ftype_str;
+        if (!params.keep_split) {
+            fname_out += suffix;
+        }
          arg_idx++;
          if (ftype_str == "COPY") {
              params.only_copy = true;
          }
      } else {
          fname_out = argv[arg_idx];
+        if (params.keep_split && fname_out.size() >= suffix.size() && fname_out.compare(fname_out.size() - suffix.size(), suffix.size(), suffix) == 0) { // strip the suffix only when it really is the trailing extension
+            fname_out.resize(fname_out.size() - suffix.size());
+        }
          arg_idx++;
  
          if (argc <= arg_idx) {
diff --git a/examples/quantize/test.sh b/examples/quantize/test.sh

new file mode 100644 (file)

index 0000000..840f712
--- /dev/null
+++ b/examples/quantize/test.sh
@@ -0,0 +1,65 @@
+#!/bin/bash
+
+set -eu
+
+if [ $# -lt 1 ]
+then
+  echo "usage:   $0 path_to_build_binary [path_to_temp_folder]"
+  echo "example: $0 ../../build/bin ../../tmp"
+  exit 1
+fi
+
+if [ $# -gt 1 ]
+then
+  TMP_DIR=$2
+else
+  TMP_DIR=/tmp
+fi
+
+set -x
+
+SPLIT=$1/gguf-split
+QUANTIZE=$1/quantize
+MAIN=$1/main
+WORK_PATH=$TMP_DIR/quantize
+CUR_DIR=$(pwd)
+
+mkdir -p "$WORK_PATH"
+
+# Clean up in case of previously failed test
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
+
+# 1. Get a model
+(
+  cd $WORK_PATH
+  "$CUR_DIR"/../../scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
+)
+echo PASS
+
+# 2. Split model
+$SPLIT --split-max-tensors 28  $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
+echo PASS
+echo
+
+# 3. Requant model with '--keep_split'
+$QUANTIZE --allow-requantize --keep_split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K
+echo PASS
+echo
+
+# 3a. Test the requanted model is loading properly
+$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# 4. Requant mode without '--keep_split'
+$QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K
+echo PASS
+echo
+
+# 4b. Test the requanted model is loading properly
+$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# Clean up
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
diff --git a/llama.cpp b/llama.cpp

index 3a84b4916bd3080b4fa2eb73b274e79b68212f32..0f74cb7aad5f3b07926595a767c525c4ab65d820 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -3297,6 +3297,10 @@ struct llama_model_loader {
          return nullptr;
      }
  
+    const llama_tensor_weight * get_weight(int i) const { // by-index overload: resolves tensor i's name via get_tensor_name()
+        return get_weight(get_tensor_name(i)); // forwards to the name-based lookup and returns its result unchanged
+    }
+
      const llama_tensor_weight & require_weight(const char * name) const {
          const llama_tensor_weight * weight = get_weight(name);
          if (!weight) {
@@ -14528,26 +14532,74 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
      std::vector<no_init<uint8_t>> work;
      std::vector<no_init<float>> f32_conv_buf;
  
+    uint16_t n_split = 1;
+    // Assume split index is continuous
+    if (params->keep_split) {
+        for (int i = 0; i < ml.n_tensors; ++i) {
+            n_split = std::max(uint16_t(ml.get_weight(i)->idx+1), n_split);
+        }
+    }
+    std::vector<gguf_context*> ctx_outs(n_split, NULL);
+    ctx_outs[0] = ctx_out;
+
      // populate the original tensors so we get an initial meta data
      for (int i = 0; i < ml.n_tensors; ++i) {
-        const struct ggml_tensor * meta = ml.get_tensor_meta(i);
-        gguf_add_tensor(ctx_out, meta);
+        auto weight = ml.get_weight(i);
+        uint16_t i_split = params->keep_split ? weight->idx : 0;
+        struct ggml_tensor * tensor = weight->tensor;
+        if (ctx_outs[i_split] == NULL) {
+            ctx_outs[i_split] = gguf_init_empty();
+        }
+        gguf_add_tensor(ctx_outs[i_split], tensor);
      }
  
-    std::ofstream fout(fname_out, std::ios::binary);
-    fout.exceptions(std::ofstream::failbit); // fail fast on write errors
-
-    const size_t meta_size = gguf_get_meta_size(ctx_out);
+    // Set split info if needed
+    if (n_split > 1) {
+        for (size_t i = 0; i < ctx_outs.size(); ++i) {
+            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
+            gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
+            gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
+        }
+    }
  
-    LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
+    int cur_split = -1;
+    std::ofstream fout;
+    auto close_ofstream = [&]() {
+        // Write metadata and close file handler
+        if (fout.is_open()) {
+            fout.seekp(0);
+            std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
+            gguf_get_meta_data(ctx_outs[cur_split], data.data());
+            fout.write((const char *) data.data(), data.size());
+            fout.close();
+        }
+    };
+    auto new_ofstream = [&](int index = 0) {
+        cur_split = index;
+        GGML_ASSERT(ctx_outs[cur_split] && "Find uninitialized gguf_context");
+        std::string fname = fname_out;
+        if (params->keep_split) {
+            char split_path[PATH_MAX] = {0};
+            llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
+            fname = std::string(split_path);
+        }
  
-    // placeholder for the meta data
-    ::zeros(fout, meta_size);
+        fout = std::ofstream(fname, std::ios::binary);
+        fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+        const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
+        // placeholder for the meta data
+        ::zeros(fout, meta_size);
+    };
  
      const auto tn = LLM_TN(model.arch);
-
+    new_ofstream();
      for (int i = 0; i < ml.n_tensors; ++i) {
-        struct ggml_tensor * tensor = ml.get_tensor_meta(i);
+        auto weight = ml.get_weight(i);
+        struct ggml_tensor * tensor = weight->tensor;
+        if (weight->idx != cur_split && params->keep_split) {
+            close_ofstream();
+            new_ofstream(weight->idx);
+        }
  
          const std::string name = ggml_get_name(tensor);
  
@@ -14702,26 +14754,18 @@ static void llama_model_quantize_internal(const std::string & fname_inp, const s
          total_size_new += new_size;
  
          // update the gguf meta data as we go
-        gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
-        gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
+        gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
+        gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
  
          // write tensor data + padding
          fout.write((const char *) new_data, new_size);
          zeros(fout, GGML_PAD(new_size, align) - new_size);
      }
-
-    // go back to beginning of file and write the updated meta data
-    {
-        fout.seekp(0);
-        std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
-        gguf_get_meta_data(ctx_out, data.data());
-        fout.write((const char *) data.data(), data.size());
+    close_ofstream();
+    for (auto & c:ctx_outs) {
+        gguf_free(c);
      }
  
-    fout.close();
-
-    gguf_free(ctx_out);
-
      LLAMA_LOG_INFO("%s: model size  = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
      LLAMA_LOG_INFO("%s: quant size  = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
  
@@ -15077,6 +15121,7 @@ struct llama_model_quantize_params llama_model_quantize_default_params() {
          /*.quantize_output_tensor      =*/ true,
          /*.only_copy                   =*/ false,
          /*.pure                        =*/ false,
+        /*.keep_split                  =*/ false,
          /*.imatrix                     =*/ nullptr,
          /*.kv_overrides                =*/ nullptr,
      };
diff --git a/llama.h b/llama.h

index 0eb2a1e9ab0a24a4866ee683b0e3bd9bb452f97d..8aa763672be1b864536dbc122f6729eae01dc92a 100644 (file)
--- a/llama.h
+++ b/llama.h
@@ -288,6 +288,7 @@ extern "C" {
          bool quantize_output_tensor;         // quantize output.weight
          bool only_copy;                      // only copy tensors - ftype, allow_requantize and quantize_output_tensor are ignored
          bool pure;                           // quantize all tensors to the default type
+        bool keep_split;                     // quantize to the same number of shards
          void * imatrix;                      // pointer to importance matrix data
          void * kv_overrides;                 // pointer to vector containing overrides
      } llama_model_quantize_params;
author	jiez <redacted>
	Thu, 25 Apr 2024 10:29:35 +0000 (18:29 +0800)
committer	GitHub <redacted>
	Thu, 25 Apr 2024 10:29:35 +0000 (13:29 +0300)
examples/quantize/quantize.cpp		patch \| blob \| history
examples/quantize/test.sh	[new file with mode: 0644]	patch \| blob
llama.cpp		patch \| blob \| history
llama.h		patch \| blob \| history