printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+ printf(" --keep-split: will generate quatized model in the same shards as input");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
} else {
usage(argv[0]);
}
+ } else if (strcmp(argv[arg_idx], "--keep-split") == 0) {
+ params.keep_split = true;
} else {
usage(argv[0]);
}
std::string fname_out;
std::string ftype_str;
+ std::string suffix = ".gguf";
if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
std::string fpath;
const size_t pos = fname_inp.find_last_of("/\\");
if (pos != std::string::npos) {
fpath = fname_inp.substr(0, pos + 1);
}
- // export as [inp path]/ggml-model-[ftype].gguf
- fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
+
+ // export as [inp path]/ggml-model-[ftype]; only add the extension when not splitting
+ fname_out = fpath + "ggml-model-" + ftype_str;
+ if (!params.keep_split) {
+ fname_out += suffix;
+ }
arg_idx++;
if (ftype_str == "COPY") {
params.only_copy = true;
}
} else {
fname_out = argv[arg_idx];
+ // strip only a trailing ".gguf"; llama_split_path() appends the per-shard suffix
+ if (params.keep_split && fname_out.size() >= suffix.size() && fname_out.compare(fname_out.size() - suffix.size(), suffix.size(), suffix) == 0) {
+ fname_out = fname_out.substr(0, fname_out.size() - suffix.size());
+ }
arg_idx++;
if (argc <= arg_idx) {
--- /dev/null
+#!/bin/bash
+
+set -eu
+
+if [ $# -lt 1 ]
+then
+ echo "usage: $0 path_to_build_binary [path_to_temp_folder]"
+ echo "example: $0 ../../build/bin ../../tmp"
+ exit 1
+fi
+
+if [ $# -gt 1 ]
+then
+ TMP_DIR=$2
+else
+ TMP_DIR=/tmp
+fi
+
+set -x
+
+SPLIT=$1/gguf-split
+QUANTIZE=$1/quantize
+MAIN=$1/main
+WORK_PATH=$TMP_DIR/quantize
+CUR_DIR=$(pwd)
+
+mkdir -p "$WORK_PATH"
+
+# Clean up in case of previously failed test
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
+
+# 1. Get a model
+(
+ cd $WORK_PATH
+ "$CUR_DIR"/../../scripts/hf.sh --repo ggml-org/gemma-1.1-2b-it-Q8_0-GGUF --file gemma-1.1-2b-it.Q8_0.gguf
+)
+echo PASS
+
+# 2. Split model
+$SPLIT --split-max-tensors 28 $WORK_PATH/gemma-1.1-2b-it.Q8_0.gguf $WORK_PATH/ggml-model-split
+echo PASS
+echo
+
+# 3. Requant model with '--keep-split'
+$QUANTIZE --allow-requantize --keep-split $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant.gguf Q4_K
+echo PASS
+echo
+
+# 3a. Test that the requantized model loads properly
+$MAIN --model $WORK_PATH/ggml-model-requant-00001-of-00006.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# 4. Requant model without '--keep-split'
+$QUANTIZE --allow-requantize $WORK_PATH/ggml-model-split-00001-of-00006.gguf $WORK_PATH/ggml-model-requant-merge.gguf Q4_K
+echo PASS
+echo
+
+# 4a. Test that the requantized model loads properly
+$MAIN --model $WORK_PATH/ggml-model-requant-merge.gguf --random-prompt --n-predict 32
+echo PASS
+echo
+
+# Clean up
+rm -f $WORK_PATH/ggml-model-split*.gguf $WORK_PATH/ggml-model-requant*.gguf
return nullptr;
}
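+ // convenience overload: look up the weight entry of the i-th tensor via its name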
+ const llama_tensor_weight * get_weight(int i) const {
+ return get_weight(get_tensor_name(i));
+ }
+
const llama_tensor_weight & require_weight(const char * name) const {
const llama_tensor_weight * weight = get_weight(name);
if (!weight) {
std::vector<no_init<uint8_t>> work;
std::vector<no_init<float>> f32_conv_buf;
+ uint16_t n_split = 1;
+ // Assume split indices are contiguous: n_split ends up as the largest shard index + 1
+ if (params->keep_split) {
+ for (int i = 0; i < ml.n_tensors; ++i) {
+ n_split = std::max(uint16_t(ml.get_weight(i)->idx + 1), n_split);
+ }
+ }
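+ // one gguf_context per output shard; the first slot reuses the existing ctx_out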
+ std::vector<gguf_context*> ctx_outs(n_split, NULL);
+ ctx_outs[0] = ctx_out;
+
// populate the original tensors so we get an initial meta data
for (int i = 0; i < ml.n_tensors; ++i) {
- const struct ggml_tensor * meta = ml.get_tensor_meta(i);
- gguf_add_tensor(ctx_out, meta);
+ auto weight = ml.get_weight(i);
+ uint16_t i_split = params->keep_split ? weight->idx : 0;
+ struct ggml_tensor * tensor = weight->tensor;
+ if (ctx_outs[i_split] == NULL) {
+ ctx_outs[i_split] = gguf_init_empty();
+ }
+ gguf_add_tensor(ctx_outs[i_split], tensor);
}
- std::ofstream fout(fname_out, std::ios::binary);
- fout.exceptions(std::ofstream::failbit); // fail fast on write errors
-
- const size_t meta_size = gguf_get_meta_size(ctx_out);
+ // Set split info if needed
+ if (n_split > 1) {
+ for (size_t i = 0; i < ctx_outs.size(); ++i) {
+ gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_NO).c_str(), i);
+ gguf_set_val_u16(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_COUNT).c_str(), n_split);
+ gguf_set_val_i32(ctx_outs[i], ml.llm_kv(LLM_KV_SPLIT_TENSORS_COUNT).c_str(), ml.n_tensors);
+ }
+ }
- LLAMA_LOG_INFO("%s: meta size = %zu bytes\n", __func__, meta_size);
+ int cur_split = -1;
+ std::ofstream fout;
+ auto close_ofstream = [&]() {
+ // Write the final metadata back at the start of the file, then close it
+ if (fout.is_open()) {
+ fout.seekp(0);
+ std::vector<uint8_t> data(gguf_get_meta_size(ctx_outs[cur_split]));
+ gguf_get_meta_data(ctx_outs[cur_split], data.data());
+ fout.write((const char *) data.data(), data.size());
+ fout.close();
+ }
+ };
+ auto new_ofstream = [&](int index = 0) {
+ cur_split = index;
+ GGML_ASSERT(ctx_outs[cur_split] && "Found uninitialized gguf_context");
+ std::string fname = fname_out;
+ if (params->keep_split) {
+ char split_path[PATH_MAX] = {0};
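+ // llama_split_path() builds the shard name, e.g. <fname_out>-00001-of-00006.gguf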
+ llama_split_path(split_path, sizeof(split_path), fname_out.c_str(), cur_split, n_split);
+ fname = std::string(split_path);
+ }
- // placeholder for the meta data
- ::zeros(fout, meta_size);
+ fout = std::ofstream(fname, std::ios::binary);
+ fout.exceptions(std::ofstream::failbit); // fail fast on write errors
+ const size_t meta_size = gguf_get_meta_size(ctx_outs[cur_split]);
+ // placeholder for the meta data
+ ::zeros(fout, meta_size);
+ };
const auto tn = LLM_TN(model.arch);
-
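+ // open the first output file (shard 0 when splitting)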
+ new_ofstream();
for (int i = 0; i < ml.n_tensors; ++i) {
- struct ggml_tensor * tensor = ml.get_tensor_meta(i);
+ auto weight = ml.get_weight(i);
+ struct ggml_tensor * tensor = weight->tensor;
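+ // this tensor belongs to a different shard: finalize the current file and open the next one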
+ if (weight->idx != cur_split && params->keep_split) {
+ close_ofstream();
+ new_ofstream(weight->idx);
+ }
const std::string name = ggml_get_name(tensor);
total_size_new += new_size;
// update the gguf meta data as we go
- gguf_set_tensor_type(ctx_out, name.c_str(), new_type);
- gguf_set_tensor_data(ctx_out, name.c_str(), new_data, new_size);
+ gguf_set_tensor_type(ctx_outs[cur_split], name.c_str(), new_type);
+ gguf_set_tensor_data(ctx_outs[cur_split], name.c_str(), new_data, new_size);
// write tensor data + padding
fout.write((const char *) new_data, new_size);
zeros(fout, GGML_PAD(new_size, align) - new_size);
}
-
- // go back to beginning of file and write the updated meta data
- {
- fout.seekp(0);
- std::vector<uint8_t> data(gguf_get_meta_size(ctx_out));
- gguf_get_meta_data(ctx_out, data.data());
- fout.write((const char *) data.data(), data.size());
+ close_ofstream();
+ for (auto & c:ctx_outs) {
+ gguf_free(c);
}
- fout.close();
-
- gguf_free(ctx_out);
-
LLAMA_LOG_INFO("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
LLAMA_LOG_INFO("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
/*.quantize_output_tensor =*/ true,
/*.only_copy =*/ false,
/*.pure =*/ false,
+ /*.keep_split =*/ false,
/*.imatrix =*/ nullptr,
/*.kv_overrides =*/ nullptr,
};