//
// Prints the command-line help for the tool to stdout and ends by calling exit(1).
// `executable` is substituted into the usage line as the program name.
[[noreturn]]
static void usage(const char * executable) {
- printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+ printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights] [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--override-kv] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
printf(" --pure: Disable k-quant mixtures and quantize all tensors to the same type\n");
printf(" --imatrix file_name: use data in file_name as importance matrix for quant optimizations\n");
printf(" --include-weights tensor_name: use importance matrix for this/these tensor(s)\n");
// NOTE(review): the --exclude-weights text below is word-for-word the --include-weights text
// ("use importance matrix ..."); presumably it should say the importance matrix is NOT used for
// the named tensor(s) - confirm and fix the string.
printf(" --exclude-weights tensor_name: use importance matrix for this/these tensor(s)\n");
+ printf(" --output-tensor-type ggml_type: use this ggml_type for the output.weight tensor\n");
+ printf(" --token-embedding-type ggml_type: use this ggml_type for the token embeddings tensor\n");
+ printf(" --override-kv KEY=TYPE:VALUE\n");
+ printf(" Advanced option to override model metadata by key in the quantized model. May be specified multiple times.\n");
printf("Note: --include-weights and --exclude-weights cannot be used together\n");
printf("\nAllowed quantization types:\n");
// one pass over QUANT_OPTIONS (the loop body is not visible in this excerpt)
for (auto & it : QUANT_OPTIONS) {
exit(1);
}
// Reads importance-matrix entries from the binary file `imatrix_file` into `imatrix_data`,
// keyed by the entry name stored in the file. There is no return value: every failure is
// reported with printf and the function returns early.
-static void load_imatrix(const std::string& imatrix_file, std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
+static void load_imatrix(const std::string & imatrix_file, std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
std::ifstream in(imatrix_file.c_str(), std::ios::binary);
if (!in) {
- printf("%s: failed to open %s\n",__func__,imatrix_file.c_str());
+ printf("%s: failed to open %s\n",__func__, imatrix_file.c_str());
return;
}
// the entry count is read as a raw `int` directly from the file
int n_entries;
- in.read((char*)&n_entries, sizeof(n_entries));
+ in.read((char *)&n_entries, sizeof(n_entries));
if (in.fail() || n_entries < 1) {
printf("%s: no data in file %s\n", __func__, imatrix_file.c_str());
return;
// (the start of the per-entry loop, including where `i` and `len` come from, is not visible in this excerpt)
// one extra byte so the name can be NUL-terminated below
std::vector<char> name_as_vec(len+1);
in.read((char *)name_as_vec.data(), len);
if (in.fail()) {
- printf("%s: failed reading name for entry %d from %s\n",__func__,i+1,imatrix_file.c_str());
+ printf("%s: failed reading name for entry %d from %s\n", __func__, i+1, imatrix_file.c_str());
// NOTE(review): this path returns without resetting imatrix_data, unlike the two failure
// paths further down which assign `imatrix_data = {}` - confirm whether that is intended.
return;
}
name_as_vec[len] = 0;
std::string name{name_as_vec.data()};
// operator[] creates the entry for this name if it does not exist yet
- auto& e = imatrix_data[std::move(name)];
+ auto & e = imatrix_data[std::move(name)];
int ncall;
- in.read((char*)&ncall, sizeof(ncall));
+ in.read((char *)&ncall, sizeof(ncall));
int nval;
in.read((char *)&nval, sizeof(nval));
if (in.fail() || nval < 1) {
- printf("%s: failed reading number of values for entry %d\n",__func__,i);
+ printf("%s: failed reading number of values for entry %d\n", __func__, i);
// a failed read discards everything loaded so far
imatrix_data = {};
return;
}
e.resize(nval);
- in.read((char*)e.data(), nval*sizeof(float));
+ in.read((char *)e.data(), nval*sizeof(float));
if (in.fail()) {
- printf("%s: failed reading data for entry %d\n",__func__,i);
+ printf("%s: failed reading data for entry %d\n", __func__, i);
imatrix_data = {};
return;
}
// every value of the entry is divided by the entry's ncall
// NOTE(review): ncall comes straight from the file and is not range-checked in the visible
// code; a value of 0 would turn the data into inf/NaN - confirm whether a guard is needed.
for (auto& v : e) v /= ncall;
}
}
- printf("%s: loaded %d importance matrix entries from %s\n",__func__,int(imatrix_data.size()),imatrix_file.c_str());
+ printf("%s: loaded %d importance matrix entries from %s\n", __func__, int(imatrix_data.size()), imatrix_file.c_str());
}
// Fills `imatrix_data` from `imatrix_file` via load_imatrix() when a file name was given.
// How `included_weights` / `excluded_weights` are applied is not visible in this excerpt.
-static void prepare_imatrix(const std::string& imatrix_file,
- const std::vector<std::string>& included_weights,
- const std::vector<std::string>& excluded_weights,
- std::unordered_map<std::string, std::vector<float>>& imatrix_data) {
+static void prepare_imatrix(const std::string & imatrix_file,
+ const std::vector<std::string> & included_weights,
+ const std::vector<std::string> & excluded_weights,
+ std::unordered_map<std::string, std::vector<float>> & imatrix_data) {
// an empty file name means there is nothing to load
if (!imatrix_file.empty()) {
load_imatrix(imatrix_file, imatrix_data);
}
// NOTE(review): `return result;` does not match the void return type declared above; these two
// lines appear to be the tail of a different definition whose body is not shown in this excerpt.
return result;
}
+static bool parse_kv_override(const char * data, std::vector<llama_model_kv_override> & overrides) {
+ const char* sep = strchr(data, '=');
+ if (sep == nullptr || sep - data >= 128) {
+ fprintf(stderr, "%s: malformed KV override '%s'\n", __func__, data);
+ return false;
+ }
+ llama_model_kv_override kvo;
+ std::strncpy(kvo.key, data, sep - data);
+ kvo.key[sep - data] = 0;
+ sep++;
+ if (strncmp(sep, "int:", 4) == 0) {
+ sep += 4;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_INT;
+ kvo.int_value = std::atol(sep);
+ } else if (strncmp(sep, "float:", 6) == 0) {
+ sep += 6;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_FLOAT;
+ kvo.float_value = std::atof(sep);
+ } else if (strncmp(sep, "bool:", 5) == 0) {
+ sep += 5;
+ kvo.tag = LLAMA_KV_OVERRIDE_TYPE_BOOL;
+ if (std::strcmp(sep, "true") == 0) {
+ kvo.bool_value = true;
+ } else if (std::strcmp(sep, "false") == 0) {
+ kvo.bool_value = false;
+ } else {
+ fprintf(stderr, "%s: invalid boolean value for KV override '%s'\n", __func__, data);
+ return false;
+ }
+ } else {
+ fprintf(stderr, "%s: invalid type for KV override '%s'\n", __func__, data);
+ return false;
+ }
+ overrides.emplace_back(std::move(kvo));
+ return true;
+}
+
// Program entry point. Only fragments of the body are visible in this excerpt.
int main(int argc, char ** argv) {
if (argc < 3) {
usage(argv[0]);
int arg_idx = 1;
std::string imatrix_file;
std::vector<std::string> included_weights, excluded_weights;
// overrides collected from --override-kv arguments
+ std::vector<llama_model_kv_override> kv_overrides;
// consume the leading arguments that start with "--"
for (; arg_idx < argc && strncmp(argv[arg_idx], "--", 2) == 0; arg_idx++) {
if (strcmp(argv[arg_idx], "--leave-output-tensor") == 0) {
} else {
usage(argv[0]);
}
+ } else if (strcmp(argv[arg_idx], "--override-kv") == 0) {
// --override-kv needs a following argument that parse_kv_override() accepts; otherwise the
// help text is printed (usage() is declared [[noreturn]])
+ if (arg_idx == argc-1 || !parse_kv_override(argv[++arg_idx], kv_overrides)) {
+ usage(argv[0]);
+ }
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
params.allow_requantize = true;
} else if (strcmp(argv[arg_idx], "--pure") == 0) {
if (!imatrix_data.empty()) {
params.imatrix = &imatrix_data;
}
// When overrides were given, append one more entry whose key is the empty string; the code that
// copies the overrides into the output metadata stops at the first empty key. params.kv_overrides
// receives the address of the vector object itself, not of its element array.
+ if (!kv_overrides.empty()) {
+ kv_overrides.emplace_back();
+ kv_overrides.back().key[0] = 0;
+ params.kv_overrides = &kv_overrides;
+ }
llama_backend_init();
if (ftype_str == "COPY") {
params.only_copy = true;
}
- }
- else {
+ } else {
fname_out = argv[arg_idx];
arg_idx++;
// Fragments of the quantization routine; its signature and parts of its body are not visible in this excerpt.
constexpr bool use_mmap = false;
#endif
- llama_model_loader ml(fname_inp, use_mmap, NULL);
// params->kv_overrides, when set, is treated as a pointer to a std::vector<llama_model_kv_override>;
// the model loader is handed that vector's element array (or nullptr when no overrides were given)
+ llama_model_kv_override * kv_overrides = nullptr;
+ if (params->kv_overrides) {
+ auto v = (std::vector<llama_model_kv_override>*)params->kv_overrides;
+ kv_overrides = v->data();
+ }
+ llama_model_loader ml(fname_inp, use_mmap, kv_overrides);
ml.init_mappings(false); // no prefetching?
llama_model model;
gguf_set_val_u32(ctx_out, "general.quantization_version", GGML_QNT_VERSION);
gguf_set_val_u32(ctx_out, "general.file_type", ftype);
// write every override into the output metadata as well, dispatching on its tag;
// iteration stops at the first entry with an empty key
+ if (params->kv_overrides) {
+ const std::vector<llama_model_kv_override> & overrides = *(const std::vector<llama_model_kv_override> *)params->kv_overrides;
+ for (auto & o : overrides) {
+ if (o.key[0] == 0) break;
+ if (o.tag == LLAMA_KV_OVERRIDE_TYPE_FLOAT) {
+ gguf_set_val_f32(ctx_out, o.key, o.float_value);
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_INT) {
// NOTE(review): the value is written through the i32 setter; if int_value is wider than
// 32 bits, large overrides would be narrowed here - confirm against the struct definition.
+ gguf_set_val_i32(ctx_out, o.key, o.int_value);
+ } else if (o.tag == LLAMA_KV_OVERRIDE_TYPE_BOOL) {
+ gguf_set_val_bool(ctx_out, o.key, o.bool_value);
+ } else {
+ LLAMA_LOG_WARN("%s: unknown KV override type for key %s\n", __func__, o.key);
+ }
+ }
+ }
+
// count tensors per category by looking for fixed substrings in the tensor name
for (int i = 0; i < ml.n_tensors; ++i) {
const struct ggml_tensor * meta = ml.get_tensor_meta(i);
// TODO: avoid hardcoded tensor names - use the TN_* constants
if (name.find("attn_v.weight") != std::string::npos || name.find("attn_qkv.weight") != std::string::npos) {
++qs.n_attention_wv;
- }
- else if (name.find("ffn_down") != std::string::npos) {
+ } else if (name.find("ffn_down") != std::string::npos) {
++qs.n_ffn_down;
- }
- else if (name.find("ffn_gate") != std::string::npos) {
+ } else if (name.find("ffn_gate") != std::string::npos) {
++qs.n_ffn_gate;
- }
- else if (name.find("ffn_up") != std::string::npos) {
+ } else if (name.find("ffn_up") != std::string::npos) {
++qs.n_ffn_up;
- }
- else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
+ } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
qs.has_output = true;
}
}
// only a warning is logged when the attn_v and ffn_down counts differ from each other or from n_layer
- if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t)qs.n_attention_wv != model.hparams.n_layer) {
+ if (qs.n_attention_wv != qs.n_ffn_down || (uint32_t) qs.n_attention_wv != model.hparams.n_layer) {
LLAMA_LOG_WARN("%s ============ Strange model: n_attention_wv = %d, n_ffn_down = %d, hparams.n_layer = %d\n",
__func__, qs.n_attention_wv, qs.n_ffn_down, model.hparams.n_layer);
}
// Tail of an aggregate initializer (its beginning is not visible in this excerpt); each value is
// labelled with the member it initializes.
/*.only_copy =*/ false,
/*.pure =*/ false,
/*.imatrix =*/ nullptr,
// no KV overrides unless the caller supplies them
+ /*.kv_overrides =*/ nullptr,
};
return result;