return new_size;
}
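+// returns true when quantizing this tensor to dst_type without an importance matrix would produce unusable output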
+static bool tensor_type_requires_imatrix(const ggml_tensor * t, const ggml_type dst_type, const llama_ftype ftype) {
+ return (
+ dst_type == GGML_TYPE_IQ2_XXS || dst_type == GGML_TYPE_IQ2_XS ||
+ dst_type == GGML_TYPE_IQ2_S || dst_type == GGML_TYPE_IQ1_S ||
+ ( // IQ1_M needs an imatrix for everything except the token embeddings and the output tensor
+ dst_type == GGML_TYPE_IQ1_M &&
+ strcmp(t->name, "token_embd.weight") != 0 && strcmp(t->name, "output.weight") != 0
+ ) ||
+ ( // Q2_K at the Q2_K_S ftype is the lowest-quality k-quant - only allow it without an imatrix for the token embeddings
+ dst_type == GGML_TYPE_Q2_K && ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(t->name, "token_embd.weight") != 0
+ )
+ );
+}
+
static void llama_model_quantize_impl(const std::string & fname_inp, const std::string & fname_out, const llama_model_quantize_params * params) {
ggml_type default_type;
llama_ftype ftype = params->ftype;
};
const auto tn = LLM_TN(model.arch);
- new_ofstream(0);
+
+ // no output file for --dry-run
+ if (!params->dry_run) {
+ new_ofstream(0);
+ }
+
+ // for `--dry-run`: track whether a real quantization would require an imatrix,
+ // so the user can be warned at the end as a courtesy
+ bool will_require_imatrix = false;
+
for (const auto * it : tensors) {
const auto & weight = *it;
ggml_tensor * tensor = weight.tensor;
- if (weight.idx != cur_split && params->keep_split) {
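+ // with --keep-split, start a new output shard when the tensor belongs to a different split;
+ // skipped for --dry-run since no output is written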
+ if (!params->dry_run && (weight.idx != cur_split && params->keep_split)) {
close_ofstream();
new_ofstream(weight.idx);
}
const std::string name = ggml_get_name(tensor);
+ const size_t tensor_size = ggml_nbytes(tensor);
- if (!ml.use_mmap) {
- if (read_data.size() < ggml_nbytes(tensor)) {
- read_data.resize(ggml_nbytes(tensor));
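+ // tensor data is only needed for a real quantization; --dry-run computes sizes from the metadata alone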
+ if (!params->dry_run) {
+ if (!ml.use_mmap) {
+ if (read_data.size() < tensor_size) {
+ read_data.resize(tensor_size);
+ }
+ tensor->data = read_data.data();
}
- tensor->data = read_data.data();
+ ml.load_data_for(tensor);
}
- ml.load_data_for(tensor);
LLAMA_LOG_INFO("[%4d/%4d] %36s - [%s], type = %6s, ",
++idx, ml.n_tensors,
quantize = tensor->type != new_type;
}
- if (!quantize) {
- new_type = tensor->type;
- new_data = tensor->data;
- new_size = ggml_nbytes(tensor);
- LLAMA_LOG_INFO("size = %8.3f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+ // we have now decided on the target type for this tensor
+ if (params->dry_run) {
+ // with --dry-run, compute the final quantized size without actually quantizing anything
+ if (quantize) {
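+ // estimate the quantized size from the row count and the row size of the target type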
+ new_size = ggml_nrows(tensor) * ggml_row_size(new_type, tensor->ne[0]);
+ LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB (%s)\n",
+ tensor_size/1024.0/1024.0,
+ new_size/1024.0/1024.0,
+ ggml_type_name(new_type));
+ if (!will_require_imatrix && tensor_type_requires_imatrix(tensor, new_type, params->ftype)) {
+ will_require_imatrix = true;
+ }
+ } else {
+ new_size = tensor_size;
+ LLAMA_LOG_INFO("size = %8.3f MiB\n", new_size/1024.0/1024.0);
+ }
+ total_size_org += tensor_size;
+ total_size_new += new_size;
+ continue;
} else {
- const int64_t nelements = ggml_nelements(tensor);
+ // no --dry-run, perform quantization
+ if (!quantize) {
+ new_type = tensor->type;
+ new_data = tensor->data;
+ new_size = tensor_size;
+ LLAMA_LOG_INFO("size = %8.3f MiB\n", tensor_size/1024.0/1024.0);
+ } else {
+ const int64_t nelements = ggml_nelements(tensor);
- const float * imatrix = nullptr;
- if (imatrix_data) {
- auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
- if (it == imatrix_data->end()) {
- LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
- } else {
- if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
- imatrix = it->second.data();
+ const float * imatrix = nullptr;
+ if (imatrix_data) {
+ auto it = imatrix_data->find(remap_imatrix(tensor->name, mapped));
+ if (it == imatrix_data->end()) {
+ LLAMA_LOG_INFO("\n====== %s: did not find weights for %s\n", __func__, tensor->name);
} else {
- LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
- int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
-
- // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
- // this is a significant error and it may be good idea to abort the process if this happens,
- // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
- // tok_embd should be ignored in this case, since it always causes this warning
- if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
- throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
- int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
+ if (it->second.size() == (size_t)tensor->ne[0]*tensor->ne[2]) {
+ imatrix = it->second.data();
+ } else {
+ LLAMA_LOG_INFO("\n====== %s: imatrix size %d is different from tensor size %d for %s\n", __func__,
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name);
+
+ // this can happen when quantizing an old mixtral model with split tensors with a new incompatible imatrix
+ // this is a significant error and it may be a good idea to abort the process if this happens,
+ // since many people will miss the error and not realize that most of the model is being quantized without an imatrix
+ // tok_embd should be ignored in this case, since it always causes this warning
+ if (name != tn(LLM_TENSOR_TOKEN_EMBD, "weight")) {
+ throw std::runtime_error(format("imatrix size %d is different from tensor size %d for %s",
+ int(it->second.size()), int(tensor->ne[0]*tensor->ne[2]), tensor->name));
+ }
}
}
}
- }
- if ((new_type == GGML_TYPE_IQ2_XXS ||
- new_type == GGML_TYPE_IQ2_XS ||
- new_type == GGML_TYPE_IQ2_S ||
- new_type == GGML_TYPE_IQ1_S ||
- (new_type == GGML_TYPE_IQ1_M && strcmp(tensor->name, "token_embd.weight") && strcmp(tensor->name, "output.weight")) ||
- (new_type == GGML_TYPE_Q2_K && params->ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S && strcmp(tensor->name, "token_embd.weight") != 0)) && !imatrix) {
- LLAMA_LOG_ERROR("\n\n============================================================\n");
- LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
- LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
- LLAMA_LOG_ERROR("============================================================\n\n");
- throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
- }
+ if (!imatrix && tensor_type_requires_imatrix(tensor, new_type, params->ftype)) {
+ LLAMA_LOG_ERROR("\n\n============================================================\n");
+ LLAMA_LOG_ERROR("Missing importance matrix for tensor %s in a very low-bit quantization\n", tensor->name);
+ LLAMA_LOG_ERROR("The result will be garbage, so bailing out\n");
+ LLAMA_LOG_ERROR("============================================================\n\n");
+ throw std::runtime_error(format("Missing importance matrix for tensor %s in a very low-bit quantization", tensor->name));
+ }
- float * f32_data;
+ float * f32_data;
- if (tensor->type == GGML_TYPE_F32) {
- f32_data = (float *) tensor->data;
- } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
- throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
- } else {
- llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
- f32_data = (float *) f32_conv_buf.data();
- }
+ if (tensor->type == GGML_TYPE_F32) {
+ f32_data = (float *) tensor->data;
+ } else if (ggml_is_quantized(tensor->type) && !params->allow_requantize) {
+ throw std::runtime_error(format("requantizing from type %s is disabled", ggml_type_name(tensor->type)));
+ } else {
+ llama_tensor_dequantize_impl(tensor, f32_conv_buf, workers, nelements, nthread);
+ f32_data = (float *) f32_conv_buf.data();
+ }
- LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
- fflush(stdout);
+ LLAMA_LOG_INFO("converting to %s .. ", ggml_type_name(new_type));
+ fflush(stdout);
- if (work.size() < (size_t)nelements * 4) {
- work.resize(nelements * 4); // upper bound on size
- }
- new_data = work.data();
+ if (work.size() < (size_t)nelements * 4) {
+ work.resize(nelements * 4); // upper bound on size
+ }
+ new_data = work.data();
- const int64_t n_per_row = tensor->ne[0];
- const int64_t nrows = tensor->ne[1];
+ const int64_t n_per_row = tensor->ne[0];
+ const int64_t nrows = tensor->ne[1];
- static const int64_t min_chunk_size = 32 * 512;
- const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
+ static const int64_t min_chunk_size = 32 * 512;
+ const int64_t chunk_size = (n_per_row >= min_chunk_size ? n_per_row : n_per_row * ((min_chunk_size + n_per_row - 1)/n_per_row));
- const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
- const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
- const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
+ const int64_t nelements_matrix = tensor->ne[0] * tensor->ne[1];
+ const int64_t nchunk = (nelements_matrix + chunk_size - 1)/chunk_size;
+ const int64_t nthread_use = nthread > 1 ? std::max((int64_t)1, std::min((int64_t)nthread, nchunk)) : 1;
- // quantize each expert separately since they have different importance matrices
- new_size = 0;
- for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
- const float * f32_data_03 = f32_data + i03 * nelements_matrix;
- void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
- const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
+ // quantize each expert separately since they have different importance matrices
+ new_size = 0;
+ for (int64_t i03 = 0; i03 < tensor->ne[2]; ++i03) {
+ const float * f32_data_03 = f32_data + i03 * nelements_matrix;
+ void * new_data_03 = (char *)new_data + ggml_row_size(new_type, n_per_row) * i03 * nrows;
+ const float * imatrix_03 = imatrix ? imatrix + i03 * n_per_row : nullptr;
- new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
+ new_size += llama_tensor_quantize_impl(new_type, f32_data_03, new_data_03, chunk_size, nrows, n_per_row, imatrix_03, workers, nthread_use);
- // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
+ // TODO: temporary sanity check that the F16 -> MXFP4 is lossless
#if 0
- if (new_type == GGML_TYPE_MXFP4) {
- auto * x = f32_data_03;
-
- //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
- std::vector<float> deq(nrows*n_per_row);
- const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
- qtype->to_float(new_data_03, deq.data(), deq.size());
-
- double err = 0.0f;
- for (int i = 0; i < (int) deq.size(); ++i) {
- err += fabsf(deq[i] - x[i]);
- //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
- if (deq[i] != x[i]) {
- LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
+ if (new_type == GGML_TYPE_MXFP4) {
+ auto * x = f32_data_03;
+
+ //LLAMA_LOG_INFO("nrows = %d, n_per_row = %d\n", nrows, n_per_row);
+ std::vector<float> deq(nrows*n_per_row);
+ const ggml_type_traits * qtype = ggml_get_type_traits(new_type);
+ qtype->to_float(new_data_03, deq.data(), deq.size());
+
+ double err = 0.0f;
+ for (int i = 0; i < (int) deq.size(); ++i) {
+ err += fabsf(deq[i] - x[i]);
+ //if (fabsf(deq[i] - x[i]) > 0.00001 && i < 256) {
+ if (deq[i] != x[i]) {
+ LLAMA_LOG_INFO("deq[%d] = %f, x[%d] = %f\n", i, deq[i], i, x[i]);
+ }
}
+ //LLAMA_LOG_INFO("err = %f\n", err);
+ GGML_ASSERT(err == 0.00000);
}
- //LLAMA_LOG_INFO("err = %f\n", err);
- GGML_ASSERT(err == 0.00000);
- }
#endif
+ }
+ LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", tensor_size/1024.0/1024.0, new_size/1024.0/1024.0);
}
- LLAMA_LOG_INFO("size = %8.2f MiB -> %8.2f MiB\n", ggml_nbytes(tensor)/1024.0/1024.0, new_size/1024.0/1024.0);
- }
- total_size_org += ggml_nbytes(tensor);
- total_size_new += new_size;
+ total_size_org += tensor_size;
+ total_size_new += new_size;
+
+ // update the gguf meta data as we go
+ gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
+ GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
+ gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
+
+ // write tensor data + padding
+ fout.write((const char *) new_data, new_size);
+ zeros(fout, GGML_PAD(new_size, align) - new_size);
+ } // no --dry-run
+ } // iterate over tensors
+
+ if (!params->dry_run) {
+ close_ofstream();
+ }
- // update the gguf meta data as we go
- gguf_set_tensor_type(ctx_outs[cur_split].get(), name.c_str(), new_type);
- GGML_ASSERT(gguf_get_tensor_size(ctx_outs[cur_split].get(), gguf_find_tensor(ctx_outs[cur_split].get(), name.c_str())) == new_size);
- gguf_set_tensor_data(ctx_outs[cur_split].get(), name.c_str(), new_data);
+ LLAMA_LOG_INFO("%s: model size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_org/1024.0/1024.0, total_size_org*8.0/ml.n_elements);
+ LLAMA_LOG_INFO("%s: quant size = %8.2f MiB (%.2f BPW)\n", __func__, total_size_new/1024.0/1024.0, total_size_new*8.0/ml.n_elements);
- // write tensor data + padding
- fout.write((const char *) new_data, new_size);
- zeros(fout, GGML_PAD(new_size, align) - new_size);
+ if (!params->imatrix && params->dry_run && will_require_imatrix) {
+ LLAMA_LOG_WARN("%s: WARNING: dry run completed successfully, but actually completing this quantization will require an imatrix!\n",
+ __func__
+ );
}
- close_ofstream();
-
- LLAMA_LOG_INFO("%s: model size = %8.2f MiB\n", __func__, total_size_org/1024.0/1024.0);
- LLAMA_LOG_INFO("%s: quant size = %8.2f MiB\n", __func__, total_size_new/1024.0/1024.0);
if (qs.n_fallback > 0) {
LLAMA_LOG_WARN("%s: WARNING: %d of %d tensor(s) required fallback quantization\n",
/*.only_copy =*/ false,
/*.pure =*/ false,
/*.keep_split =*/ false,
+ /*.dry_run =*/ false,
/*.imatrix =*/ nullptr,
/*.kv_overrides =*/ nullptr,
/*.tensor_type =*/ nullptr,
static void usage(const char * executable) {
printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] [--pure] [--imatrix] [--include-weights]\n", executable);
printf(" [--exclude-weights] [--output-tensor-type] [--token-embedding-type] [--tensor-type] [--tensor-type-file]\n");
- printf(" [--prune-layers] [--keep-split] [--override-kv]\n");
+ printf(" [--prune-layers] [--keep-split] [--override-kv] [--dry-run]\n");
printf(" model-f32.gguf [model-quant.gguf] type [nthreads]\n\n");
printf(" --allow-requantize\n");
printf(" allow requantizing tensors that have already been quantized\n");
printf(" generate quantized model in the same shards as input\n");
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" override model metadata by key in the quantized model. may be specified multiple times.\n");
- printf(" WARNING: this is an advanced option, use with care.\n\n");
+ printf(" WARNING: this is an advanced option, use with care.\n");
+ printf(" --dry-run\n");
+ printf(" calculate and show the final quantization size without performing quantization\n");
+ printf(" example: llama-quantize --dry-run model-f32.gguf Q4_K\n\n");
printf("note: --include-weights and --exclude-weights cannot be used together\n\n");
printf("-----------------------------------------------------------------------------\n");
printf(" allowed quantization types\n");
if (arg_idx == argc-1 || !string_parse_kv_override(argv[++arg_idx], kv_overrides)) {
usage(argv[0]);
}
+ } else if (strcmp(argv[arg_idx], "--dry-run") == 0) {
+ params.dry_run = true;
} else if (strcmp(argv[arg_idx], "--allow-requantize") == 0) {
params.allow_requantize = true;
} else if (strcmp(argv[arg_idx], "--pure") == 0) {
std::string ftype_str;
std::string suffix = ".gguf";
if (try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
- std::string fpath;
- const size_t pos = fname_inp.find_last_of("/\\");
- if (pos != std::string::npos) {
- fpath = fname_inp.substr(0, pos + 1);
- }
+ // argv[arg_idx] is the ftype directly: <input> <ftype>
+ if (!params.dry_run) {
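+ // --dry-run produces no output file, so there is no need to derive an output name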
+ std::string fpath;
+ const size_t pos = fname_inp.find_last_of("/\\");
+ if (pos != std::string::npos) {
+ fpath = fname_inp.substr(0, pos + 1);
+ }
- // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
- fname_out = fpath + "ggml-model-" + ftype_str;
- if (!params.keep_split) {
- fname_out += suffix;
+ // export as [inp path]/ggml-model-[ftype]. Only add extension if there is no splitting
+ fname_out = fpath + "ggml-model-" + ftype_str;
+ if (!params.keep_split) {
+ fname_out += suffix;
+ }
}
arg_idx++;
if (ftype_str == "COPY") {
params.only_copy = true;
}
} else {
+ // argv[arg_idx] is not a valid ftype, so treat it as output path: <input> <output> <ftype>
fname_out = argv[arg_idx];
if (params.keep_split && fname_out.find(suffix) != std::string::npos) {
fname_out = fname_out.substr(0, fname_out.length() - suffix.length());
}
}
- if ((params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
- params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S ||
- params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
- params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S ||
- params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M) && imatrix_data.empty()) {
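+ // for --dry-run, skip this hard check; a warning is printed at the end if an imatrix would be required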
+ if (!params.dry_run &&
+ (
+ params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XS || params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_XXS ||
+ params.ftype == LLAMA_FTYPE_MOSTLY_IQ2_S || params.ftype == LLAMA_FTYPE_MOSTLY_Q2_K_S ||
+ params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_S || params.ftype == LLAMA_FTYPE_MOSTLY_IQ1_M
+ ) && imatrix_data.empty()) {
fprintf(stderr, "\n==========================================================================================================\n");
fprintf(stderr, "Please do not use IQ1_S, IQ1_M, IQ2_S, IQ2_XXS, IQ2_XS or Q2_K_S quantization without an importance matrix\n");
fprintf(stderr, "==========================================================================================================\n\n\n");
return 1;
}
- if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) {
- fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str());
- return 1;
+ if (!params.dry_run) {
+ if (std::error_code ec; std::filesystem::equivalent(fname_inp, fname_out, ec)) {
+ fprintf(stderr, "%s: error: input and output files are the same: '%s'\n", __func__, fname_inp.c_str());
+ return 1;
+ }
}
print_build_info();
- fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
+ if (params.dry_run) {
+ fprintf(stderr, "%s: calculating quantization size for '%s' as %s", __func__, fname_inp.c_str(), ftype_str.c_str());
+ } else {
+ fprintf(stderr, "%s: quantizing '%s' to '%s' as %s", __func__, fname_inp.c_str(), fname_out.c_str(), ftype_str.c_str());
+ }
+
if (params.nthread > 0) {
fprintf(stderr, " using %d threads", params.nthread);
}