{ "Q8_0", LLAMA_FTYPE_MOSTLY_Q8_0, " 6.70G, +0.0004 ppl @ LLaMA-v1-7B", },
{ "F16", LLAMA_FTYPE_MOSTLY_F16, "13.00G @ 7B", },
{ "F32", LLAMA_FTYPE_ALL_F32, "26.00G @ 7B", },
+ // Note: keep COPY after F32 so that an ftype value of 0 resolves to F32, not COPY (both use LLAMA_FTYPE_ALL_F32).
+ { "COPY", LLAMA_FTYPE_ALL_F32, "only copy tensors, no quantizing", },
};
// ./quantize [--allow-requantize] [--leave-output-tensor] models/llama/ggml-model.gguf [models/llama/ggml-model-quant.gguf] type [nthreads]
//
void usage(const char * executable) {
- fprintf(stderr, "usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
- fprintf(stderr, " --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
- fprintf(stderr, " --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
- fprintf(stderr, "\nAllowed quantization types:\n");
+ printf("usage: %s [--help] [--allow-requantize] [--leave-output-tensor] model-f32.gguf [model-quant.gguf] type [nthreads]\n\n", executable);
+ printf(" --allow-requantize: Allows requantizing tensors that have already been quantized. Warning: This can severely reduce quality compared to quantizing from 16bit or 32bit\n");
+ printf(" --leave-output-tensor: Will leave output.weight un(re)quantized. Increases model size but may also increase quality, especially when requantizing\n");
+ printf("\nAllowed quantization types:\n");
for (auto & it : QUANT_OPTIONS) {
- printf(" %2d or %-6s : %s\n", it.ftype, it.name.c_str(), it.desc.c_str());
+ if (it.name != "COPY") {
+ printf(" %2d or ", it.ftype);
+ } else {
+ printf(" ");
+ }
+ printf("%-6s : %s\n", it.name.c_str(), it.desc.c_str());
}
exit(1);
}
// export as [inp path]/ggml-model-[ftype].gguf
fname_out = fpath + "ggml-model-" + ftype_str + ".gguf";
arg_idx++;
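+ // "COPY" only copies tensors to the output without quantizing them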
+ if (ftype_str == "COPY") {
+ params.only_copy = true;
+ }
}
else {
fname_out = argv[arg_idx];
if (!try_parse_ftype(argv[arg_idx], params.ftype, ftype_str)) {
fprintf(stderr, "%s: invalid ftype '%s'\n", __func__, argv[3]);
return 1;
+ }
+ if (ftype_str == "COPY") {
+ params.only_copy = true;
}
arg_idx++;
}
llm_load_arch(*ml, model);
llm_load_hparams(*ml, model, 0, 0, 0);
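+ // when only copying tensors, keep the ftype of the source model rather than the requested quantization type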
+ if (params->only_copy) {
+ ftype = model.ftype;
+ }
+
const size_t align = GGUF_DEFAULT_ALIGNMENT;
struct gguf_context * ctx_out = gguf_init_empty();
// quantize only 2D tensors
quantize &= (tensor->n_dims == 2);
quantize &= params->quantize_output_tensor || name != "output.weight";
- quantize &= quantized_type != tensor->type;
+ quantize &= !params->only_copy;
enum ggml_type new_type;
void * new_data;
size_t new_size;
- if (!quantize) {
- new_type = tensor->type;
- new_data = tensor->data;
- new_size = ggml_nbytes(tensor);
- LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
- } else {
+ if (quantize) {
new_type = quantized_type;
#ifdef GGML_USE_K_QUANTS
// TODO: avoid hardcoded tensor names - use the TN_* constants
}
}
#endif
-
+ // If we've decided to quantize to the same type the tensor is already
+ // in, there is nothing to do.
+ quantize = tensor->type != new_type;
+ }
+ if (!quantize) {
+ new_type = tensor->type;
+ new_data = tensor->data;
+ new_size = ggml_nbytes(tensor);
+ LLAMA_LOG_INFO("size = %8.3f MB\n", ggml_nbytes(tensor)/1024.0/1024.0);
+ } else {
const size_t nelements = ggml_nelements(tensor);
float * f32_data;
/*.ftype =*/ LLAMA_FTYPE_MOSTLY_Q5_1,
/*.allow_requantize =*/ false,
/*.quantize_output_tensor =*/ true,
+ /*.only_copy =*/ false,
};
return result;