From: Georgi Gerganov
Date: Sun, 14 May 2023 07:07:27 +0000 (+0300)
Subject: ggml : add GGML_QNT_VERSION for tracking changes to the quantization format
X-Git-Tag: upstream/0.0.1642~1478
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=effcfa62da543e71affe6c39b78d0064f0c5d71d;p=pkg%2Fggml%2Fsources%2Fggml

ggml : add GGML_QNT_VERSION for tracking changes to the quantization format

ref #150
---

diff --git a/examples/dolly-v2/main.cpp b/examples/dolly-v2/main.cpp
index 76a29be7..70aa8ded 100644
--- a/examples/dolly-v2/main.cpp
+++ b/examples/dolly-v2/main.cpp
@@ -114,6 +114,8 @@ bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vo
         fin.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
         fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
 
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
@@ -121,6 +123,9 @@ bool dollyv2_model_load(const std::string & fname, dollyv2_model & model, gpt_vo
         printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
         printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
         printf("%s: ftype   = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr   = %d\n", __func__, qntvr);
+
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
     }
 
     // load vocab
diff --git a/examples/dolly-v2/quantize.cpp b/examples/dolly-v2/quantize.cpp
index 6df1a061..83f11e75 100644
--- a/examples/dolly-v2/quantize.cpp
+++ b/examples/dolly-v2/quantize.cpp
@@ -66,12 +66,18 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
         finp.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
         finp.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
 
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
+        const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
+        printf("%s: n_ctx       = %d\n", __func__, hparams.n_ctx);
+        printf("%s: n_embd      = %d\n", __func__, hparams.n_embd);
+        printf("%s: n_head      = %d\n", __func__, hparams.n_head);
+        printf("%s: n_layer     = %d\n", __func__, hparams.n_layer);
+        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
 
         fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
         fout.write((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
@@ -79,7 +85,7 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
         fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));
         fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fout.write((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        fout.write((char *) &ftype, sizeof(hparams.ftype));
+        fout.write((char *) &ftype_dst, sizeof(ftype_dst));
     }
 
     // load vocab
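The change above repeats, mechanically, in every example program that follows: the quantizer packs the quantization version into the upper decimal digits of the on-disk ftype field, and the loader splits the two parts back apart with integer division and modulo. A minimal standalone sketch of the scheme (the pack/unpack helper names here are illustrative only, not part of ggml):

    #include <stdint.h>
    #include <stdio.h>

    #define GGML_QNT_VERSION        0    // from ggml.h: bump on quantization format changes
    #define GGML_QNT_VERSION_FACTOR 1000 // from ggml.h: do not change

    // hypothetical helpers mirroring what the quantizers and loaders do inline
    static int32_t pack_ftype  (int32_t ftype)  { return GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype; }
    static int32_t unpack_qntvr(int32_t packed) { return packed / GGML_QNT_VERSION_FACTOR; }
    static int32_t unpack_ftype(int32_t packed) { return packed % GGML_QNT_VERSION_FACTOR; }

    int main(void) {
        const int32_t on_disk = pack_ftype(2); // ftype 2 == GGML_FTYPE_MOSTLY_Q4_0
        printf("on disk = %d, qntvr = %d, ftype = %d\n",
               on_disk, unpack_qntvr(on_disk), unpack_ftype(on_disk));
        return 0; // with GGML_QNT_VERSION 0: on disk = 2, qntvr = 0, ftype = 2
    }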
diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp
index ad00d2d5..2673510a 100644
--- a/examples/gpt-2/main.cpp
+++ b/examples/gpt-2/main.cpp
@@ -100,12 +100,17 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
 
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
         printf("%s: n_head  = %d\n", __func__, hparams.n_head);
         printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
         printf("%s: ftype   = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr   = %d\n", __func__, qntvr);
+
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
     }
 
     // load vocab
diff --git a/examples/gpt-2/quantize.cpp b/examples/gpt-2/quantize.cpp
index 09b99ffe..d8021895 100644
--- a/examples/gpt-2/quantize.cpp
+++ b/examples/gpt-2/quantize.cpp
@@ -20,7 +20,7 @@ struct gpt2_hparams {
     int32_t n_embd  = 768;
     int32_t n_head  = 12;
     int32_t n_layer = 12;
-    int32_t f16     = 1;
+    int32_t ftype   = 1;
 };
 
 // quantize a model
@@ -62,21 +62,27 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
         finp.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
         finp.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
         finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        finp.read((char *) &hparams.f16,     sizeof(hparams.f16));
+        finp.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
 
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: f16     = %d\n", __func__, hparams.f16);
+        const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
+        printf("%s: n_ctx       = %d\n", __func__, hparams.n_ctx);
+        printf("%s: n_embd      = %d\n", __func__, hparams.n_embd);
+        printf("%s: n_head      = %d\n", __func__, hparams.n_head);
+        printf("%s: n_layer     = %d\n", __func__, hparams.n_layer);
+        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
 
         fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
         fout.write((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
         fout.write((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
         fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));
         fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fout.write((char *) &ftype, sizeof(hparams.f16));
+        fout.write((char *) &ftype_dst, sizeof(ftype_dst));
     }
 
     // load vocab
diff --git a/examples/gpt-j/main.cpp b/examples/gpt-j/main.cpp
index c3d2228b..51be9591 100644
--- a/examples/gpt-j/main.cpp
+++ b/examples/gpt-j/main.cpp
@@ -100,6 +100,8 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
         fin.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
         fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
 
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
@@ -107,6 +109,9 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
         printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
         printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
         printf("%s: ftype   = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr   = %d\n", __func__, qntvr);
+
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
     }
 
     // load vocab
diff --git a/examples/gpt-j/quantize.cpp b/examples/gpt-j/quantize.cpp
index 3f9f4b38..0c1f795f 100644
--- a/examples/gpt-j/quantize.cpp
+++ b/examples/gpt-j/quantize.cpp
@@ -21,7 +21,7 @@ struct gptj_hparams {
     int32_t n_head  = 16;
     int32_t n_layer = 28;
     int32_t n_rot   = 64;
-    int32_t f16     = 1;
+    int32_t ftype   = 1;
 };
 
 // quantize a model
@@ -64,14 +64,20 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
         finp.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
         finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         finp.read((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        finp.read((char *) &hparams.f16,     sizeof(hparams.f16));
+        finp.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
 
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: f16     = %d\n", __func__, hparams.f16);
+        const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
+        printf("%s: n_ctx       = %d\n", __func__, hparams.n_ctx);
+        printf("%s: n_embd      = %d\n", __func__, hparams.n_embd);
+        printf("%s: n_head      = %d\n", __func__, hparams.n_head);
+        printf("%s: n_layer     = %d\n", __func__, hparams.n_layer);
+        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
 
         fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
         fout.write((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
@@ -79,7 +85,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
         fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));
         fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fout.write((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
-        fout.write((char *) &ftype, sizeof(hparams.f16));
+        fout.write((char *) &ftype_dst, sizeof(ftype_dst));
     }
 
     // load vocab
diff --git a/examples/gpt-neox/main.cpp b/examples/gpt-neox/main.cpp
index 4b18ab2d..366a1ecb 100644
--- a/examples/gpt-neox/main.cpp
+++ b/examples/gpt-neox/main.cpp
@@ -106,6 +106,8 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_
         fin.read((char *) &hparams.par_res, sizeof(hparams.par_res));
         fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
 
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
@@ -114,6 +116,9 @@ bool gpt_neox_model_load(const std::string & fname, gpt_neox_model & model, gpt_
         printf("%s: n_rot   = %d\n", __func__, hparams.n_rot);
         printf("%s: par_res = %d\n", __func__, hparams.par_res);
         printf("%s: ftype   = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr   = %d\n", __func__, qntvr);
+
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
     }
 
     // load vocab
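Note that the loaders only report qntvr so far; none of them rejects a mismatched version. (In whisper.cpp further below, the reduction hparams.ftype %= GGML_QNT_VERSION_FACTOR is even applied before qntvr is computed, so the value it prints is always 0 for now.) A loader that wanted to enforce compatibility could add a guard along these lines, computing qntvr before the reduction — a sketch only, not part of this commit:

    const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
    if (qntvr != GGML_QNT_VERSION) {
        fprintf(stderr, "%s: unsupported quantization version %d (expected %d)\n",
                __func__, qntvr, GGML_QNT_VERSION);
        return false;
    }
    hparams.ftype %= GGML_QNT_VERSION_FACTOR; // keep only the plain ftype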
diff --git a/examples/gpt-neox/quantize.cpp b/examples/gpt-neox/quantize.cpp
index c5fbdb28..ac7d681c 100644
--- a/examples/gpt-neox/quantize.cpp
+++ b/examples/gpt-neox/quantize.cpp
@@ -68,13 +68,19 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
         finp.read((char *) &hparams.par_res, sizeof(hparams.par_res));
         finp.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
 
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: par_res = %d\n", __func__, hparams.par_res);
-        printf("%s: ftype   = %d\n", __func__, hparams.ftype);
+        const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
+        printf("%s: n_ctx       = %d\n", __func__, hparams.n_ctx);
+        printf("%s: n_embd      = %d\n", __func__, hparams.n_embd);
+        printf("%s: n_head      = %d\n", __func__, hparams.n_head);
+        printf("%s: n_layer     = %d\n", __func__, hparams.n_layer);
+        printf("%s: par_res     = %d\n", __func__, hparams.par_res);
+        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
 
         fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
         fout.write((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
@@ -83,7 +89,7 @@ bool gpt_neox_model_quantize(const std::string & fname_inp, const std::string &
         fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fout.write((char *) &hparams.n_rot,   sizeof(hparams.n_rot));
         fout.write((char *) &hparams.par_res, sizeof(hparams.par_res));
-        fout.write((char *) &ftype, sizeof(hparams.ftype));
+        fout.write((char *) &ftype_dst, sizeof(ftype_dst));
     }
 
     // load vocab
diff --git a/examples/starcoder/main.cpp b/examples/starcoder/main.cpp
index d625a22a..202702db 100644
--- a/examples/starcoder/main.cpp
+++ b/examples/starcoder/main.cpp
@@ -101,12 +101,17 @@ bool starcoder_model_load(const std::string & fname, starcoder_model & model, gp
         fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
         fin.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
 
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
         printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
         printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
         printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
         printf("%s: n_head  = %d\n", __func__, hparams.n_head);
         printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
         printf("%s: ftype   = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr   = %d\n", __func__, qntvr);
+
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
     }
 
     // load vocab
diff --git a/examples/starcoder/quantize.cpp b/examples/starcoder/quantize.cpp
index 78115399..101af509 100644
--- a/examples/starcoder/quantize.cpp
+++ b/examples/starcoder/quantize.cpp
@@ -20,7 +20,7 @@ struct starcoder_hparams {
     int32_t n_embd  = 2048;
    int32_t n_head  = 16;
     int32_t n_layer = 24;
-    int32_t f16     = 1;
+    int32_t ftype   = 1;
 };
 
 // quantize a model
@@ -62,21 +62,27 @@ bool starcoder_model_quantize(const std::string & fname_inp, const std::string &
         finp.read((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
         finp.read((char *) &hparams.n_head,  sizeof(hparams.n_head));
         finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        finp.read((char *) &hparams.f16,     sizeof(hparams.f16));
+        finp.read((char *) &hparams.ftype,   sizeof(hparams.ftype));
 
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: n_ctx   = %d\n", __func__, hparams.n_ctx);
-        printf("%s: n_embd  = %d\n", __func__, hparams.n_embd);
-        printf("%s: n_head  = %d\n", __func__, hparams.n_head);
-        printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
-        printf("%s: f16     = %d\n", __func__, hparams.f16);
+        const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
+        printf("%s: n_ctx       = %d\n", __func__, hparams.n_ctx);
+        printf("%s: n_embd      = %d\n", __func__, hparams.n_embd);
+        printf("%s: n_head      = %d\n", __func__, hparams.n_head);
+        printf("%s: n_layer     = %d\n", __func__, hparams.n_layer);
+        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
 
         fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
         fout.write((char *) &hparams.n_ctx,   sizeof(hparams.n_ctx));
         fout.write((char *) &hparams.n_embd,  sizeof(hparams.n_embd));
         fout.write((char *) &hparams.n_head,  sizeof(hparams.n_head));
         fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
-        fout.write((char *) &ftype, sizeof(hparams.f16));
+        fout.write((char *) &ftype_dst, sizeof(ftype_dst));
     }
 
     // load vocab
diff --git a/examples/whisper/quantize.cpp b/examples/whisper/quantize.cpp
index dc20e4ea..949fe8e6 100644
--- a/examples/whisper/quantize.cpp
+++ b/examples/whisper/quantize.cpp
@@ -25,7 +25,7 @@ struct whisper_hparams {
     int32_t n_text_head  = 6;
     int32_t n_text_layer = 4;
     int32_t n_mels       = 80;
-    int32_t f16          = 1;
+    int32_t ftype        = 1;
 };
 
 struct whisper_filters {
@@ -79,7 +79,10 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
         finp.read((char *) &hparams.n_text_head,  sizeof(hparams.n_text_head));
         finp.read((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
         finp.read((char *) &hparams.n_mels,       sizeof(hparams.n_mels));
-        finp.read((char *) &hparams.f16,          sizeof(hparams.f16));
+        finp.read((char *) &hparams.ftype,        sizeof(hparams.ftype));
+
+        const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
 
         fprintf(stderr, "%s: n_vocab       = %d\n", __func__, hparams.n_vocab);
         fprintf(stderr, "%s: n_audio_ctx   = %d\n", __func__, hparams.n_audio_ctx);
@@ -91,7 +94,10 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
         fprintf(stderr, "%s: n_text_head   = %d\n", __func__, hparams.n_text_head);
         fprintf(stderr, "%s: n_text_layer  = %d\n", __func__, hparams.n_text_layer);
         fprintf(stderr, "%s: n_mels        = %d\n", __func__, hparams.n_mels);
-        fprintf(stderr, "%s: f16           = %d\n", __func__, hparams.f16);
+        fprintf(stderr, "%s: ftype (src)   = %d\n", __func__, hparams.ftype);
+        fprintf(stderr, "%s: qntvr (src)   = %d\n", __func__, qntvr_src);
+        fprintf(stderr, "%s: ftype (dst)   = %d\n", __func__, ftype_dst);
+        fprintf(stderr, "%s: qntvr (dst)   = %d\n", __func__, GGML_QNT_VERSION);
 
         fout.write((char *) &hparams.n_vocab,       sizeof(hparams.n_vocab));
         fout.write((char *) &hparams.n_audio_ctx,   sizeof(hparams.n_audio_ctx));
@@ -103,7 +109,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
         fout.write((char *) &hparams.n_text_head,  sizeof(hparams.n_text_head));
         fout.write((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
         fout.write((char *) &hparams.n_mels,       sizeof(hparams.n_mels));
-        fout.write((char *) &ftype,                sizeof(hparams.f16));
+        fout.write((char *) &ftype,                sizeof(hparams.ftype));
     }
 
     // load mel filters
diff --git a/examples/whisper/whisper.cpp b/examples/whisper/whisper.cpp
index 158aa0b9..9fd20417 100644
--- a/examples/whisper/whisper.cpp
+++ b/examples/whisper/whisper.cpp
@@ -861,6 +861,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
             model.type = e_model::MODEL_LARGE;
         }
 
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+
         // for the big tensors, we have the option to store the data in 16-bit floats or quantized
         // in order to save memory and also to speed up the computation
         wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
@@ -871,6 +873,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
 
         const size_t scale = model.hparams.ftype ? 1 : 2;
 
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
         fprintf(stderr, "%s: n_vocab       = %d\n", __func__, hparams.n_vocab);
         fprintf(stderr, "%s: n_audio_ctx   = %d\n", __func__, hparams.n_audio_ctx);
         fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
@@ -882,6 +886,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
         fprintf(stderr, "%s: n_text_layer  = %d\n", __func__, hparams.n_text_layer);
         fprintf(stderr, "%s: n_mels        = %d\n", __func__, hparams.n_mels);
         fprintf(stderr, "%s: ftype         = %d\n", __func__, model.hparams.ftype);
+        fprintf(stderr, "%s: qntvr         = %d\n", __func__, qntvr);
         fprintf(stderr, "%s: type          = %d\n", __func__, model.type);
 
         // print memory requirements
diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h
index 508dd69b..4ece9cf4 100644
--- a/include/ggml/ggml.h
+++ b/include/ggml/ggml.h
@@ -190,6 +190,9 @@
 #define GGML_FILE_MAGIC   0x67676d6c // "ggml"
 #define GGML_FILE_VERSION 1
 
+#define GGML_QNT_VERSION        0    // bump this on quantization format changes
+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
+
 #define GGML_MAX_DIMS     4
 #define GGML_MAX_NODES    4096
 #define GGML_MAX_PARAMS   16
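Two properties of the encoding defined by these macros are worth spelling out. It is backward compatible: files written before this commit carry a raw ftype far below GGML_QNT_VERSION_FACTOR, so they decode to qntvr == 0, matching the initial GGML_QNT_VERSION of 0. And GGML_QNT_VERSION_FACTOR really must never change, because it is baked into every header already on disk: if a future GGML_QNT_VERSION of 2 quantized a q4_1 model (ftype 3, going by the ggml_ftype enum), the header would store 2 * 1000 + 3 = 2003, and readers recover 2003 / 1000 = 2 and 2003 % 1000 = 3; decoding the same bytes with a different factor would misparse both fields.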