fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+ const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
printf("%s: ftype = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr = %d\n", __func__, qntvr);
+
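+ // the ftype read from disk also encodes the quantization version
+ // (in multiples of GGML_QNT_VERSION_FACTOR); keep only the low part
+ // so the rest of the loader sees a plain ggml_ftype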
+ hparams.ftype %= GGML_QNT_VERSION_FACTOR;
}
// load vocab
finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
- printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
- printf("%s: n_head = %d\n", __func__, hparams.n_head);
- printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
- printf("%s: ftype = %d\n", __func__, hparams.ftype);
+ const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+ const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+ printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
+ printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
+ printf("%s: n_head = %d\n", __func__, hparams.n_head);
+ printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+ printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+ printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+ printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
- fout.write((char *) &ftype, sizeof(hparams.ftype));
+ fout.write((char *) &ftype_dst, sizeof(ftype_dst));
}
// load vocab
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+ const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: ftype = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr = %d\n", __func__, qntvr);
+
+ hparams.ftype %= GGML_QNT_VERSION_FACTOR;
}
// load vocab
int32_t n_embd = 768;
int32_t n_head = 12;
int32_t n_layer = 12;
- int32_t f16 = 1;
+ int32_t ftype = 1;
};
// quantize a model
finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
- finp.read((char *) &hparams.f16, sizeof(hparams.f16));
+ finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
- printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
- printf("%s: n_head = %d\n", __func__, hparams.n_head);
- printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
- printf("%s: f16 = %d\n", __func__, hparams.f16);
+ const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+ const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+ printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
+ printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
+ printf("%s: n_head = %d\n", __func__, hparams.n_head);
+ printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+ printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+ printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+ printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
- fout.write((char *) &ftype, sizeof(hparams.f16));
+ fout.write((char *) &ftype_dst, sizeof(ftype_dst));
}
// load vocab
fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+ const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
printf("%s: ftype = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr = %d\n", __func__, qntvr);
+
+ hparams.ftype %= GGML_QNT_VERSION_FACTOR;
}
// load vocab
int32_t n_head = 16;
int32_t n_layer = 28;
int32_t n_rot = 64;
- int32_t f16 = 1;
+ int32_t ftype = 1;
};
// quantize a model
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
- finp.read((char *) &hparams.f16, sizeof(hparams.f16));
+ finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
- printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
- printf("%s: n_head = %d\n", __func__, hparams.n_head);
- printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
- printf("%s: f16 = %d\n", __func__, hparams.f16);
+ const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+ const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+ printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
+ printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
+ printf("%s: n_head = %d\n", __func__, hparams.n_head);
+ printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+ printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+ printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+ printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
- fout.write((char *) &ftype, sizeof(hparams.f16));
+ fout.write((char *) &ftype_dst, sizeof(ftype_dst));
}
// load vocab
fin.read((char *) &hparams.par_res, sizeof(hparams.par_res));
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+ const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
printf("%s: par_res = %d\n", __func__, hparams.par_res);
printf("%s: ftype = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr = %d\n", __func__, qntvr);
+
+ hparams.ftype %= GGML_QNT_VERSION_FACTOR;
}
// load vocab
finp.read((char *) &hparams.par_res, sizeof(hparams.par_res));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
- printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
- printf("%s: n_head = %d\n", __func__, hparams.n_head);
- printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
- printf("%s: par_res = %d\n", __func__, hparams.par_res);
- printf("%s: ftype = %d\n", __func__, hparams.ftype);
+ const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+ const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+ printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
+ printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
+ printf("%s: n_head = %d\n", __func__, hparams.n_head);
+ printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+ printf("%s: par_res = %d\n", __func__, hparams.par_res);
+ printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+ printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+ printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
fout.write((char *) &hparams.par_res, sizeof(hparams.par_res));
- fout.write((char *) &ftype, sizeof(hparams.ftype));
+ fout.write((char *) &ftype_dst, sizeof(ftype_dst));
}
// load vocab
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+ const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: ftype = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr = %d\n", __func__, qntvr);
+
+ hparams.ftype %= GGML_QNT_VERSION_FACTOR;
}
// load vocab
int32_t n_embd = 2048;
int32_t n_head = 16;
int32_t n_layer = 24;
- int32_t f16 = 1;
+ int32_t ftype = 1;
};
// quantize a model
finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
- finp.read((char *) &hparams.f16, sizeof(hparams.f16));
+ finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
- printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
- printf("%s: n_head = %d\n", __func__, hparams.n_head);
- printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
- printf("%s: f16 = %d\n", __func__, hparams.f16);
+ const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+ const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+ printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
+ printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
+ printf("%s: n_head = %d\n", __func__, hparams.n_head);
+ printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+ printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+ printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+ printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
- fout.write((char *) &ftype, sizeof(hparams.f16));
+ fout.write((char *) &ftype_dst, sizeof(ftype_dst));
}
// load vocab
int32_t n_text_head = 6;
int32_t n_text_layer = 4;
int32_t n_mels = 80;
- int32_t f16 = 1;
+ int32_t ftype = 1;
};
struct whisper_filters {
finp.read((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
finp.read((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
finp.read((char *) &hparams.n_mels, sizeof(hparams.n_mels));
- finp.read((char *) &hparams.f16, sizeof(hparams.f16));
+ finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+
+ const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+ const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
- fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
+ fprintf(stderr, "%s: ftype (src) = %d\n", __func__, hparams.ftype);
+ fprintf(stderr, "%s: qntvr (src) = %d\n", __func__, qntvr_src);
+ fprintf(stderr, "%s: ftype (dst) = %d\n", __func__, ftype_dst);
+ fprintf(stderr, "%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
fout.write((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
fout.write((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
fout.write((char *) &hparams.n_mels, sizeof(hparams.n_mels));
- fout.write((char *) &ftype, sizeof(hparams.f16));
+ fout.write((char *) &ftype, sizeof(hparams.ftype));
}
// load mel filters
model.type = e_model::MODEL_LARGE;
}
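+ // the ftype read from the file also encodes the quantization version;
+ // extract it before reducing ftype to a plain ggml_ftype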
+ const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
+ hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
// in order to save memory and also to speed up the computation
wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
const size_t scale = model.hparams.ftype ? 1 : 2;
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
fprintf(stderr, "%s: ftype = %d\n", __func__, model.hparams.ftype);
+ fprintf(stderr, "%s: qntvr = %d\n", __func__, qntvr);
fprintf(stderr, "%s: type = %d\n", __func__, model.type);
// print memory requirements
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
#define GGML_FILE_VERSION 1
+#define GGML_QNT_VERSION 0 // bump this on quantization format changes
+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
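+//
+// the quantization version is stored in the ftype field of model files as:
+//
+//   ftype_file = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype
+//
+// and readers recover both values with:
+//
+//   qntvr = ftype_file / GGML_QNT_VERSION_FACTOR
+//   ftype = ftype_file % GGML_QNT_VERSION_FACTOR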
+
#define GGML_MAX_DIMS 4
#define GGML_MAX_NODES 4096
#define GGML_MAX_PARAMS 16
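A minimal stand-alone sketch of the encode/decode round-trip implied by these macros (illustrative only, not part of the patch; the local #defines mirror the values added to ggml.h above, and the example ftype value is arbitrary):

#include <assert.h>
#include <stdint.h>

#define GGML_QNT_VERSION        0    // mirrors ggml.h above
#define GGML_QNT_VERSION_FACTOR 1000 // mirrors ggml.h above

int main(void) {
    const int32_t ftype = 2; // example source ftype (must be < GGML_QNT_VERSION_FACTOR)

    // writer side: fold the quantization version into the stored ftype
    const int32_t ftype_file = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;

    // reader side: split the stored value back apart
    const int32_t qntvr = ftype_file / GGML_QNT_VERSION_FACTOR;
    assert(qntvr == GGML_QNT_VERSION);
    assert(ftype_file % GGML_QNT_VERSION_FACTOR == ftype);

    return 0;
}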