examples : add Q4_2 and Q4_3 quantization support

author Georgi Gerganov <redacted>

Sat, 22 Apr 2023 09:52:25 +0000 (12:52 +0300)

committer Georgi Gerganov <redacted>

Sat, 22 Apr 2023 09:52:43 +0000 (12:52 +0300)
author Georgi Gerganov <redacted>
Sat, 22 Apr 2023 09:52:25 +0000 (12:52 +0300)
committer Georgi Gerganov <redacted>
Sat, 22 Apr 2023 09:52:43 +0000 (12:52 +0300)
diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp

index 5998d53f6485c4cc58f79d56198d889670a42e6b..c52cae1bc0be8e65cdcfab47ed54e36d340f9fd3 100644 (file)
--- a/examples/gpt-2/main.cpp
+++ b/examples/gpt-2/main.cpp
@@ -139,6 +139,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
          case 1: wtype = GGML_TYPE_F16;  break;
          case 2: wtype = GGML_TYPE_Q4_0; break;
          case 3: wtype = GGML_TYPE_Q4_1; break;
+        case 5: wtype = GGML_TYPE_Q4_2; break;
+        case 6: wtype = GGML_TYPE_Q4_3; break;
          default:
                  {
                      fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
@@ -345,7 +347,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
              }
  
              if (0) {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
+                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
                  printf("%24s - [%5lld, %5lld], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
              }
  
@@ -356,6 +358,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
                  case 1: bpe = ggml_type_size(GGML_TYPE_F16);  break;
                  case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
                  case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
+                case 5: bpe = ggml_type_size(GGML_TYPE_Q4_2); assert(ne[0] % 64 == 0); break;
+                case 6: bpe = ggml_type_size(GGML_TYPE_Q4_3); assert(ne[0] % 64 == 0); break;
                  default:
                          {
                              fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
diff --git a/examples/gpt-2/quantize.cpp b/examples/gpt-2/quantize.cpp

index afb29999f4d2ba4bbab182c9953ef9ab286cb996..fd9a22d5017679b249f2606d739a3a5010914870 100644 (file)
--- a/examples/gpt-2/quantize.cpp
+++ b/examples/gpt-2/quantize.cpp
@@ -29,10 +29,12 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
      switch (itype) {
          case 2: type = GGML_TYPE_Q4_0; break;
          case 3: type = GGML_TYPE_Q4_1; break;
+        case 5: type = GGML_TYPE_Q4_2; break;
+        case 6: type = GGML_TYPE_Q4_3; break;
          default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
      };
  
-    if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
+    if (!ggml_is_quantized(type)) {
          fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
          return false;
      }
@@ -155,7 +157,7 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
              finp.read (&name[0], length);
  
              {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
+                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
                  printf("%24s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
              }
  
@@ -227,6 +229,14 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
                          {
                              cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                          } break;
+                    case GGML_TYPE_Q4_2:
+                        {
+                            cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                        } break;
+                    case GGML_TYPE_Q4_3:
+                        {
+                            cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                        } break;
                      default:
                          {
                              fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
@@ -286,6 +296,8 @@ int main(int argc, char ** argv) {
          fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
          fprintf(stderr, "  type = 2 - q4_0\n");
          fprintf(stderr, "  type = 3 - q4_1\n");
+        fprintf(stderr, "  type = 5 - q4_2\n");
+        fprintf(stderr, "  type = 6 - q4_3\n");
          return 1;
      }
  
diff --git a/examples/gpt-j/main.cpp b/examples/gpt-j/main.cpp

index 96615f1dcdb97b03f297e834ee6c287f1713512b..fbd7c3144138ac60ee14940bfc150bc9888eb8d0 100644 (file)
--- a/examples/gpt-j/main.cpp
+++ b/examples/gpt-j/main.cpp
@@ -140,6 +140,8 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
          case 1: wtype = GGML_TYPE_F16;  break;
          case 2: wtype = GGML_TYPE_Q4_0; break;
          case 3: wtype = GGML_TYPE_Q4_1; break;
+        case 5: wtype = GGML_TYPE_Q4_2; break;
+        case 6: wtype = GGML_TYPE_Q4_3; break;
          default:
                  {
                      fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
@@ -341,7 +343,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
              }
  
              if (0) {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
+                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
                  printf("%24s - [%5lld, %5lld], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
              }
  
@@ -352,6 +354,8 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
                  case 1: bpe = ggml_type_size(GGML_TYPE_F16);  break;
                  case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
                  case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
+                case 5: bpe = ggml_type_size(GGML_TYPE_Q4_2); assert(ne[0] % 64 == 0); break;
+                case 6: bpe = ggml_type_size(GGML_TYPE_Q4_3); assert(ne[0] % 64 == 0); break;
                  default:
                          {
                              fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
diff --git a/examples/gpt-j/quantize.cpp b/examples/gpt-j/quantize.cpp

index 611241afd1c568130e502eac100a1e2ae8417a2c..2fff95bd4de697bb3399c83e711a038a0fa15c0d 100644 (file)
--- a/examples/gpt-j/quantize.cpp
+++ b/examples/gpt-j/quantize.cpp
@@ -30,10 +30,12 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
      switch (itype) {
          case 2: type = GGML_TYPE_Q4_0; break;
          case 3: type = GGML_TYPE_Q4_1; break;
+        case 5: type = GGML_TYPE_Q4_2; break;
+        case 6: type = GGML_TYPE_Q4_3; break;
          default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
      };
  
-    if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
+    if (!ggml_is_quantized(type)) {
          fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
          return false;
      }
@@ -158,7 +160,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
              finp.read (&name[0], length);
  
              {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
+                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
                  printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
              }
  
@@ -228,6 +230,14 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
                          {
                              cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                          } break;
+                    case GGML_TYPE_Q4_2:
+                        {
+                            cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                        } break;
+                    case GGML_TYPE_Q4_3:
+                        {
+                            cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                        } break;
                      default:
                          {
                              fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
@@ -287,6 +297,8 @@ int main(int argc, char ** argv) {
          fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
          fprintf(stderr, "  type = 2 - q4_0\n");
          fprintf(stderr, "  type = 3 - q4_1\n");
+        fprintf(stderr, "  type = 5 - q4_2\n");
+        fprintf(stderr, "  type = 6 - q4_3\n");
          return 1;
      }
  
diff --git a/examples/stablelm/main.cpp b/examples/stablelm/main.cpp

index 7cccb687a47d090ae9530b9399c9614c10cfb13b..f415bffb1888a340e7323c9dfe5bb8e671febc19 100644 (file)
--- a/examples/stablelm/main.cpp
+++ b/examples/stablelm/main.cpp
@@ -347,7 +347,7 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_
              }
  
              if (0) {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", };
+                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
                  printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
              }
  
diff --git a/examples/stablelm/quantize.cpp b/examples/stablelm/quantize.cpp

index 2896def74bc75c05d884fab8bbc3939f0f55d748..25d96168df7272099ecfc90184cf509b56112f83 100644 (file)
--- a/examples/stablelm/quantize.cpp
+++ b/examples/stablelm/quantize.cpp
@@ -152,7 +152,7 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
              finp.read (&name[0], length);
  
              {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2" };
+                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
                  printf("%64s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
              }
  
diff --git a/examples/whisper/quantize.cpp b/examples/whisper/quantize.cpp

index 0f3f267532a091458d17d868f05649ac5833724f..ae3a5b8a9e3a4fcddd98c5932c91a6833948556a 100644 (file)
--- a/examples/whisper/quantize.cpp
+++ b/examples/whisper/quantize.cpp
@@ -41,10 +41,12 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
      switch (itype) {
          case 2: type = GGML_TYPE_Q4_0; break;
          case 3: type = GGML_TYPE_Q4_1; break;
+        case 5: type = GGML_TYPE_Q4_2; break;
+        case 6: type = GGML_TYPE_Q4_3; break;
          default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
      };
  
-    if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
+    if (!ggml_is_quantized(type)) {
          fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
          return false;
      }
@@ -196,7 +198,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
              finp.read (&name[0], length);
  
              {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
+                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
                  printf("%48s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ftype_str[ftype]);
              }
  
@@ -270,6 +272,14 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
                          {
                              cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                          } break;
+                    case GGML_TYPE_Q4_2:
+                        {
+                            cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                        } break;
+                    case GGML_TYPE_Q4_3:
+                        {
+                            cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                        } break;
                      default:
                          {
                              fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
@@ -329,6 +339,8 @@ int main(int argc, char ** argv) {
          fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
          fprintf(stderr, "  type = 2 - q4_0\n");
          fprintf(stderr, "  type = 3 - q4_1\n");
+        fprintf(stderr, "  type = 5 - q4_2\n");
+        fprintf(stderr, "  type = 6 - q4_3\n");
          return 1;
      }
author	Georgi Gerganov <redacted>
	Sat, 22 Apr 2023 09:52:25 +0000 (12:52 +0300)
committer	Georgi Gerganov <redacted>
	Sat, 22 Apr 2023 09:52:43 +0000 (12:52 +0300)
examples/gpt-2/main.cpp		patch | blob | history
examples/gpt-2/quantize.cpp		patch | blob | history
examples/gpt-j/main.cpp		patch | blob | history
examples/gpt-j/quantize.cpp		patch | blob | history
examples/stablelm/main.cpp		patch | blob | history
examples/stablelm/quantize.cpp		patch | blob | history
examples/whisper/quantize.cpp		patch | blob | history