From: Georgi Gerganov <redacted>
Date: Sat, 22 Apr 2023 09:52:25 +0000 (+0300)
Subject: examples : add Q4_2 and Q4_3 quantization support
X-Git-Tag: upstream/0.0.1642~1531
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=818d7a7da52c2e5c7f2aecc2f5909bd41bc1b351;p=pkg%2Fggml%2Fsources%2Fggml

examples : add Q4_2 and Q4_3 quantization support
---

diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp
index 5998d53f..c52cae1b 100644
--- a/examples/gpt-2/main.cpp
+++ b/examples/gpt-2/main.cpp
@@ -139,6 +139,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
         case 1: wtype = GGML_TYPE_F16;  break;
         case 2: wtype = GGML_TYPE_Q4_0; break;
         case 3: wtype = GGML_TYPE_Q4_1; break;
+        case 5: wtype = GGML_TYPE_Q4_2; break;
+        case 6: wtype = GGML_TYPE_Q4_3; break;
         default:
                 {
                     fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
@@ -345,7 +347,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
             }
 
             if (0) {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
+                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
                 printf("%24s - [%5lld, %5lld], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
             }
 
@@ -356,6 +358,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab &
                 case 1: bpe = ggml_type_size(GGML_TYPE_F16);  break;
                 case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
                 case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
+                case 5: bpe = ggml_type_size(GGML_TYPE_Q4_2); assert(ne[0] % 64 == 0); break;
+                case 6: bpe = ggml_type_size(GGML_TYPE_Q4_3); assert(ne[0] % 64 == 0); break;
                 default:
                         {
                             fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
diff --git a/examples/gpt-2/quantize.cpp b/examples/gpt-2/quantize.cpp
index afb29999..fd9a22d5 100644
--- a/examples/gpt-2/quantize.cpp
+++ b/examples/gpt-2/quantize.cpp
@@ -29,10 +29,12 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
     switch (itype) {
         case 2: type = GGML_TYPE_Q4_0; break;
         case 3: type = GGML_TYPE_Q4_1; break;
+        case 5: type = GGML_TYPE_Q4_2; break;
+        case 6: type = GGML_TYPE_Q4_3; break;
         default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
     };
 
-    if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
+    if (!ggml_is_quantized(type)) {
         fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
         return false;
     }
@@ -155,7 +157,7 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
             finp.read (&name[0], length);
 
             {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
+                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
                 printf("%24s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
             }
 
@@ -227,6 +229,14 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam
                         {
                             cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                         } break;
+                    case GGML_TYPE_Q4_2:
+                        {
+                            cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                        } break;
+                    case GGML_TYPE_Q4_3:
+                        {
+                            cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                        } break;
                     default:
                         {
                             fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
@@ -286,6 +296,8 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
         fprintf(stderr, "  type = 2 - q4_0\n");
         fprintf(stderr, "  type = 3 - q4_1\n");
+        fprintf(stderr, "  type = 5 - q4_2\n");
+        fprintf(stderr, "  type = 6 - q4_3\n");
         return 1;
     }
 
diff --git a/examples/gpt-j/main.cpp b/examples/gpt-j/main.cpp
index 96615f1d..fbd7c314 100644
--- a/examples/gpt-j/main.cpp
+++ b/examples/gpt-j/main.cpp
@@ -140,6 +140,8 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
         case 1: wtype = GGML_TYPE_F16;  break;
         case 2: wtype = GGML_TYPE_Q4_0; break;
         case 3: wtype = GGML_TYPE_Q4_1; break;
+        case 5: wtype = GGML_TYPE_Q4_2; break;
+        case 6: wtype = GGML_TYPE_Q4_3; break;
         default:
                 {
                     fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n",
@@ -341,7 +343,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
             }
 
             if (0) {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
+                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
                 printf("%24s - [%5lld, %5lld], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
             }
 
@@ -352,6 +354,8 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab &
                 case 1: bpe = ggml_type_size(GGML_TYPE_F16);  break;
                 case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
                 case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
+                case 5: bpe = ggml_type_size(GGML_TYPE_Q4_2); assert(ne[0] % 64 == 0); break;
+                case 6: bpe = ggml_type_size(GGML_TYPE_Q4_3); assert(ne[0] % 64 == 0); break;
                 default:
                         {
                             fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
diff --git a/examples/gpt-j/quantize.cpp b/examples/gpt-j/quantize.cpp
index 611241af..2fff95bd 100644
--- a/examples/gpt-j/quantize.cpp
+++ b/examples/gpt-j/quantize.cpp
@@ -30,10 +30,12 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
     switch (itype) {
         case 2: type = GGML_TYPE_Q4_0; break;
         case 3: type = GGML_TYPE_Q4_1; break;
+        case 5: type = GGML_TYPE_Q4_2; break;
+        case 6: type = GGML_TYPE_Q4_3; break;
         default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
     };
 
-    if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
+    if (!ggml_is_quantized(type)) {
         fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
         return false;
     }
@@ -158,7 +160,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
             finp.read (&name[0], length);
 
             {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
+                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
                 printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
             }
 
@@ -228,6 +230,14 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam
                         {
                             cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                         } break;
+                    case GGML_TYPE_Q4_2:
+                        {
+                            cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                        } break;
+                    case GGML_TYPE_Q4_3:
+                        {
+                            cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                        } break;
                     default:
                         {
                             fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
@@ -287,6 +297,8 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
         fprintf(stderr, "  type = 2 - q4_0\n");
         fprintf(stderr, "  type = 3 - q4_1\n");
+        fprintf(stderr, "  type = 5 - q4_2\n");
+        fprintf(stderr, "  type = 6 - q4_3\n");
         return 1;
     }
 
diff --git a/examples/stablelm/main.cpp b/examples/stablelm/main.cpp
index 7cccb687..f415bffb 100644
--- a/examples/stablelm/main.cpp
+++ b/examples/stablelm/main.cpp
@@ -347,7 +347,7 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_
             }
 
             if (0) {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", };
+                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
                 printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
             }
 
diff --git a/examples/stablelm/quantize.cpp b/examples/stablelm/quantize.cpp
index 2896def7..25d96168 100644
--- a/examples/stablelm/quantize.cpp
+++ b/examples/stablelm/quantize.cpp
@@ -152,7 +152,7 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string &
             finp.read (&name[0], length);
 
             {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2" };
+                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
                 printf("%64s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
             }
 
diff --git a/examples/whisper/quantize.cpp b/examples/whisper/quantize.cpp
index 0f3f2675..ae3a5b8a 100644
--- a/examples/whisper/quantize.cpp
+++ b/examples/whisper/quantize.cpp
@@ -41,10 +41,12 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
     switch (itype) {
         case 2: type = GGML_TYPE_Q4_0; break;
         case 3: type = GGML_TYPE_Q4_1; break;
+        case 5: type = GGML_TYPE_Q4_2; break;
+        case 6: type = GGML_TYPE_Q4_3; break;
         default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
     };
 
-    if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) {
+    if (!ggml_is_quantized(type)) {
         fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
         return false;
     }
@@ -196,7 +198,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
             finp.read (&name[0], length);
 
             {
-                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", };
+                static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
                 printf("%48s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ftype_str[ftype]);
             }
 
@@ -270,6 +272,14 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f
                         {
                             cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
                         } break;
+                    case GGML_TYPE_Q4_2:
+                        {
+                            cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                        } break;
+                    case GGML_TYPE_Q4_3:
+                        {
+                            cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+                        } break;
                     default:
                         {
                             fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
@@ -329,6 +339,8 @@ int main(int argc, char ** argv) {
         fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
         fprintf(stderr, "  type = 2 - q4_0\n");
         fprintf(stderr, "  type = 3 - q4_1\n");
+        fprintf(stderr, "  type = 5 - q4_2\n");
+        fprintf(stderr, "  type = 6 - q4_3\n");
         return 1;
     }