From: Georgi Gerganov Date: Sat, 22 Apr 2023 09:52:25 +0000 (+0300) Subject: examples : add Q4_2 and Q4_3 quantization support X-Git-Tag: upstream/0.0.1642~1531 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=818d7a7da52c2e5c7f2aecc2f5909bd41bc1b351;p=pkg%2Fggml%2Fsources%2Fggml examples : add Q4_2 and Q4_3 quantization support --- diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp index 5998d53f..c52cae1b 100644 --- a/examples/gpt-2/main.cpp +++ b/examples/gpt-2/main.cpp @@ -139,6 +139,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & case 1: wtype = GGML_TYPE_F16; break; case 2: wtype = GGML_TYPE_Q4_0; break; case 3: wtype = GGML_TYPE_Q4_1; break; + case 5: wtype = GGML_TYPE_Q4_2; break; + case 6: wtype = GGML_TYPE_Q4_3; break; default: { fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n", @@ -345,7 +347,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & } if (0) { - static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; + static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" }; printf("%24s - [%5lld, %5lld], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); } @@ -356,6 +358,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & case 1: bpe = ggml_type_size(GGML_TYPE_F16); break; case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break; case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break; + case 5: bpe = ggml_type_size(GGML_TYPE_Q4_2); assert(ne[0] % 64 == 0); break; + case 6: bpe = ggml_type_size(GGML_TYPE_Q4_3); assert(ne[0] % 64 == 0); break; default: { fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype); diff --git a/examples/gpt-2/quantize.cpp b/examples/gpt-2/quantize.cpp index afb29999..fd9a22d5 100644 --- a/examples/gpt-2/quantize.cpp +++ b/examples/gpt-2/quantize.cpp @@ -29,10 +29,12 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam switch (itype) { case 2: type = GGML_TYPE_Q4_0; break; case 3: type = GGML_TYPE_Q4_1; break; + case 5: type = GGML_TYPE_Q4_2; break; + case 6: type = GGML_TYPE_Q4_3; break; default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1; }; - if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) { + if (!ggml_is_quantized(type)) { fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type); return false; } @@ -155,7 +157,7 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam finp.read (&name[0], length); { - static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; + static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" }; printf("%24s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]); } @@ -227,6 +229,14 @@ bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fnam { cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); } break; + case GGML_TYPE_Q4_2: + { + cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q4_3: + { + cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; default: { fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type); @@ -286,6 +296,8 @@ int main(int argc, char ** argv) { fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); fprintf(stderr, " type = 2 - q4_0\n"); fprintf(stderr, " type = 3 - q4_1\n"); + fprintf(stderr, " type = 5 - q4_2\n"); + fprintf(stderr, " type = 6 - q4_3\n"); return 1; } diff --git a/examples/gpt-j/main.cpp b/examples/gpt-j/main.cpp index 96615f1d..fbd7c314 100644 --- a/examples/gpt-j/main.cpp +++ b/examples/gpt-j/main.cpp @@ -140,6 +140,8 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & case 1: wtype = GGML_TYPE_F16; break; case 2: wtype = GGML_TYPE_Q4_0; break; case 3: wtype = GGML_TYPE_Q4_1; break; + case 5: wtype = GGML_TYPE_Q4_2; break; + case 6: wtype = GGML_TYPE_Q4_3; break; default: { fprintf(stderr, "%s: invalid model file '%s' (bad f16 value %d)\n", @@ -341,7 +343,7 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & } if (0) { - static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; + static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" }; printf("%24s - [%5lld, %5lld], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); } @@ -352,6 +354,8 @@ bool gptj_model_load(const std::string & fname, gptj_model & model, gpt_vocab & case 1: bpe = ggml_type_size(GGML_TYPE_F16); break; case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break; case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break; + case 5: bpe = ggml_type_size(GGML_TYPE_Q4_2); assert(ne[0] % 64 == 0); break; + case 6: bpe = ggml_type_size(GGML_TYPE_Q4_3); assert(ne[0] % 64 == 0); break; default: { fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype); diff --git a/examples/gpt-j/quantize.cpp b/examples/gpt-j/quantize.cpp index 611241af..2fff95bd 100644 --- a/examples/gpt-j/quantize.cpp +++ b/examples/gpt-j/quantize.cpp @@ -30,10 +30,12 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam switch (itype) { case 2: type = GGML_TYPE_Q4_0; break; case 3: type = GGML_TYPE_Q4_1; break; + case 5: type = GGML_TYPE_Q4_2; break; + case 6: type = GGML_TYPE_Q4_3; break; default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1; }; - if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) { + if (!ggml_is_quantized(type)) { fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type); return false; } @@ -158,7 +160,7 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam finp.read (&name[0], length); { - static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; + static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" }; printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]); } @@ -228,6 +230,14 @@ bool gptj_model_quantize(const std::string & fname_inp, const std::string & fnam { cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); } break; + case GGML_TYPE_Q4_2: + { + cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q4_3: + { + cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; default: { fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type); @@ -287,6 +297,8 @@ int main(int argc, char ** argv) { fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); fprintf(stderr, " type = 2 - q4_0\n"); fprintf(stderr, " type = 3 - q4_1\n"); + fprintf(stderr, " type = 5 - q4_2\n"); + fprintf(stderr, " type = 6 - q4_3\n"); return 1; } diff --git a/examples/stablelm/main.cpp b/examples/stablelm/main.cpp index 7cccb687..f415bffb 100644 --- a/examples/stablelm/main.cpp +++ b/examples/stablelm/main.cpp @@ -347,7 +347,7 @@ bool stablelm_model_load(const std::string & fname, stablelm_model & model, gpt_ } if (0) { - static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", }; + static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" }; printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); } diff --git a/examples/stablelm/quantize.cpp b/examples/stablelm/quantize.cpp index 2896def7..25d96168 100644 --- a/examples/stablelm/quantize.cpp +++ b/examples/stablelm/quantize.cpp @@ -152,7 +152,7 @@ bool stablelm_model_quantize(const std::string & fname_inp, const std::string & finp.read (&name[0], length); { - static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2" }; + static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" }; printf("%64s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]); } diff --git a/examples/whisper/quantize.cpp b/examples/whisper/quantize.cpp index 0f3f2675..ae3a5b8a 100644 --- a/examples/whisper/quantize.cpp +++ b/examples/whisper/quantize.cpp @@ -41,10 +41,12 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f switch (itype) { case 2: type = GGML_TYPE_Q4_0; break; case 3: type = GGML_TYPE_Q4_1; break; + case 5: type = GGML_TYPE_Q4_2; break; + case 6: type = GGML_TYPE_Q4_3; break; default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1; }; - if (type != GGML_TYPE_Q4_0 && type != GGML_TYPE_Q4_1) { + if (!ggml_is_quantized(type)) { fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type); return false; } @@ -196,7 +198,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f finp.read (&name[0], length); { - static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", }; + static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" }; printf("%48s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ftype_str[ftype]); } @@ -270,6 +272,14 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f { cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); } break; + case GGML_TYPE_Q4_2: + { + cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; + case GGML_TYPE_Q4_3: + { + cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data()); + } break; default: { fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type); @@ -329,6 +339,8 @@ int main(int argc, char ** argv) { fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); fprintf(stderr, " type = 2 - q4_0\n"); fprintf(stderr, " type = 3 - q4_1\n"); + fprintf(stderr, " type = 5 - q4_2\n"); + fprintf(stderr, " type = 6 - q4_3\n"); return 1; }