target_include_directories(common PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
add_library(common-ggml STATIC common-ggml.cpp)
+target_link_libraries(common-ggml PRIVATE ggml)
target_include_directories(common-ggml PUBLIC ${CMAKE_CURRENT_SOURCE_DIR})
add_subdirectory(gpt-2)
#include "common-ggml.h"
+
+#include "ggml.h"
+
+#include <regex>
+
+bool ggml_common_quantize_0(
+ std::ifstream & finp,
+ std::ofstream & fout,
+ const ggml_mtype mtype,
+ const std::vector<std::string> & to_quant,
+ const std::vector<std::string> & to_skip) {
+
+ ggml_type qtype = GGML_TYPE_F32;
+
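+ // map the model file type to the corresponding ggml tensor type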
+ switch (mtype) {
+ case 2: qtype = GGML_TYPE_Q4_0; break;
+ case 3: qtype = GGML_TYPE_Q4_1; break;
+ case 5: qtype = GGML_TYPE_Q4_2; break;
+ case 6: qtype = GGML_TYPE_Q4_3; break;
+ default:
+ {
+ fprintf(stderr, "%s: invalid model type %d\n", __func__, mtype);
+ return false;
+ }
+ };
+
+ if (!ggml_is_quantized(qtype)) {
+ fprintf(stderr, "%s: invalid quantization type %d (%s)\n", __func__, qtype, ggml_type_name(qtype));
+ return false;
+ }
+
+ size_t total_size_org = 0;
+ size_t total_size_new = 0;
+
+ std::vector<float> work;
+
+ std::vector<uint8_t> data_u8;
+ std::vector<ggml_fp16_t> data_f16;
+ std::vector<float> data_f32;
+
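+ // histogram of the 4-bit quantized values (16 buckets), accumulated over all tensors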
+ std::vector<int64_t> hist_all(1 << 4, 0);
+
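+ // process the model tensor by tensor: header, name, then data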
+ while (true) {
+ int32_t n_dims;
+ int32_t length;
+ int32_t ttype;
+
+ finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+ finp.read(reinterpret_cast<char *>(&length), sizeof(length));
+ finp.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
+
+ if (finp.eof()) {
+ break;
+ }
+
+ int32_t nelements = 1;
+ int32_t ne[2] = { 1, 1 };
+ for (int i = 0; i < n_dims; ++i) {
+ finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+ nelements *= ne[i];
+ }
+
+ std::string name(length, 0);
+ finp.read (&name[0], length);
+
+ printf("%64s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ggml_type_name((ggml_type) ttype));
+
+ bool quantize = false;
+
+ // check if we should quantize this tensor
+ for (const auto & s : to_quant) {
+ if (std::regex_match(name, std::regex(s))) {
+ quantize = true;
+ break;
+ }
+ }
+
+ // check if we should skip this tensor
+ for (const auto & s : to_skip) {
+ if (std::regex_match(name, std::regex(s))) {
+ quantize = false;
+ break;
+ }
+ }
+
+ // quantize only 2D tensors; 1D tensors keep their original precision
+ quantize &= (n_dims == 2);
+
+ if (quantize) {
+ if (ttype != GGML_TYPE_F32 && ttype != GGML_TYPE_F16) {
+ fprintf(stderr, "%s: unsupported ttype %d (%s) for integer quantization\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
+ return false;
+ }
+
+ if (ttype == GGML_TYPE_F16) {
+ data_f16.resize(nelements);
+ finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
+ data_f32.resize(nelements);
+ for (int i = 0; i < nelements; ++i) {
+ data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
+ }
+ } else {
+ data_f32.resize(nelements);
+ finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
+ }
+
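+ // the tensor header written below carries the quantized type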
+ ttype = qtype;
+ } else {
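+ // non-quantized tensors are passed through unchanged as raw bytes (f32 or f16)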
+ const int bpe = (ttype == 0) ? sizeof(float) : sizeof(uint16_t);
+
+ data_u8.resize(nelements*bpe);
+ finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
+ }
+
+ fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+ fout.write(reinterpret_cast<char *>(&length), sizeof(length));
+ fout.write(reinterpret_cast<char *>(&ttype), sizeof(ttype));
+ for (int i = 0; i < n_dims; ++i) {
+ fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+ }
+ fout.write(&name[0], length);
+
+ if (quantize) {
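+ // the 4-bit quantized output is always smaller than the f32 input, so nelements floats is enough scratch space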
+ work.resize(nelements); // for quantization
+
+ size_t cur_size = 0;
+ std::vector<int64_t> hist_cur(1 << 4, 0);
+
+ switch (ttype) {
+ case GGML_TYPE_Q4_0:
+ {
+ cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+ } break;
+ case GGML_TYPE_Q4_1:
+ {
+ cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+ } break;
+ case GGML_TYPE_Q4_2:
+ {
+ cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+ } break;
+ case GGML_TYPE_Q4_3:
+ {
+ cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
+ } break;
+ default:
+ {
+ fprintf(stderr, "%s: unsupported quantization type %d (%s)\n", __func__, ttype, ggml_type_name((ggml_type) ttype));
+ return false;
+ }
+ }
+
+ fout.write(reinterpret_cast<char *>(work.data()), cur_size);
+ total_size_new += cur_size;
+
+ printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
+ for (int i = 0; i < (int) hist_cur.size(); ++i) {
+ hist_all[i] += hist_cur[i];
+ }
+
+ for (int i = 0; i < (int) hist_cur.size(); ++i) {
+ printf("%5.3f ", hist_cur[i] / (float)nelements);
+ }
+ printf("\n");
+ } else {
+ printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
+ fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
+ total_size_new += data_u8.size();
+ }
+
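+ // the original size is accounted as if every tensor were f32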
+ total_size_org += nelements * sizeof(float);
+ }
+
+ printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
+ printf("%s: quant size = %8.2f MB | mtype = %d (%s)\n", __func__, total_size_new/1024.0/1024.0, mtype, ggml_type_name(qtype));
+
+ {
+ int64_t sum_all = 0;
+ for (int i = 0; i < (int) hist_all.size(); ++i) {
+ sum_all += hist_all[i];
+ }
+
+ printf("%s: hist: ", __func__);
+ for (int i = 0; i < (int) hist_all.size(); ++i) {
+ printf("%5.3f ", hist_all[i] / (float)sum_all);
+ }
+ printf("\n");
+ }
+
+ return true;
+}
#pragma once
+#include <fstream>
+#include <vector>
+#include <string>
+
+// model file types
+enum ggml_mtype {
+ GGML_MTYPE_ALL_F32 = 0,
+ GGML_MTYPE_MOSTLY_F16 = 1, // except 1d tensors
+ GGML_MTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors
+ GGML_MTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors
+ GGML_MTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16
+ GGML_MTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors
+ GGML_MTYPE_MOSTLY_Q4_3 = 6, // except 1d tensors
+};
+
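+// quantize the tensors read from finp and write the result to fout;
+// 2D tensors whose names match a regex in to_quant are quantized to the type selected by mtype,
+// unless they also match a regex in to_skip; everything else is copied through unchanged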
+bool ggml_common_quantize_0(
+ std::ifstream & finp,
+ std::ofstream & fout,
+ const ggml_mtype mtype,
+ const std::vector<std::string> & to_quant,
+ const std::vector<std::string> & to_skip);
set(TEST_TARGET gpt-2-quantize)
add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
while (true) {
int32_t n_dims;
int32_t length;
- int32_t ftype;
+ int32_t ttype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
- fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
+ fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
if (fin.eof()) {
break;
}
int32_t nelements = 1;
- int64_t ne[2] = { 1, 1 };
+ int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
- int32_t ne_cur;
- fin.read(reinterpret_cast<char *>(&ne_cur), sizeof(ne_cur));
- ne[i] = ne_cur;
+ fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
}
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld], expected [%lld, %lld]\n",
- __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
+ fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+ __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
return false;
}
+ // for debugging
if (0) {
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
- printf("%24s - [%5lld, %5lld], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+ printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
}
- size_t bpe = 0;
-
- switch (ftype) {
- case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
- case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
- case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
- case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
- case 5: bpe = ggml_type_size(GGML_TYPE_Q4_2); assert(ne[0] % 64 == 0); break;
- case 6: bpe = ggml_type_size(GGML_TYPE_Q4_3); assert(ne[0] % 64 == 0); break;
- default:
- {
- fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
- return false;
- }
- };
+ const size_t bpe = ggml_type_size(ggml_type(ttype));
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
#include "ggml/ggml.h"
#include "common.h"
+#include "common-ggml.h"
#include <cassert>
#include <cmath>
};
// quantize a model
-bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, int itype) {
- ggml_type type = GGML_TYPE_Q4_1;
-
- switch (itype) {
- case 2: type = GGML_TYPE_Q4_0; break;
- case 3: type = GGML_TYPE_Q4_1; break;
- case 5: type = GGML_TYPE_Q4_2; break;
- case 6: type = GGML_TYPE_Q4_3; break;
- default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
- };
-
- if (!ggml_is_quantized(type)) {
- fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
- return false;
- }
-
+bool gpt2_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_mtype mtype) {
gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
- fout.write((char *) &itype, sizeof(hparams.f16));
+ fout.write((char *) &mtype, sizeof(hparams.f16));
}
// load vocab
}
}
- // load weights
- {
- size_t total_size_org = 0;
- size_t total_size_new = 0;
-
- std::vector<float> work;
-
- std::vector<uint8_t> data_u8;
- std::vector<ggml_fp16_t> data_f16;
- std::vector<float> data_f32;
-
- std::vector<int64_t> hist_all(1 << 4, 0);
-
- while (true) {
- int32_t n_dims;
- int32_t length;
- int32_t ftype;
-
- finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
- finp.read(reinterpret_cast<char *>(&length), sizeof(length));
- finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
-
- if (finp.eof()) {
- break;
- }
-
- int32_t nelements = 1;
- int32_t ne[2] = { 1, 1 };
- for (int i = 0; i < n_dims; ++i) {
- finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
- nelements *= ne[i];
- }
-
- std::string name(length, 0);
- finp.read (&name[0], length);
-
- {
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
- printf("%24s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
- }
-
- // regexes of tensor names to be quantized
- const std::vector<std::string> k_names = {
- "model/wte",
- "model/lm_head",
- "model/h.*/attn/c_attn/w",
- "model/h.*/attn/c_proj/w",
- "model/h.*/mlp/c_fc/w",
- "model/h.*/mlp/c_proj/w",
- };
-
- bool quantize = false;
- for (const auto & s : k_names) {
- if (std::regex_match(name, std::regex(s))) {
- quantize = true;
- break;
- }
- }
-
- if (quantize) {
- if (ftype != 0 && ftype != 1) {
- fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
- return false;
- }
-
- if (ftype == 1) {
- data_f16.resize(nelements);
- finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
- data_f32.resize(nelements);
- for (int i = 0; i < nelements; ++i) {
- data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
- }
- } else {
- data_f32.resize(nelements);
- finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
- }
-
- ftype = itype;
- } else {
- const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
-
- data_u8.resize(nelements*bpe);
- finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
- }
-
- fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
- fout.write(reinterpret_cast<char *>(&length), sizeof(length));
- fout.write(reinterpret_cast<char *>(&ftype), sizeof(ftype));
- for (int i = 0; i < n_dims; ++i) {
- fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
- }
- fout.write(&name[0], length);
-
- if (quantize) {
- printf("quantizing .. ");
- work.resize(nelements); // for quantization
-
- size_t cur_size = 0;
- std::vector<int64_t> hist_cur(1 << 4, 0);
-
- switch (type) {
- case GGML_TYPE_Q4_0:
- {
- cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q4_1:
- {
- cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q4_2:
- {
- cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q4_3:
- {
- cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- default:
- {
- fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
- return false;
- }
- }
-
- fout.write(reinterpret_cast<char *>(work.data()), cur_size);
- total_size_new += cur_size;
-
- printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
- for (int i = 0; i < hist_cur.size(); ++i) {
- hist_all[i] += hist_cur[i];
- }
-
- for (int i = 0; i < hist_cur.size(); ++i) {
- printf("%5.3f ", hist_cur[i] / (float)nelements);
- }
- printf("\n");
- } else {
- printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
- fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
- total_size_new += data_u8.size();
- }
-
- total_size_org += nelements * sizeof(float);
- }
-
- printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
- printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
-
- {
- int64_t sum_all = 0;
- for (int i = 0; i < hist_all.size(); ++i) {
- sum_all += hist_all[i];
- }
+ // regexes of tensor names to be quantized
+ const std::vector<std::string> to_quant = {
+ "model/wte",
+ "model/lm_head",
+ "model/h.*/attn/c_attn/w",
+ "model/h.*/attn/c_proj/w",
+ "model/h.*/mlp/c_fc/w",
+ "model/h.*/mlp/c_proj/w",
+ };
- printf("%s: hist: ", __func__);
- for (int i = 0; i < hist_all.size(); ++i) {
- printf("%5.3f ", hist_all[i] / (float)sum_all);
- }
- printf("\n");
- }
+ if (!ggml_common_quantize_0(finp, fout, mtype, to_quant, {})) {
+ fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
+ return false;
}
finp.close();
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];
- const int itype = atoi(argv[3]);
+ const int mtype = atoi(argv[3]);
const int64_t t_main_start_us = ggml_time_us();
{
const int64_t t_start_us = ggml_time_us();
- if (!gpt2_model_quantize(fname_inp, fname_out, itype)) {
+ if (!gpt2_model_quantize(fname_inp, fname_out, ggml_mtype(mtype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}
set(TEST_TARGET gpt-j-quantize)
add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
while (true) {
int32_t n_dims;
int32_t length;
- int32_t ftype;
+ int32_t ttype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
- fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
+ fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
if (fin.eof()) {
break;
}
- int64_t nelements = 1;
- int64_t ne[2] = { 1, 1 };
+ int32_t nelements = 1;
+ int32_t ne[2] = { 1, 1 };
for (int i = 0; i < n_dims; ++i) {
- int32_t ne_cur;
- fin.read(reinterpret_cast<char *>(&ne_cur), sizeof(ne_cur));
- ne[i] = ne_cur;
+ fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
nelements *= ne[i];
}
}
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld], expected [%lld, %lld]\n",
- __func__, name.data(), tensor->ne[0], tensor->ne[1], ne[0], ne[1]);
+ fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+ __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
return false;
}
+ // for debugging
if (0) {
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
- printf("%24s - [%5lld, %5lld], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+ printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
}
- size_t bpe = 0;
-
- switch (ftype) {
- case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
- case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
- case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
- case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
- case 5: bpe = ggml_type_size(GGML_TYPE_Q4_2); assert(ne[0] % 64 == 0); break;
- case 6: bpe = ggml_type_size(GGML_TYPE_Q4_3); assert(ne[0] % 64 == 0); break;
- default:
- {
- fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
- return false;
- }
- };
+ const size_t bpe = ggml_type_size(ggml_type(ttype));
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %llu\n",
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
return false;
}
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
- //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+ //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ttype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
total_size += ggml_nbytes(tensor);
if (++n_tensors % 8 == 0) {
printf(".");
#include "ggml/ggml.h"
#include "common.h"
+#include "common-ggml.h"
#include <cassert>
#include <cmath>
};
// quantize a model
-bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, int itype) {
- ggml_type type = GGML_TYPE_Q4_1;
-
- switch (itype) {
- case 2: type = GGML_TYPE_Q4_0; break;
- case 3: type = GGML_TYPE_Q4_1; break;
- case 5: type = GGML_TYPE_Q4_2; break;
- case 6: type = GGML_TYPE_Q4_3; break;
- default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
- };
-
- if (!ggml_is_quantized(type)) {
- fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
- return false;
- }
-
+bool gptj_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_mtype mtype) {
gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
- fout.write((char *) &itype, sizeof(hparams.f16));
+ fout.write((char *) &mtype, sizeof(hparams.f16));
}
// load vocab
}
}
- // load weights
- {
- size_t total_size_org = 0;
- size_t total_size_new = 0;
-
- std::vector<float> work;
-
- std::vector<uint8_t> data_u8;
- std::vector<ggml_fp16_t> data_f16;
- std::vector<float> data_f32;
-
- std::vector<int64_t> hist_all(1 << 4, 0);
-
- while (true) {
- int32_t n_dims;
- int32_t length;
- int32_t ftype;
-
- finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
- finp.read(reinterpret_cast<char *>(&length), sizeof(length));
- finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
-
- if (finp.eof()) {
- break;
- }
-
- int32_t nelements = 1;
- int32_t ne[2] = { 1, 1 };
- for (int i = 0; i < n_dims; ++i) {
- finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
- nelements *= ne[i];
- }
-
- std::string name(length, 0);
- finp.read (&name[0], length);
-
- {
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
- printf("%48s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
- }
-
- // regexes of tensor names to be quantized
- const std::vector<std::string> k_names = {
- ".*weight",
- };
-
- bool quantize = false;
- for (const auto & s : k_names) {
- if (std::regex_match(name, std::regex(s))) {
- quantize = true;
- break;
- }
- }
-
- // quantize only 2D tensors
- quantize &= (n_dims == 2);
-
- if (quantize) {
- if (ftype != 0 && ftype != 1) {
- fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
- return false;
- }
-
- if (ftype == 1) {
- data_f16.resize(nelements);
- finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
- data_f32.resize(nelements);
- for (int i = 0; i < nelements; ++i) {
- data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
- }
- } else {
- data_f32.resize(nelements);
- finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
- }
-
- ftype = itype;
- } else {
- const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
-
- data_u8.resize(nelements*bpe);
- finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
- }
-
- fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
- fout.write(reinterpret_cast<char *>(&length), sizeof(length));
- fout.write(reinterpret_cast<char *>(&ftype), sizeof(ftype));
- for (int i = 0; i < n_dims; ++i) {
- fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
- }
- fout.write(&name[0], length);
-
- if (quantize) {
- printf("quantizing .. ");
- work.resize(nelements); // for quantization
-
- size_t cur_size = 0;
- std::vector<int64_t> hist_cur(1 << 4, 0);
-
- switch (type) {
- case GGML_TYPE_Q4_0:
- {
- cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q4_1:
- {
- cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q4_2:
- {
- cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q4_3:
- {
- cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- default:
- {
- fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
- return false;
- }
- }
-
- fout.write(reinterpret_cast<char *>(work.data()), cur_size);
- total_size_new += cur_size;
-
- printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
- for (int i = 0; i < hist_cur.size(); ++i) {
- hist_all[i] += hist_cur[i];
- }
-
- for (int i = 0; i < hist_cur.size(); ++i) {
- printf("%5.3f ", hist_cur[i] / (float)nelements);
- }
- printf("\n");
- } else {
- printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
- fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
- total_size_new += data_u8.size();
- }
-
- total_size_org += nelements * sizeof(float);
- }
-
- printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
- printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
-
- {
- int64_t sum_all = 0;
- for (int i = 0; i < hist_all.size(); ++i) {
- sum_all += hist_all[i];
- }
+ // regexes of tensor names to be quantized
+ const std::vector<std::string> to_quant = {
+ ".*weight",
+ };
- printf("%s: hist: ", __func__);
- for (int i = 0; i < hist_all.size(); ++i) {
- printf("%5.3f ", hist_all[i] / (float)sum_all);
- }
- printf("\n");
- }
+ if (!ggml_common_quantize_0(finp, fout, mtype, to_quant, {})) {
+ fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
+ return false;
}
finp.close();
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];
- const int itype = atoi(argv[3]);
+ const int mtype = atoi(argv[3]);
const int64_t t_main_start_us = ggml_time_us();
{
const int64_t t_start_us = ggml_time_us();
- if (!gptj_model_quantize(fname_inp, fname_out, itype)) {
+ if (!gptj_model_quantize(fname_inp, fname_out, ggml_mtype(mtype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}
set(TEST_TARGET stablelm-quantize)
add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
while (true) {
int32_t n_dims;
int32_t length;
- int32_t ftype;
+ int32_t ttype;
fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
fin.read(reinterpret_cast<char *>(&length), sizeof(length));
- fin.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
+ fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
if (fin.eof()) {
break;
return false;
}
+ // for debugging
if (0) {
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
- printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ftype_str[ftype], ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+ printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.data(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
}
- size_t bpe = 0;
-
- switch (ftype) {
- case 0: bpe = ggml_type_size(GGML_TYPE_F32); break;
- case 1: bpe = ggml_type_size(GGML_TYPE_F16); break;
- case 2: bpe = ggml_type_size(GGML_TYPE_Q4_0); assert(ne[0] % 64 == 0); break;
- case 3: bpe = ggml_type_size(GGML_TYPE_Q4_1); assert(ne[0] % 64 == 0); break;
- case 5: bpe = ggml_type_size(GGML_TYPE_Q4_2); assert(ne[0] % 64 == 0); break;
- case 6: bpe = ggml_type_size(GGML_TYPE_Q4_3); assert(ne[0] % 64 == 0); break;
- default:
- {
- fprintf(stderr, "%s: unknown ftype %d in model file\n", __func__, ftype);
- return false;
- }
- };
+ const size_t bpe = ggml_type_size(ggml_type(ttype));
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
- //printf("%42s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
total_size += ggml_nbytes(tensor);
if (++n_tensors % 8 == 0) {
printf(".");
#include "ggml/ggml.h"
#include "common.h"
+#include "common-ggml.h"
#include <cassert>
#include <cmath>
};
// quantize a model
-bool stablelm_model_quantize(const std::string & fname_inp, const std::string & fname_out, int itype) {
- ggml_type type = GGML_TYPE_Q4_1;
-
- switch (itype) {
- case 2: type = GGML_TYPE_Q4_0; break;
- case 3: type = GGML_TYPE_Q4_1; break;
- case 5: type = GGML_TYPE_Q4_2; break;
- case 6: type = GGML_TYPE_Q4_3; break;
- default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
- };
-
- if (!ggml_is_quantized(type)) {
- fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
- return false;
- }
-
+bool stablelm_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_mtype mtype) {
gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
- fout.write((char *) &itype, sizeof(hparams.ftype));
+ fout.write((char *) &mtype, sizeof(hparams.ftype));
}
// load vocab
}
}
- // load weights
- {
- size_t total_size_org = 0;
- size_t total_size_new = 0;
-
- std::vector<float> work;
-
- std::vector<uint8_t> data_u8;
- std::vector<ggml_fp16_t> data_f16;
- std::vector<float> data_f32;
-
- std::vector<int64_t> hist_all(1 << 4, 0);
-
- while (true) {
- int32_t n_dims;
- int32_t length;
- int32_t ftype;
-
- finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
- finp.read(reinterpret_cast<char *>(&length), sizeof(length));
- finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
-
- if (finp.eof()) {
- break;
- }
-
- int32_t nelements = 1;
- int32_t ne[2] = { 1, 1 };
- for (int i = 0; i < n_dims; ++i) {
- finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
- nelements *= ne[i];
- }
-
- std::string name(length, 0);
- finp.read (&name[0], length);
-
- {
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
- printf("%64s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ftype_str[ftype]);
- }
-
- // regexes of tensor names to be quantized
- const std::vector<std::string> k_names = {
- ".*weight",
- };
-
- bool quantize = false;
- for (const auto & s : k_names) {
- if (std::regex_match(name, std::regex(s))) {
- quantize = true;
- break;
- }
- }
-
- // quantize only 2D tensors
- quantize &= (n_dims == 2);
-
- if (quantize) {
- if (ftype != 0 && ftype != 1) {
- fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
- return false;
- }
-
- if (ftype == 1) {
- data_f16.resize(nelements);
- finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
- data_f32.resize(nelements);
- for (int i = 0; i < nelements; ++i) {
- data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
- }
- } else {
- data_f32.resize(nelements);
- finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
- }
-
- ftype = itype;
- } else {
- const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
-
- data_u8.resize(nelements*bpe);
- finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
- }
-
- fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
- fout.write(reinterpret_cast<char *>(&length), sizeof(length));
- fout.write(reinterpret_cast<char *>(&ftype), sizeof(ftype));
- for (int i = 0; i < n_dims; ++i) {
- fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
- }
- fout.write(&name[0], length);
-
- if (quantize) {
- printf("quantizing .. ");
- work.resize(nelements); // for quantization
-
- size_t cur_size = 0;
- std::vector<int64_t> hist_cur(1 << 4, 0);
-
- switch (type) {
- case GGML_TYPE_Q4_0:
- {
- cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q4_1:
- {
- cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q4_2:
- {
- cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q4_3:
- {
- cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- default:
- {
- fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
- return false;
- }
- }
-
- fout.write(reinterpret_cast<char *>(work.data()), cur_size);
- total_size_new += cur_size;
-
- printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
- for (int i = 0; i < hist_cur.size(); ++i) {
- hist_all[i] += hist_cur[i];
- }
-
- for (int i = 0; i < hist_cur.size(); ++i) {
- printf("%5.3f ", hist_cur[i] / (float)nelements);
- }
- printf("\n");
- } else {
- printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
- fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
- total_size_new += data_u8.size();
- }
-
- total_size_org += nelements * sizeof(float);
- }
-
- printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
- printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
-
- {
- int64_t sum_all = 0;
- for (int i = 0; i < hist_all.size(); ++i) {
- sum_all += hist_all[i];
- }
+ // regexes of tensor names to be quantized
+ const std::vector<std::string> to_quant = {
+ ".*weight",
+ };
- printf("%s: hist: ", __func__);
- for (int i = 0; i < hist_all.size(); ++i) {
- printf("%5.3f ", hist_all[i] / (float)sum_all);
- }
- printf("\n");
- }
+ if (!ggml_common_quantize_0(finp, fout, mtype, to_quant, {})) {
+ fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
+ return false;
}
finp.close();
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];
- const int itype = atoi(argv[3]);
+ const int mtype = atoi(argv[3]);
const int64_t t_main_start_us = ggml_time_us();
{
const int64_t t_start_us = ggml_time_us();
- if (!stablelm_model_quantize(fname_inp, fname_out, itype)) {
+ if (!stablelm_model_quantize(fname_inp, fname_out, ggml_mtype(mtype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}
set(TEST_TARGET whisper-quantize)
add_executable(${TEST_TARGET} quantize.cpp)
-target_link_libraries(${TEST_TARGET} PRIVATE ggml common)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
#include "ggml/ggml.h"
#include "common.h"
+#include "common-ggml.h"
#include <cassert>
#include <cmath>
};
// quantize a model
-bool whisper_model_quantize(const std::string & fname_inp, const std::string & fname_out, int itype) {
- ggml_type type = GGML_TYPE_Q4_1;
-
- switch (itype) {
- case 2: type = GGML_TYPE_Q4_0; break;
- case 3: type = GGML_TYPE_Q4_1; break;
- case 5: type = GGML_TYPE_Q4_2; break;
- case 6: type = GGML_TYPE_Q4_3; break;
- default: fprintf(stderr, "%s: invalid quantization type %d\n", __func__, itype); return 1;
- };
-
- if (!ggml_is_quantized(type)) {
- fprintf(stderr, "%s: invalid quantization type %d\n", __func__, type);
- return false;
- }
-
+bool whisper_model_quantize(const std::string & fname_inp, const std::string & fname_out, ggml_mtype mtype) {
gpt_vocab vocab;
printf("%s: loading model from '%s'\n", __func__, fname_inp.c_str());
fout.write((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
fout.write((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
fout.write((char *) &hparams.n_mels, sizeof(hparams.n_mels));
- fout.write((char *) &itype, sizeof(hparams.f16));
+ fout.write((char *) &mtype, sizeof(hparams.f16));
}
// load mel filters
}
}
- // load weights
- {
- size_t total_size_org = 0;
- size_t total_size_new = 0;
-
- std::vector<float> work;
-
- std::vector<uint8_t> data_u8;
- std::vector<ggml_fp16_t> data_f16;
- std::vector<float> data_f32;
-
- std::vector<int64_t> hist_all(1 << 4, 0);
-
- while (true) {
- int32_t n_dims;
- int32_t length;
- int32_t ftype;
-
- finp.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
- finp.read(reinterpret_cast<char *>(&length), sizeof(length));
- finp.read(reinterpret_cast<char *>(&ftype), sizeof(ftype));
-
- if (finp.eof()) {
- break;
- }
-
- int32_t nelements = 1;
- int32_t ne[3] = { 1, 1, 1 };
- for (int i = 0; i < n_dims; ++i) {
- finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
- nelements *= ne[i];
- }
-
- std::string name(length, 0);
- finp.read (&name[0], length);
-
- {
- static const char * ftype_str[] = { "f32", "f16", "q4_0", "q4_1", "q4_2", "q4_3" };
- printf("%48s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ftype_str[ftype]);
- }
-
- // regexes of tensor names to not be quantized
- const std::vector<std::string> k_names = {
- //"encoder.*",
- "encoder.conv1.bias",
- "encoder.conv2.bias",
- "encoder.positional_embedding",
- "decoder.positional_embedding",
- };
-
- bool quantize = true;
- for (const auto & s : k_names) {
- if (std::regex_match(name, std::regex(s))) {
- quantize = false;
- break;
- }
- }
-
- // quantize only 2D and 3D tensors
- quantize &= (n_dims == 2);
-
- if (quantize) {
- if (ftype != 0 && ftype != 1) {
- fprintf(stderr, "%s: unsupported ftype %d for integer quantization\n", __func__, ftype);
- return false;
- }
-
- if (ftype == 1) {
- data_f16.resize(nelements);
- finp.read(reinterpret_cast<char *>(data_f16.data()), nelements * sizeof(ggml_fp16_t));
- data_f32.resize(nelements);
- for (int i = 0; i < nelements; ++i) {
- data_f32[i] = ggml_fp16_to_fp32(data_f16[i]);
- }
- } else {
- data_f32.resize(nelements);
- finp.read(reinterpret_cast<char *>(data_f32.data()), nelements * sizeof(float));
- }
-
- ftype = itype;
- } else {
- const int bpe = (ftype == 0) ? sizeof(float) : sizeof(uint16_t);
-
- data_u8.resize(nelements*bpe);
- finp.read(reinterpret_cast<char *>(data_u8.data()), nelements * bpe);
- }
-
- fout.write(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
- fout.write(reinterpret_cast<char *>(&length), sizeof(length));
- fout.write(reinterpret_cast<char *>(&ftype), sizeof(ftype));
- for (int i = 0; i < n_dims; ++i) {
- fout.write(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
- }
- fout.write(&name[0], length);
-
- if (quantize) {
- printf("quantizing .. ");
- work.resize(nelements); // for quantization
-
- size_t cur_size = 0;
- std::vector<int64_t> hist_cur(1 << 4, 0);
-
- switch (type) {
- case GGML_TYPE_Q4_0:
- {
- cur_size = ggml_quantize_q4_0(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q4_1:
- {
- cur_size = ggml_quantize_q4_1(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q4_2:
- {
- cur_size = ggml_quantize_q4_2(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- case GGML_TYPE_Q4_3:
- {
- cur_size = ggml_quantize_q4_3(data_f32.data(), work.data(), nelements, ne[0], hist_cur.data());
- } break;
- default:
- {
- fprintf(stderr, "%s: unsupported quantization type %d\n", __func__, type);
- return false;
- }
- }
-
- fout.write(reinterpret_cast<char *>(work.data()), cur_size);
- total_size_new += cur_size;
-
- printf("size = %8.3f MB -> %8.3f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0);
- for (int i = 0; i < hist_cur.size(); ++i) {
- hist_all[i] += hist_cur[i];
- }
-
- for (int i = 0; i < hist_cur.size(); ++i) {
- printf("%5.3f ", hist_cur[i] / (float)nelements);
- }
- printf("\n");
- } else {
- printf("size = %8.3f MB\n", data_u8.size()/1024.0/1024.0);
- fout.write(reinterpret_cast<char *>(data_u8.data()), data_u8.size());
- total_size_new += data_u8.size();
- }
-
- total_size_org += nelements * sizeof(float);
- }
-
- printf("%s: model size = %8.2f MB\n", __func__, total_size_org/1024.0/1024.0);
- printf("%s: quant size = %8.2f MB\n", __func__, total_size_new/1024.0/1024.0);
-
- {
- int64_t sum_all = 0;
- for (int i = 0; i < hist_all.size(); ++i) {
- sum_all += hist_all[i];
- }
+ // regexes of tensor names not to be quantized
+ const std::vector<std::string> to_skip = {
+ //"encoder.*",
+ "encoder.conv1.bias",
+ "encoder.conv2.bias",
+ "encoder.positional_embedding",
+ "decoder.positional_embedding",
+ };
- printf("%s: hist: ", __func__);
- for (int i = 0; i < hist_all.size(); ++i) {
- printf("%5.3f ", hist_all[i] / (float)sum_all);
- }
- printf("\n");
- }
+ if (!ggml_common_quantize_0(finp, fout, mtype, { ".*" }, to_skip)) {
+ fprintf(stderr, "%s: failed to quantize model '%s'\n", __func__, fname_inp.c_str());
+ return false;
}
finp.close();
return true;
}
-// usage:
-// ./gpt-2-quantize models/gpt-2-117M/ggml-model.bin models/gpt-2-117M/ggml-model-quant.bin type
-//
int main(int argc, char ** argv) {
if (argc != 4) {
fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]);
const std::string fname_inp = argv[1];
const std::string fname_out = argv[2];
- const int itype = atoi(argv[3]);
+ const int mtype = atoi(argv[3]);
const int64_t t_main_start_us = ggml_time_us();
{
const int64_t t_start_us = ggml_time_us();
- if (!whisper_model_quantize(fname_inp, fname_out, itype)) {
+ if (!whisper_model_quantize(fname_inp, fname_out, ggml_mtype(mtype))) {
fprintf(stderr, "%s: failed to quantize model from '%s'\n", __func__, fname_inp.c_str());
return 1;
}