From: Georgi Gerganov
Date: Sat, 20 May 2023 15:01:40 +0000 (+0300)
Subject: examples : add quantize version to MPT and Replit examples (ref #168)
X-Git-Tag: upstream/0.0.1642~1455
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=c2fab8a3503b6e6fbf480be993f24c21951d3af0;p=pkg%2Fggml%2Fsources%2Fggml

examples : add quantize version to MPT and Replit examples (ref #168)
---

diff --git a/examples/mpt/main.cpp b/examples/mpt/main.cpp
index d6825310..2041d931 100644
--- a/examples/mpt/main.cpp
+++ b/examples/mpt/main.cpp
@@ -22,14 +22,14 @@ int n_ctx = 4096;
 
 // no defaults for now
 struct mpt_hparams {
-    int32_t d_model = 0;
-    int32_t max_seq_len = 0;
-    int32_t n_heads = 0;
-    int32_t n_layers = 0;
-    int32_t n_vocab = 0;
+    int32_t d_model      = 0;
+    int32_t max_seq_len  = 0;
+    int32_t n_heads      = 0;
+    int32_t n_layers     = 0;
+    int32_t n_vocab      = 0;
     float alibi_bias_max = 0;
-    float clip_qkv = 0;
-    int32_t ftype = 0;
+    float clip_qkv       = 0;
+    int32_t ftype        = 0;
 };
 
 struct mpt_layer {
@@ -88,14 +88,16 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
     {
         auto & hparams = model.hparams;
 
-        fin.read((char *)&hparams.d_model, sizeof(hparams.d_model));
-        fin.read((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
-        fin.read((char *)&hparams.n_heads, sizeof(hparams.n_heads));
-        fin.read((char *)&hparams.n_layers, sizeof(hparams.n_layers));
-        fin.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *)&hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
-        fin.read((char *)&hparams.clip_qkv, sizeof(hparams.clip_qkv));
-        fin.read((char *)&hparams.ftype, sizeof(hparams.ftype));
+        fin.read((char *) &hparams.d_model,        sizeof(hparams.d_model));
+        fin.read((char *) &hparams.max_seq_len,    sizeof(hparams.max_seq_len));
+        fin.read((char *) &hparams.n_heads,        sizeof(hparams.n_heads));
+        fin.read((char *) &hparams.n_layers,       sizeof(hparams.n_layers));
+        fin.read((char *) &hparams.n_vocab,        sizeof(hparams.n_vocab));
+        fin.read((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
+        fin.read((char *) &hparams.clip_qkv,       sizeof(hparams.clip_qkv));
+        fin.read((char *) &hparams.ftype,          sizeof(hparams.ftype));
+
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
 
         printf("%s: d_model        = %d\n", __func__, hparams.d_model);
         printf("%s: max_seq_len    = %d\n", __func__, hparams.max_seq_len);
@@ -105,6 +107,9 @@ bool mpt_model_load(const std::string & fname, mpt_model & model, gpt_vocab & vo
         printf("%s: alibi_bias_max = %f\n", __func__, hparams.alibi_bias_max);
         printf("%s: clip_qkv       = %f\n", __func__, hparams.clip_qkv);
         printf("%s: ftype          = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr          = %d\n", __func__, qntvr);
+
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
     }
 
     // load vocab
diff --git a/examples/mpt/quantize.cpp b/examples/mpt/quantize.cpp
index 8f32bdd4..95b83c36 100644
--- a/examples/mpt/quantize.cpp
+++ b/examples/mpt/quantize.cpp
@@ -14,14 +14,14 @@
 #include 
 
 struct mpt_hparams {
-    int32_t d_model = 0;
-    int32_t max_seq_len = 0;
-    int32_t n_heads = 0;
-    int32_t n_layers = 0;
-    int32_t n_vocab = 0;
+    int32_t d_model      = 0;
+    int32_t max_seq_len  = 0;
+    int32_t n_heads      = 0;
+    int32_t n_layers     = 0;
+    int32_t n_vocab      = 0;
     float alibi_bias_max = 0;
-    float clip_qkv = 0;
-    int32_t ftype = 0;
+    float clip_qkv       = 0;
+    int32_t ftype        = 0;
 };
 
 // quantize a model
@@ -61,32 +61,38 @@ bool mpt_model_quantize(const std::string & fname_inp,
 
     // load hparams
     {
-        finp.read((char *)&hparams.d_model, sizeof(hparams.d_model));
-        finp.read((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
-        finp.read((char *)&hparams.n_heads, sizeof(hparams.n_heads));
-        finp.read((char *)&hparams.n_layers, sizeof(hparams.n_layers));
-        finp.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
-        finp.read((char *)&hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
-        finp.read((char *)&hparams.clip_qkv, sizeof(hparams.clip_qkv));
-        finp.read((char *)&hparams.ftype, sizeof(hparams.ftype));
-
-        printf("%s: d_model = %d\n", __func__, hparams.d_model);
-        printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
-        printf("%s: n_heads = %d\n", __func__, hparams.n_heads);
-        printf("%s: n_layers = %d\n", __func__, hparams.n_layers);
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+        finp.read((char *) &hparams.d_model,        sizeof(hparams.d_model));
+        finp.read((char *) &hparams.max_seq_len,    sizeof(hparams.max_seq_len));
+        finp.read((char *) &hparams.n_heads,        sizeof(hparams.n_heads));
+        finp.read((char *) &hparams.n_layers,       sizeof(hparams.n_layers));
+        finp.read((char *) &hparams.n_vocab,        sizeof(hparams.n_vocab));
+        finp.read((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
+        finp.read((char *) &hparams.clip_qkv,       sizeof(hparams.clip_qkv));
+        finp.read((char *) &hparams.ftype,          sizeof(hparams.ftype));
+
+        const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+        printf("%s: d_model        = %d\n", __func__, hparams.d_model);
+        printf("%s: max_seq_len    = %d\n", __func__, hparams.max_seq_len);
+        printf("%s: n_heads        = %d\n", __func__, hparams.n_heads);
+        printf("%s: n_layers       = %d\n", __func__, hparams.n_layers);
+        printf("%s: n_vocab        = %d\n", __func__, hparams.n_vocab);
         printf("%s: alibi_bias_max = %f\n", __func__, hparams.alibi_bias_max);
-        printf("%s: clip_qkv = %f\n", __func__, hparams.clip_qkv);
-        printf("%s: ftype = %d\n", __func__, hparams.ftype);
-
-        fout.write((char *)&hparams.d_model, sizeof(hparams.d_model));
-        fout.write((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
-        fout.write((char *)&hparams.n_heads, sizeof(hparams.n_heads));
-        fout.write((char *)&hparams.n_layers, sizeof(hparams.n_layers));
-        fout.write((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
-        fout.write((char *)&hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
-        fout.write((char *)&hparams.clip_qkv, sizeof(hparams.clip_qkv));
-        fout.write((char *)&ftype, sizeof(hparams.ftype));
+        printf("%s: clip_qkv       = %f\n", __func__, hparams.clip_qkv);
+        printf("%s: ftype (src)    = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr (src)    = %d\n", __func__, qntvr_src);
+        printf("%s: ftype (dst)    = %d\n", __func__, ftype_dst);
+        printf("%s: qntvr (dst)    = %d\n", __func__, GGML_QNT_VERSION);
+
+        fout.write((char *) &hparams.d_model,        sizeof(hparams.d_model));
+        fout.write((char *) &hparams.max_seq_len,    sizeof(hparams.max_seq_len));
+        fout.write((char *) &hparams.n_heads,        sizeof(hparams.n_heads));
+        fout.write((char *) &hparams.n_layers,       sizeof(hparams.n_layers));
+        fout.write((char *) &hparams.n_vocab,        sizeof(hparams.n_vocab));
+        fout.write((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
+        fout.write((char *) &hparams.clip_qkv,       sizeof(hparams.clip_qkv));
+        fout.write((char *) &ftype_dst,              sizeof(ftype_dst));
     }
 
     // load vocab
diff --git a/examples/replit/main.cpp b/examples/replit/main.cpp
index 2f056c23..cd678a49 100644
--- a/examples/replit/main.cpp
+++ b/examples/replit/main.cpp
@@ -129,12 +129,12 @@ std::string replit_tokenizer_detokenize(replit_tokenizer & tokenizer, const std:
 
 // no defaults for now
 struct mpt_hparams {
-    int32_t d_model = 0;
+    int32_t d_model     = 0;
     int32_t max_seq_len = 0;
-    int32_t n_heads = 0;
-    int32_t n_layers = 0;
-    int32_t n_vocab = 0;
-    int32_t ftype = 0;
+    int32_t n_heads     = 0;
+    int32_t n_layers    = 0;
+    int32_t n_vocab     = 0;
+    int32_t ftype       = 0;
 };
 
 struct replit_layer {
@@ -195,19 +195,24 @@ bool replit_model_load(const std::string & fname, replit_model & model, replit_t
     {
         auto & hparams = model.hparams;
 
-        fin.read((char *)&hparams.d_model, sizeof(hparams.d_model));
-        fin.read((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
-        fin.read((char *)&hparams.n_heads, sizeof(hparams.n_heads));
-        fin.read((char *)&hparams.n_layers, sizeof(hparams.n_layers));
-        fin.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
-        fin.read((char *)&hparams.ftype, sizeof(hparams.ftype));
-
-        printf("%s: d_model = %d\n", __func__, hparams.d_model);
-        printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
-        printf("%s: n_heads = %d\n", __func__, hparams.n_heads);
-        printf("%s: n_layers = %d\n", __func__, hparams.n_layers);
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: ftype = %d\n", __func__, hparams.ftype);
+        fin.read((char *) &hparams.d_model,     sizeof(hparams.d_model));
+        fin.read((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
+        fin.read((char *) &hparams.n_heads,     sizeof(hparams.n_heads));
+        fin.read((char *) &hparams.n_layers,    sizeof(hparams.n_layers));
+        fin.read((char *) &hparams.n_vocab,     sizeof(hparams.n_vocab));
+        fin.read((char *) &hparams.ftype,       sizeof(hparams.ftype));
+
+        const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
+        printf("%s: d_model     = %d\n", __func__, hparams.d_model);
+        printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
+        printf("%s: n_heads     = %d\n", __func__, hparams.n_heads);
+        printf("%s: n_layers    = %d\n", __func__, hparams.n_layers);
+        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
+        printf("%s: ftype       = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr       = %d\n", __func__, qntvr);
+
+        hparams.ftype %= GGML_QNT_VERSION_FACTOR;
     }
 
     // load vocab
diff --git a/examples/replit/quantize.cpp b/examples/replit/quantize.cpp
index 40a58060..9a4ec433 100644
--- a/examples/replit/quantize.cpp
+++ b/examples/replit/quantize.cpp
@@ -14,12 +14,12 @@
 #include 
 
 struct mpt_hparams {
-    int32_t d_model = 0;
+    int32_t d_model     = 0;
     int32_t max_seq_len = 0;
-    int32_t n_heads = 0;
-    int32_t n_layers = 0;
-    int32_t n_vocab = 0;
-    int32_t ftype = 0;
+    int32_t n_heads     = 0;
+    int32_t n_layers    = 0;
+    int32_t n_vocab     = 0;
+    int32_t ftype       = 0;
 };
 
 // quantize a model
@@ -59,26 +59,32 @@ bool mpt_model_quantize(const std::string & fname_inp,
 
     // load hparams
    {
-        finp.read((char *)&hparams.d_model, sizeof(hparams.d_model));
-        finp.read((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
-        finp.read((char *)&hparams.n_heads, sizeof(hparams.n_heads));
-        finp.read((char *)&hparams.n_layers, sizeof(hparams.n_layers));
-        finp.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
-        finp.read((char *)&hparams.ftype, sizeof(hparams.ftype));
-
-        printf("%s: d_model = %d\n", __func__, hparams.d_model);
+        finp.read((char *) &hparams.d_model,     sizeof(hparams.d_model));
+        finp.read((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
+        finp.read((char *) &hparams.n_heads,     sizeof(hparams.n_heads));
+        finp.read((char *) &hparams.n_layers,    sizeof(hparams.n_layers));
+        finp.read((char *) &hparams.n_vocab,     sizeof(hparams.n_vocab));
+        finp.read((char *) &hparams.ftype,       sizeof(hparams.ftype));
+
+        const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+        const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+        printf("%s: d_model     = %d\n", __func__, hparams.d_model);
         printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
-        printf("%s: n_heads = %d\n", __func__, hparams.n_heads);
-        printf("%s: n_layers = %d\n", __func__, hparams.n_layers);
-        printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
-        printf("%s: ftype = %d\n", __func__, hparams.ftype);
-
-        fout.write((char *)&hparams.d_model, sizeof(hparams.d_model));
-        fout.write((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
-        fout.write((char *)&hparams.n_heads, sizeof(hparams.n_heads));
-        fout.write((char *)&hparams.n_layers, sizeof(hparams.n_layers));
-        fout.write((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
-        fout.write((char *)&ftype, sizeof(hparams.ftype));
+        printf("%s: n_heads     = %d\n", __func__, hparams.n_heads);
+        printf("%s: n_layers    = %d\n", __func__, hparams.n_layers);
+        printf("%s: n_vocab     = %d\n", __func__, hparams.n_vocab);
+        printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+        printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+        printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+        printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
+
+        fout.write((char *) &hparams.d_model,     sizeof(hparams.d_model));
+        fout.write((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
+        fout.write((char *) &hparams.n_heads,     sizeof(hparams.n_heads));
+        fout.write((char *) &hparams.n_layers,    sizeof(hparams.n_layers));
+        fout.write((char *) &hparams.n_vocab,     sizeof(hparams.n_vocab));
+        fout.write((char *) &ftype_dst,           sizeof(ftype_dst));
     }
 
     // load vocab
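
Note on the scheme used by these hunks: rather than adding a new header field, the quantize version is packed into the existing ftype value. The quantize tools write GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype, and the loaders recover the version with an integer division and the plain ftype with a modulo; a file written before this scheme has ftype < GGML_QNT_VERSION_FACTOR, so its qntvr comes out as 0. Below is a minimal standalone sketch of that round trip; the constant and ftype values are assumptions chosen for illustration (in the real code the constants come from ggml.h) and are not part of this commit.

// Sketch of the ftype/qntvr packing used by the hunks above.
// GGML_QNT_VERSION and GGML_QNT_VERSION_FACTOR normally come from ggml.h;
// they are redefined here (with assumed values) only so the example is self-contained.
#include <cstdint>
#include <cstdio>

#define GGML_QNT_VERSION        2    // assumed current quantization format version
#define GGML_QNT_VERSION_FACTOR 1000 // assumed packing factor

int main() {
    const int32_t ftype = 3; // illustrative GGML_FTYPE_* value chosen by the quantizer

    // quantize.cpp side: fold the quantization version into the ftype written to disk
    const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;

    // main.cpp side: split the stored value back into version and plain ftype
    const int32_t qntvr     = ftype_dst / GGML_QNT_VERSION_FACTOR;
    const int32_t ftype_src = ftype_dst % GGML_QNT_VERSION_FACTOR;

    printf("stored = %d, qntvr = %d, ftype = %d\n", ftype_dst, qntvr, ftype_src);
    return 0;
}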