// no defaults for now
struct mpt_hparams {
- int32_t d_model = 0;
- int32_t max_seq_len = 0;
- int32_t n_heads = 0;
- int32_t n_layers = 0;
- int32_t n_vocab = 0;
+ int32_t d_model      = 0;
+ int32_t max_seq_len  = 0;
+ int32_t n_heads      = 0;
+ int32_t n_layers     = 0;
+ int32_t n_vocab      = 0;
float alibi_bias_max = 0;
- float clip_qkv = 0;
- int32_t ftype = 0;
+ float clip_qkv       = 0;
+ int32_t ftype        = 0;
};
struct mpt_layer {
{
auto & hparams = model.hparams;
- fin.read((char *)&hparams.d_model, sizeof(hparams.d_model));
- fin.read((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
- fin.read((char *)&hparams.n_heads, sizeof(hparams.n_heads));
- fin.read((char *)&hparams.n_layers, sizeof(hparams.n_layers));
- fin.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
- fin.read((char *)&hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
- fin.read((char *)&hparams.clip_qkv, sizeof(hparams.clip_qkv));
- fin.read((char *)&hparams.ftype, sizeof(hparams.ftype));
+ fin.read((char *) &hparams.d_model, sizeof(hparams.d_model));
+ fin.read((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
+ fin.read((char *) &hparams.n_heads, sizeof(hparams.n_heads));
+ fin.read((char *) &hparams.n_layers, sizeof(hparams.n_layers));
+ fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+ fin.read((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
+ fin.read((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv));
+ fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+
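+ // the on-disk ftype packs the quantization format version into its upper part;
+ // dividing by GGML_QNT_VERSION_FACTOR (defined in ggml.h) recovers the version
+ // the file was written with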
+ const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
printf("%s: d_model = %d\n", __func__, hparams.d_model);
printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
printf("%s: alibi_bias_max = %f\n", __func__, hparams.alibi_bias_max);
printf("%s: clip_qkv = %f\n", __func__, hparams.clip_qkv);
printf("%s: ftype = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr = %d\n", __func__, qntvr);
+
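+ // drop the version part so only the plain weight format remains for the rest of the loader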
+ hparams.ftype %= GGML_QNT_VERSION_FACTOR;
}
// load vocab
#include <vector>
struct mpt_hparams {
- int32_t d_model = 0;
- int32_t max_seq_len = 0;
- int32_t n_heads = 0;
- int32_t n_layers = 0;
- int32_t n_vocab = 0;
+ int32_t d_model      = 0;
+ int32_t max_seq_len  = 0;
+ int32_t n_heads      = 0;
+ int32_t n_layers     = 0;
+ int32_t n_vocab      = 0;
float alibi_bias_max = 0;
- float clip_qkv = 0;
- int32_t ftype = 0;
+ float clip_qkv       = 0;
+ int32_t ftype        = 0;
};
// quantize a model
// load hparams
{
- finp.read((char *)&hparams.d_model, sizeof(hparams.d_model));
- finp.read((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
- finp.read((char *)&hparams.n_heads, sizeof(hparams.n_heads));
- finp.read((char *)&hparams.n_layers, sizeof(hparams.n_layers));
- finp.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
- finp.read((char *)&hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
- finp.read((char *)&hparams.clip_qkv, sizeof(hparams.clip_qkv));
- finp.read((char *)&hparams.ftype, sizeof(hparams.ftype));
-
- printf("%s: d_model = %d\n", __func__, hparams.d_model);
- printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
- printf("%s: n_heads = %d\n", __func__, hparams.n_heads);
- printf("%s: n_layers = %d\n", __func__, hparams.n_layers);
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+ finp.read((char *) &hparams.d_model, sizeof(hparams.d_model));
+ finp.read((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
+ finp.read((char *) &hparams.n_heads, sizeof(hparams.n_heads));
+ finp.read((char *) &hparams.n_layers, sizeof(hparams.n_layers));
+ finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+ finp.read((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
+ finp.read((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv));
+ finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+
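+ // qntvr_src: quantization format version the input file was written with;
+ // ftype_dst: the requested ftype packed together with the current GGML_QNT_VERSION,
+ // so the output header records which format produced it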
+ const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+ const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+ printf("%s: d_model = %d\n", __func__, hparams.d_model);
+ printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
+ printf("%s: n_heads = %d\n", __func__, hparams.n_heads);
+ printf("%s: n_layers = %d\n", __func__, hparams.n_layers);
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: alibi_bias_max = %f\n", __func__, hparams.alibi_bias_max);
- printf("%s: clip_qkv = %f\n", __func__, hparams.clip_qkv);
- printf("%s: ftype = %d\n", __func__, hparams.ftype);
-
- fout.write((char *)&hparams.d_model, sizeof(hparams.d_model));
- fout.write((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
- fout.write((char *)&hparams.n_heads, sizeof(hparams.n_heads));
- fout.write((char *)&hparams.n_layers, sizeof(hparams.n_layers));
- fout.write((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
- fout.write((char *)&hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
- fout.write((char *)&hparams.clip_qkv, sizeof(hparams.clip_qkv));
- fout.write((char *)&ftype, sizeof(hparams.ftype));
+ printf("%s: clip_qkv = %f\n", __func__, hparams.clip_qkv);
+ printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+ printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+ printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
+
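+ // the output header stores ftype_dst (version + ftype) in place of the raw ftype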
+ fout.write((char *) &hparams.d_model, sizeof(hparams.d_model));
+ fout.write((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
+ fout.write((char *) &hparams.n_heads, sizeof(hparams.n_heads));
+ fout.write((char *) &hparams.n_layers, sizeof(hparams.n_layers));
+ fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+ fout.write((char *) &hparams.alibi_bias_max, sizeof(hparams.alibi_bias_max));
+ fout.write((char *) &hparams.clip_qkv, sizeof(hparams.clip_qkv));
+ fout.write((char *) &ftype_dst, sizeof(ftype_dst));
}
// load vocab
// no defaults for now
struct mpt_hparams {
- int32_t d_model = 0;
+ int32_t d_model     = 0;
int32_t max_seq_len = 0;
- int32_t n_heads = 0;
- int32_t n_layers = 0;
- int32_t n_vocab = 0;
- int32_t ftype = 0;
+ int32_t n_heads     = 0;
+ int32_t n_layers    = 0;
+ int32_t n_vocab     = 0;
+ int32_t ftype       = 0;
};
struct replit_layer {
{
auto & hparams = model.hparams;
- fin.read((char *)&hparams.d_model, sizeof(hparams.d_model));
- fin.read((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
- fin.read((char *)&hparams.n_heads, sizeof(hparams.n_heads));
- fin.read((char *)&hparams.n_layers, sizeof(hparams.n_layers));
- fin.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
- fin.read((char *)&hparams.ftype, sizeof(hparams.ftype));
-
- printf("%s: d_model = %d\n", __func__, hparams.d_model);
- printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
- printf("%s: n_heads = %d\n", __func__, hparams.n_heads);
- printf("%s: n_layers = %d\n", __func__, hparams.n_layers);
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
- printf("%s: ftype = %d\n", __func__, hparams.ftype);
+ fin.read((char *) &hparams.d_model, sizeof(hparams.d_model));
+ fin.read((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
+ fin.read((char *) &hparams.n_heads, sizeof(hparams.n_heads));
+ fin.read((char *) &hparams.n_layers, sizeof(hparams.n_layers));
+ fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+ fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+
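+ // as in the mpt loader: recover the quantization format version packed into ftype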
+ const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
+ printf("%s: d_model = %d\n", __func__, hparams.d_model);
+ printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
+ printf("%s: n_heads = %d\n", __func__, hparams.n_heads);
+ printf("%s: n_layers = %d\n", __func__, hparams.n_layers);
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+ printf("%s: ftype = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr = %d\n", __func__, qntvr);
+
+ hparams.ftype %= GGML_QNT_VERSION_FACTOR;
}
// load vocab
#include <vector>
struct mpt_hparams {
- int32_t d_model = 0;
+ int32_t d_model = 0;
int32_t max_seq_len = 0;
- int32_t n_heads = 0;
- int32_t n_layers = 0;
- int32_t n_vocab = 0;
- int32_t ftype = 0;
+ int32_t n_heads = 0;
+ int32_t n_layers = 0;
+ int32_t n_vocab = 0;
+ int32_t ftype = 0;
};
// quantize a model
// load hparams
{
- finp.read((char *)&hparams.d_model, sizeof(hparams.d_model));
- finp.read((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
- finp.read((char *)&hparams.n_heads, sizeof(hparams.n_heads));
- finp.read((char *)&hparams.n_layers, sizeof(hparams.n_layers));
- finp.read((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
- finp.read((char *)&hparams.ftype, sizeof(hparams.ftype));
-
- printf("%s: d_model = %d\n", __func__, hparams.d_model);
+ finp.read((char *) &hparams.d_model, sizeof(hparams.d_model));
+ finp.read((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
+ finp.read((char *) &hparams.n_heads, sizeof(hparams.n_heads));
+ finp.read((char *) &hparams.n_layers, sizeof(hparams.n_layers));
+ finp.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+ finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+
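+ // as in the mpt quantizer: decode the source version and pack the current one into ftype_dst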
+ const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+ const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+ printf("%s: d_model = %d\n", __func__, hparams.d_model);
printf("%s: max_seq_len = %d\n", __func__, hparams.max_seq_len);
- printf("%s: n_heads = %d\n", __func__, hparams.n_heads);
- printf("%s: n_layers = %d\n", __func__, hparams.n_layers);
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
- printf("%s: ftype = %d\n", __func__, hparams.ftype);
-
- fout.write((char *)&hparams.d_model, sizeof(hparams.d_model));
- fout.write((char *)&hparams.max_seq_len, sizeof(hparams.max_seq_len));
- fout.write((char *)&hparams.n_heads, sizeof(hparams.n_heads));
- fout.write((char *)&hparams.n_layers, sizeof(hparams.n_layers));
- fout.write((char *)&hparams.n_vocab, sizeof(hparams.n_vocab));
- fout.write((char *)&ftype, sizeof(hparams.ftype));
+ printf("%s: n_heads = %d\n", __func__, hparams.n_heads);
+ printf("%s: n_layers = %d\n", __func__, hparams.n_layers);
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+ printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+ printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+ printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
+
+ fout.write((char *) &hparams.d_model, sizeof(hparams.d_model));
+ fout.write((char *) &hparams.max_seq_len, sizeof(hparams.max_seq_len));
+ fout.write((char *) &hparams.n_heads, sizeof(hparams.n_heads));
+ fout.write((char *) &hparams.n_layers, sizeof(hparams.n_layers));
+ fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+ fout.write((char *) &ftype_dst, sizeof(ftype_dst));
}
// load vocab