fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+ const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
printf("%s: ftype = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr = %d\n", __func__, qntvr);
+
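+ // the ftype read from disk also encodes the quantization version
+ // (in multiples of GGML_QNT_VERSION_FACTOR); keep only the low part
+ // so the rest of the loader sees a plain ggml_ftype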
+ hparams.ftype %= GGML_QNT_VERSION_FACTOR;
}
// load vocab
finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
- printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
- printf("%s: n_head = %d\n", __func__, hparams.n_head);
- printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
- printf("%s: ftype = %d\n", __func__, hparams.ftype);
+ const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+ const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+ printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
+ printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
+ printf("%s: n_head = %d\n", __func__, hparams.n_head);
+ printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+ printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+ printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+ printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
- fout.write((char *) &ftype, sizeof(hparams.ftype));
+ fout.write((char *) &ftype_dst, sizeof(ftype_dst));
}
// load vocab
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+ const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: ftype = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr = %d\n", __func__, qntvr);
+
+ hparams.ftype %= GGML_QNT_VERSION_FACTOR;
}
// load vocab
int32_t n_embd = 768;
int32_t n_head = 12;
int32_t n_layer = 12;
- int32_t f16 = 1;
+ int32_t ftype = 1;
};
// quantize a model
finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
- finp.read((char *) &hparams.f16, sizeof(hparams.f16));
+ finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
- printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
- printf("%s: n_head = %d\n", __func__, hparams.n_head);
- printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
- printf("%s: f16 = %d\n", __func__, hparams.f16);
+ const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+ const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+ printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
+ printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
+ printf("%s: n_head = %d\n", __func__, hparams.n_head);
+ printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+ printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+ printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+ printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
- fout.write((char *) &ftype, sizeof(hparams.f16));
+ fout.write((char *) &ftype_dst, sizeof(ftype_dst));
}
// load vocab
fin.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+ const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
printf("%s: ftype = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr = %d\n", __func__, qntvr);
+
+ hparams.ftype %= GGML_QNT_VERSION_FACTOR;
}
// load vocab
int32_t n_head = 16;
int32_t n_layer = 28;
int32_t n_rot = 64;
- int32_t f16 = 1;
+ int32_t ftype = 1;
};
// quantize a model
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
finp.read((char *) &hparams.n_rot, sizeof(hparams.n_rot));
- finp.read((char *) &hparams.f16, sizeof(hparams.f16));
+ finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
- printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
- printf("%s: n_head = %d\n", __func__, hparams.n_head);
- printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
- printf("%s: f16 = %d\n", __func__, hparams.f16);
+ const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+ const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+ printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
+ printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
+ printf("%s: n_head = %d\n", __func__, hparams.n_head);
+ printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+ printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+ printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+ printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
- fout.write((char *) &ftype, sizeof(hparams.f16));
+ fout.write((char *) &ftype_dst, sizeof(ftype_dst));
}
// load vocab
fin.read((char *) &hparams.par_res, sizeof(hparams.par_res));
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+ const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_rot = %d\n", __func__, hparams.n_rot);
printf("%s: par_res = %d\n", __func__, hparams.par_res);
printf("%s: ftype = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr = %d\n", __func__, qntvr);
+
+ hparams.ftype %= GGML_QNT_VERSION_FACTOR;
}
// load vocab
finp.read((char *) &hparams.par_res, sizeof(hparams.par_res));
finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
- printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
- printf("%s: n_head = %d\n", __func__, hparams.n_head);
- printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
- printf("%s: par_res = %d\n", __func__, hparams.par_res);
- printf("%s: ftype = %d\n", __func__, hparams.ftype);
+ const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+ const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+ printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
+ printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
+ printf("%s: n_head = %d\n", __func__, hparams.n_head);
+ printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+ printf("%s: par_res = %d\n", __func__, hparams.par_res);
+ printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+ printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+ printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fout.write((char *) &hparams.n_rot, sizeof(hparams.n_rot));
fout.write((char *) &hparams.par_res, sizeof(hparams.par_res));
- fout.write((char *) &ftype, sizeof(hparams.ftype));
+ fout.write((char *) &ftype_dst, sizeof(ftype_dst));
}
// load vocab
fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+ const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
printf("%s: n_head = %d\n", __func__, hparams.n_head);
printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
printf("%s: ftype = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr = %d\n", __func__, qntvr);
+
+ hparams.ftype %= GGML_QNT_VERSION_FACTOR;
}
// load vocab
int32_t n_embd = 2048;
int32_t n_head = 16;
int32_t n_layer = 24;
- int32_t f16 = 1;
+ int32_t ftype = 1;
};
// quantize a model
finp.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
finp.read((char *) &hparams.n_head, sizeof(hparams.n_head));
finp.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
- finp.read((char *) &hparams.f16, sizeof(hparams.f16));
+ finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
- printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
- printf("%s: n_head = %d\n", __func__, hparams.n_head);
- printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
- printf("%s: f16 = %d\n", __func__, hparams.f16);
+ const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+ const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+ printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
+ printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
+ printf("%s: n_head = %d\n", __func__, hparams.n_head);
+ printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+ printf("%s: ftype (src) = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr (src) = %d\n", __func__, qntvr_src);
+ printf("%s: ftype (dst) = %d\n", __func__, ftype_dst);
+ printf("%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
fout.write((char *) &hparams.n_embd, sizeof(hparams.n_embd));
fout.write((char *) &hparams.n_head, sizeof(hparams.n_head));
fout.write((char *) &hparams.n_layer, sizeof(hparams.n_layer));
- fout.write((char *) &ftype, sizeof(hparams.f16));
+ fout.write((char *) &ftype_dst, sizeof(ftype_dst));
}
// load vocab
int32_t n_text_head = 6;
int32_t n_text_layer = 4;
int32_t n_mels = 80;
- int32_t f16 = 1;
+ int32_t ftype = 1;
};
struct whisper_filters {
finp.read((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
finp.read((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
finp.read((char *) &hparams.n_mels, sizeof(hparams.n_mels));
- finp.read((char *) &hparams.f16, sizeof(hparams.f16));
+ finp.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+
+ const int32_t qntvr_src = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+ const int32_t ftype_dst = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;
+
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
- fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
+ fprintf(stderr, "%s: ftype (src) = %d\n", __func__, hparams.ftype);
+ fprintf(stderr, "%s: qntvr (src) = %d\n", __func__, qntvr_src);
+ fprintf(stderr, "%s: ftype (dst) = %d\n", __func__, ftype_dst);
+ fprintf(stderr, "%s: qntvr (dst) = %d\n", __func__, GGML_QNT_VERSION);
fout.write((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
fout.write((char *) &hparams.n_audio_ctx, sizeof(hparams.n_audio_ctx));
fout.write((char *) &hparams.n_text_head, sizeof(hparams.n_text_head));
fout.write((char *) &hparams.n_text_layer, sizeof(hparams.n_text_layer));
fout.write((char *) &hparams.n_mels, sizeof(hparams.n_mels));
- fout.write((char *) &ftype, sizeof(hparams.f16));
+ fout.write((char *) &ftype, sizeof(hparams.ftype));
}
// load mel filters
model.type = e_model::MODEL_LARGE;
}
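+ // the ftype read from the file also encodes the quantization version;
+ // extract it before reducing ftype to a plain ggml_ftype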
+ const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
+ hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+
// for the big tensors, we have the option to store the data in 16-bit floats or quantized
// in order to save memory and also to speed up the computation
wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
const size_t scale = model.hparams.ftype ? 1 : 2;
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
fprintf(stderr, "%s: ftype = %d\n", __func__, model.hparams.ftype);
+ fprintf(stderr, "%s: qntvr = %d\n", __func__, qntvr);
fprintf(stderr, "%s: type = %d\n", __func__, model.type);
// print memory requirements
#define GGML_FILE_MAGIC 0x67676d6c // "ggml"
#define GGML_FILE_VERSION 1
+#define GGML_QNT_VERSION 0 // bump this on quantization format changes
+#define GGML_QNT_VERSION_FACTOR 1000 // do not change this
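+//
+// the quantization version is stored in the ftype field of model files as:
+//
+//   ftype_file = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype
+//
+// and readers recover both values with:
+//
+//   qntvr = ftype_file / GGML_QNT_VERSION_FACTOR
+//   ftype = ftype_file % GGML_QNT_VERSION_FACTOR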
+
#define GGML_MAX_DIMS 4
#define GGML_MAX_NODES 4096
#define GGML_MAX_PARAMS 16
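A minimal stand-alone sketch of the encode/decode round-trip implied by these macros (illustrative only, not part of the patch; the local #defines mirror the values added to ggml.h above, and the example ftype value is arbitrary):

#include <assert.h>
#include <stdint.h>

#define GGML_QNT_VERSION        0    // mirrors ggml.h above
#define GGML_QNT_VERSION_FACTOR 1000 // mirrors ggml.h above

int main(void) {
    const int32_t ftype = 2; // example source ftype (must be < GGML_QNT_VERSION_FACTOR)

    // writer side: fold the quantization version into the stored ftype
    const int32_t ftype_file = GGML_QNT_VERSION * GGML_QNT_VERSION_FACTOR + ftype;

    // reader side: split the stored value back apart
    const int32_t qntvr = ftype_file / GGML_QNT_VERSION_FACTOR;
    assert(qntvr == GGML_QNT_VERSION);
    assert(ftype_file % GGML_QNT_VERSION_FACTOR == ftype);

    return 0;
}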