};
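+// read a single raw value of type T through the loader callbacks; note that
+// no error checking happens here - end-of-data is detected via loader->eof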
template<typename T>
-static void read_safe(std::ifstream& fin, T& dest) {
- fin.read((char*)& dest, sizeof(T));
+static void read_safe(whisper_model_loader * loader, T & dest) {
+ loader->read(loader->context, &dest, sizeof(T));
}
-// load the model from a ggml file
+// load the model using the supplied whisper_model_loader (file, buffer, or user-defined)
//
// see the convert-pt-to-ggml.py script for details
//
-static bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
- fprintf(stderr, "%s: loading model from '%s'\n", __func__, fname.c_str());
+static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
+ fprintf(stderr, "%s: loading model\n", __func__);
auto & model = wctx.model;
auto & vocab = wctx.vocab;
- auto fin = std::ifstream(fname, std::ios::binary);
- if (!fin) {
- fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
- return false;
- }
-
// verify magic
{
uint32_t magic;
- read_safe(fin, magic);
+ read_safe(loader, magic);
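+ // the expected magic 0x67676d6c is the ASCII string "ggml"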
if (magic != 0x67676d6c) {
- fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
+ fprintf(stderr, "%s: invalid model data (bad magic)\n", __func__);
return false;
}
}
{
auto & hparams = model.hparams;
- read_safe(fin, hparams.n_vocab);
- read_safe(fin, hparams.n_audio_ctx);
- read_safe(fin, hparams.n_audio_state);
- read_safe(fin, hparams.n_audio_head);
- read_safe(fin, hparams.n_audio_layer);
- read_safe(fin, hparams.n_text_ctx);
- read_safe(fin, hparams.n_text_state);
- read_safe(fin, hparams.n_text_head);
- read_safe(fin, hparams.n_text_layer);
- read_safe(fin, hparams.n_mels);
- read_safe(fin, hparams.f16);
+ read_safe(loader, hparams.n_vocab);
+ read_safe(loader, hparams.n_audio_ctx);
+ read_safe(loader, hparams.n_audio_state);
+ read_safe(loader, hparams.n_audio_head);
+ read_safe(loader, hparams.n_audio_layer);
+ read_safe(loader, hparams.n_text_ctx);
+ read_safe(loader, hparams.n_text_state);
+ read_safe(loader, hparams.n_text_head);
+ read_safe(loader, hparams.n_text_layer);
+ read_safe(loader, hparams.n_mels);
+ read_safe(loader, hparams.f16);
assert(hparams.n_text_state == hparams.n_audio_state);
{
auto & filters = wctx.model.filters;
- read_safe(fin, filters.n_mel);
- read_safe(fin, filters.n_fft);
+ read_safe(loader, filters.n_mel);
+ read_safe(loader, filters.n_fft);
filters.data.resize(filters.n_mel * filters.n_fft);
- fin.read((char *) filters.data.data(), filters.data.size() * sizeof(float));
+ loader->read(loader->context, filters.data.data(), filters.data.size() * sizeof(float));
}
// load vocab
{
int32_t n_vocab = 0;
- read_safe(fin, n_vocab);
+ read_safe(loader, n_vocab);
//if (n_vocab != model.hparams.n_vocab) {
// fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
for (int i = 0; i < n_vocab; i++) {
uint32_t len;
- read_safe(fin, len);
+ read_safe(loader, len);
if (len > 0) {
tmp.resize(len);
- fin.read(&tmp[0], tmp.size()); // read to buffer
+ loader->read(loader->context, &tmp[0], tmp.size()); // read to buffer
word.assign(&tmp[0], tmp.size());
} else {
// seems like we have an empty-string token in multi-language models (i = 50256)
int32_t length;
int32_t ftype;
- read_safe(fin, n_dims);
- read_safe(fin, length);
- read_safe(fin, ftype);
+ read_safe(loader, n_dims);
+ read_safe(loader, length);
+ read_safe(loader, ftype);
- if (fin.eof()) {
+ if (loader->eof(loader->context)) {
break;
}
int32_t nelements = 1;
int32_t ne[3] = { 1, 1, 1 };
for (int i = 0; i < n_dims; ++i) {
- read_safe(fin, ne[i]);
+ read_safe(loader, ne[i]);
nelements *= ne[i];
}
std::string name;
std::vector<char> tmp(length); // create a buffer
- fin.read(&tmp[0], tmp.size()); // read to buffer
+ loader->read(loader->context, &tmp[0], tmp.size()); // read to buffer
name.assign(&tmp[0], tmp.size());
if (model.tensors.find(name) == model.tensors.end()) {
return false;
}
- fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+ loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
//printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
total_size += ggml_nbytes(tensor);
}
}
- fin.close();
-
return true;
}
}
ggml_graph_compute(ctx0, &gf);
+ //ggml_graph_print(&gf);
}
////////////////////////////////////////////////////////////////////////////
// interface implementation
//
-struct whisper_context * whisper_init(const char * path_model) {
+struct whisper_context * whisper_init_from_file(const char * path_model) {
+ whisper_model_loader loader = {};
+
+ fprintf(stderr, "%s: loading model from '%s'\n", __func__, path_model);
+
+ auto fin = std::ifstream(path_model, std::ios::binary);
+ if (!fin) {
+ fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_model);
+ return nullptr;
+ }
+
+ loader.context = &fin;
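+ // the callbacks below forward to the ifstream; note that read reports
+ // read_size even on a short read - the eof callback is what terminates
+ // the tensor loading loop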
+ loader.read = [](void * ctx, void * output, size_t read_size) {
+ std::ifstream * fin = (std::ifstream*)ctx;
+ fin->read((char *)output, read_size);
+ return read_size;
+ };
+
+ loader.eof = [](void * ctx) {
+ std::ifstream * fin = (std::ifstream*)ctx;
+ return fin->eof();
+ };
+
+ loader.close = [](void * ctx) {
+ std::ifstream * fin = (std::ifstream*)ctx;
+ fin->close();
+ };
+
+ return whisper_init(&loader);
+}
+
+struct whisper_context * whisper_init_from_buffer(void * buffer, size_t buffer_size) {
+ struct buf_context {
+ uint8_t* buffer;
+ size_t size;
+ size_t current_offset;
+ };
+
+ buf_context ctx = { reinterpret_cast<uint8_t*>(buffer), buffer_size, 0 };
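+ // ctx lives on this stack frame, which is fine: whisper_init() consumes
+ // the loader synchronously and does not keep it after returning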
+ whisper_model_loader loader = {};
+
+ fprintf(stderr, "%s: loading model from buffer\n", __func__);
+
+ loader.context = &ctx;
+
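+ // serve reads straight from the caller's buffer, clamping each request
+ // to the bytes that remain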
+ loader.read = [](void * ctx, void * output, size_t read_size) {
+ buf_context * buf = reinterpret_cast<buf_context *>(ctx);
+
+ size_t size_to_copy = buf->current_offset + read_size < buf->size ? read_size : buf->size - buf->current_offset;
+
+ memcpy(output, buf->buffer + buf->current_offset, size_to_copy);
+ buf->current_offset += size_to_copy;
+
+ return size_to_copy;
+ };
+
+ loader.eof = [](void * ctx) {
+ buf_context * buf = reinterpret_cast<buf_context *>(ctx);
+
+ return buf->current_offset >= buf->size;
+ };
+
+ loader.close = [](void * /*ctx*/) { };
+
+ return whisper_init(&loader);
+}
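+
+// hypothetical usage sketch - read_file() is not part of the API, it stands
+// in for whatever code brings the ggml model into memory:
+//
+//   std::vector<uint8_t> buf = read_file("ggml-base.en.bin");
+//   struct whisper_context * ctx = whisper_init_from_buffer(buf.data(), buf.size());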
+
+struct whisper_context * whisper_init(struct whisper_model_loader * loader) {
ggml_time_init();
whisper_context * ctx = new whisper_context;
ctx->t_start_us = t_start_us;
- if (!whisper_model_load(path_model, *ctx)) {
- fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, path_model);
+ if (!whisper_model_load(loader, *ctx)) {
+ loader->close(loader->context);
+ fprintf(stderr, "%s: failed to load model\n", __func__);
delete ctx;
return nullptr;
}
ctx->t_load_us = ggml_time_us() - t_start_us;
+ loader->close(loader->context);
+
return ctx;
}
}
static int64_t sample_to_timestamp(int i_sample) {
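+ // promote to 64-bit before multiplying: 100*i_sample overflows int32 after
+ // roughly 22 minutes of audio at 16 kHz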
- return (100*i_sample)/WHISPER_SAMPLE_RATE;
+ return (100ll*i_sample)/WHISPER_SAMPLE_RATE;
}
// a cost-function / heuristic that is high for text that takes longer to pronounce
#define GGML_GELU_FP16
#define GGML_SOFT_MAX_UNROLL 4
-#define GGML_VEC_DOT_UNROLL 4
+#define GGML_VEC_DOT_UNROLL 2
#ifdef GGML_USE_ACCELERATE
// uncomment to use vDSP for soft max computation
inline static void ggml_vec_dot_f16_unroll(const int n, const int xs, float * restrict s, void * restrict xv, ggml_fp16_t * restrict y) {
ggml_float sumf[GGML_VEC_DOT_UNROLL] = { 0.0 };
- const ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL] = { xv };
+ ggml_fp16_t * restrict x[GGML_VEC_DOT_UNROLL];
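+ // set up all GGML_VEC_DOT_UNROLL row pointers, each xs bytes apart in xv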
- for (int i = 1; i < GGML_VEC_DOT_UNROLL; ++i) {
+ for (int i = 0; i < GGML_VEC_DOT_UNROLL; ++i) {
x[i] = (ggml_fp16_t *) ((char *) xv + i*xs);
}
ggml_float sum = 0.0;
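+ // keep the running total in a local and store it to *s once after the loop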
for (int i = 0; i < n; ++i) {
sum += x[i];
- *s += sum;
}
+ *s = sum;
#else
vDSP_sve(x, 1, s, n);
#endif
assert(ggml_is_scalar(dst));
assert(src0->nb[0] == sizeof(float));
- *(float *) (dst->data) = 0.0f;
-
const int ne00 = src0->ne[0];
const int ne01 = src0->ne[1];
const int ne02 = src0->ne[2];
for (int i03 = 0; i03 < ne03; i03++) {
for (int i02 = 0; i02 < ne02; i02++) {
for (int i01 = 0; i01 < ne01; i01++) {
- *(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3) = 0.0f;
-
ggml_vec_sum_f32(ne00,
(float *) ((char *) dst->data + i01*nb1 + i02*nb2 + i03*nb3),
(float *) ((char *) src0->data + i01*nb01 + i02*nb02 + i03*nb03));
}
}
} else {
- // parallelize by src1 columns using ggml_vec_mad_f32
+ // parallelize by src1 columns using ggml_vec_mad_f16
// each thread has its own work data
// during FINALIZE we accumulate all work data into dst
S[i] = -INFINITY;
}
- // looks like unrolling here does not help
-#if 1
- for (int ic = 0; ic < nek1; ++ic) {
- // k indices
- const int ik3 = iq3;
- const int ik2 = iq2;
- const int ik1 = ic;
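+ // use the unrolled kernel only when the unroll width is supported (<= 2)
+ // and evenly divides nek1; otherwise fall back to the plain dot product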
+ if (GGML_VEC_DOT_UNROLL > 2 || nek1 % GGML_VEC_DOT_UNROLL != 0) {
+ for (int ic = 0; ic < nek1; ++ic) {
+ // k indices
+ const int ik3 = iq3;
+ const int ik2 = iq2;
+ const int ik1 = ic;
- // S indices
- const int i1 = ik1;
-
- ggml_vec_dot_f16(neq0,
- S + i1,
- (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
- (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
- }
-#else
- GGML_ASSERT(nek1 % GGML_VEC_DOT_UNROLL == 0);
-
- for (int ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
- // k indices
- const int ik3 = iq3;
- const int ik2 = iq2;
- const int ik1 = ic;
+ // S indices
+ const int i1 = ik1;
- // S indices
- const int i1 = ik1;
-
- ggml_vec_dot_f16_unroll(neq0, nbk1,
- S + i1,
- ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
- (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+ ggml_vec_dot_f16(neq0,
+ S + i1,
+ (ggml_fp16_t *) ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
+ (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+ }
+ } else {
+ for (int ic = 0; ic < nek1; ic += GGML_VEC_DOT_UNROLL) {
+ // k indices
+ const int ik3 = iq3;
+ const int ik2 = iq2;
+ const int ik1 = ic;
+
+ // S indices
+ const int i1 = ik1;
+
+ ggml_vec_dot_f16_unroll(neq0, nbk1,
+ S + i1,
+ ((char *) k->data + (ik1*nbk1 + ik2*nbk2 + ik3*nbk3)),
+ (ggml_fp16_t *) ((char *) q->data + (iq1*nbq1 + iq2*nbq2 + iq3*nbq3)));
+ }
}
-#endif
// scale
ggml_vec_scale_f32(nek1, S, scale);
S16[i] = GGML_FP32_TO_FP16(S[i]);
}
- GGML_ASSERT(nev1 % GGML_VEC_DOT_UNROLL == 0);
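+ // same runtime fallback when reducing V against softmax(S), replacing the
+ // old hard assert on the unroll width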
+ if (GGML_VEC_DOT_UNROLL == 1 || (nev1 % GGML_VEC_DOT_UNROLL != 0)) {
+ for (int ic = 0; ic < nev1; ++ic) {
+ // dst indices
+ const int i1 = iq1;
+ const int i2 = iq2;
+ const int i3 = iq3;
- for (int ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
- // dst indices
- const int i1 = iq1;
- const int i2 = iq2;
- const int i3 = iq3;
+ ggml_vec_dot_f16(nek1,
+ (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
+ (ggml_fp16_t *) ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)),
+ S16);
+ }
+ } else {
+ for (int ic = 0; ic < nev1; ic += GGML_VEC_DOT_UNROLL) {
+ // dst indices
+ const int i1 = iq1;
+ const int i2 = iq2;
+ const int i3 = iq3;
- ggml_vec_dot_f16_unroll(nek1, nbv1,
- (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
- ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)),
- S16);
+ ggml_vec_dot_f16_unroll(nek1, nbv1,
+ (float *) ((char *) dst->data + (ic*nb0 + i1*nb1 + i2*nb2 + i3*nb3)),
+ ((char *) v->data + ( ic*nbv1 + i2*nbv2 + i3*nbv3)),
+ S16);
+ }
}
}
}