#define WHISPER_BUILD
#include "whisper.h"
+#ifdef WHISPER_USE_COREML
+#include "coreml/whisper-encoder.h"
+#endif
#include "ggml.h"
#define WHISPER_PRINT_DEBUG(...)
#endif
-#define WHISPER_USE_FLASH_ATTN
+//#define WHISPER_USE_FLASH_ATTN
//#define WHISPER_USE_FLASH_FF
#define WHISPER_MAX_DECODERS 16
static const size_t MB = 1ull*1024*1024;
static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
- { MODEL_TINY, 14ull*MB },
- { MODEL_BASE, 18ull*MB },
- { MODEL_SMALL, 28ull*MB },
- { MODEL_MEDIUM, 36ull*MB },
- { MODEL_LARGE, 44ull*MB },
+ { MODEL_TINY, 62ull*MB },
+ { MODEL_BASE, 80ull*MB },
+ { MODEL_SMALL, 120ull*MB },
+ { MODEL_MEDIUM, 158ull*MB },
+ { MODEL_LARGE, 198ull*MB },
};
static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
{ MODEL_LARGE, 9ull*MB },
};
-static const std::map<e_model, size_t> MEM_REQ_MODEL = {
- { MODEL_TINY, 74ull*MB },
- { MODEL_BASE, 142ull*MB },
- { MODEL_SMALL, 466ull*MB },
- { MODEL_MEDIUM, 1464ull*MB },
- { MODEL_LARGE, 2952ull*MB },
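+// model weight memory per weight type: the quantized (Q4/Q5) variants need far less than F16/F32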
+static const std::map<ggml_type, std::map<e_model, size_t>> MEM_REQ_MODEL = {
+ { GGML_TYPE_F32,
+ {
+ { MODEL_TINY, 74ull*MB },
+ { MODEL_BASE, 142ull*MB },
+ { MODEL_SMALL, 466ull*MB },
+ { MODEL_MEDIUM, 1464ull*MB },
+ { MODEL_LARGE, 2952ull*MB },
+ },
+ },
+ { GGML_TYPE_F16,
+ {
+ { MODEL_TINY, 74ull*MB },
+ { MODEL_BASE, 142ull*MB },
+ { MODEL_SMALL, 466ull*MB },
+ { MODEL_MEDIUM, 1464ull*MB },
+ { MODEL_LARGE, 2952ull*MB },
+ },
+ },
+ { GGML_TYPE_Q4_0,
+ {
+ { MODEL_TINY, 26ull*MB },
+ { MODEL_BASE, 50ull*MB },
+ { MODEL_SMALL, 154ull*MB },
+ { MODEL_MEDIUM, 470ull*MB },
+ { MODEL_LARGE, 940ull*MB },
+ },
+ },
+ { GGML_TYPE_Q4_1,
+ {
+ { MODEL_TINY, 31ull*MB },
+ { MODEL_BASE, 57ull*MB },
+ { MODEL_SMALL, 181ull*MB },
+ { MODEL_MEDIUM, 559ull*MB },
+ { MODEL_LARGE, 1122ull*MB },
+ },
+ },
+ { GGML_TYPE_Q4_2,
+ {
+ { MODEL_TINY, 26ull*MB },
+ { MODEL_BASE, 50ull*MB },
+ { MODEL_SMALL, 154ull*MB },
+ { MODEL_MEDIUM, 470ull*MB },
+ { MODEL_LARGE, 940ull*MB },
+ },
+ },
+ { GGML_TYPE_Q5_0, // TODO: fix
+ {
+ { MODEL_TINY, 31ull*MB },
+ { MODEL_BASE, 57ull*MB },
+ { MODEL_SMALL, 181ull*MB },
+ { MODEL_MEDIUM, 559ull*MB },
+ { MODEL_LARGE, 1122ull*MB },
+ },
+ },
+ { GGML_TYPE_Q5_1,
+ {
+ { MODEL_TINY, 31ull*MB },
+ { MODEL_BASE, 57ull*MB },
+ { MODEL_SMALL, 181ull*MB },
+ { MODEL_MEDIUM, 559ull*MB },
+ { MODEL_LARGE, 1122ull*MB },
+ },
+ },
};
static const std::map<e_model, size_t> MEM_REQ_KV_SELF = {
};
static const std::map<e_model, size_t> MEM_REQ_ENCODE = {
- { MODEL_TINY, 6ull*MB },
- { MODEL_BASE, 8ull*MB },
- { MODEL_SMALL, 13ull*MB },
- { MODEL_MEDIUM, 22ull*MB },
- { MODEL_LARGE, 33ull*MB },
+ { MODEL_TINY, 30ull*MB },
+ { MODEL_BASE, 38ull*MB },
+ { MODEL_SMALL, 56ull*MB },
+ { MODEL_MEDIUM, 74ull*MB },
+ { MODEL_LARGE, 94ull*MB },
};
static const std::map<e_model, size_t> MEM_REQ_DECODE = {
struct whisper_mel {
int n_len;
+ int n_len_org;
int n_mel;
std::vector<float> data;
int32_t n_text_head = 6;
int32_t n_text_layer = 4;
int32_t n_mels = 80;
- int32_t f16 = 1;
+ int32_t ftype = 1;
};
// audio encoding layer
int lang_id = 0; // english by default
+ std::string path_model; // populated by whisper_init_from_file()
+#ifdef WHISPER_USE_COREML
+ whisper_coreml_context * ctx_coreml = nullptr;
+#endif
+
// [EXPERIMENTAL] token-level timestamps data
int64_t t_beg = 0;
int64_t t_last = 0;
};
struct whisper_context {
- int64_t t_load_us = 0;
+ int64_t t_load_us = 0;
int64_t t_start_us = 0;
- ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 or FP16)
+ ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX)
+ ggml_type itype = ggml_type::GGML_TYPE_F16; // intermediate type (FP32 or FP16)
whisper_model model;
whisper_vocab vocab;
const ggml_type wtype = cache.k->type;
WHISPER_ASSERT(wtype == cache.v->type);
- WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_size(wtype));
+ WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_sizef(wtype));
struct ggml_init_params params = {
/*.mem_size =*/ cache.buf.size(),
read_safe(loader, hparams.n_text_head);
read_safe(loader, hparams.n_text_layer);
read_safe(loader, hparams.n_mels);
- read_safe(loader, hparams.f16);
+ read_safe(loader, hparams.ftype);
assert(hparams.n_text_state == hparams.n_audio_state);
model.type = e_model::MODEL_LARGE;
}
- // for the big tensors, we have the option to store the data in 16-bit floats
+ // for the big tensors, we have the option to store the data in 16-bit floats or quantized
// in order to save memory and also to speed up the computation
- wctx.wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+ wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+ if (wctx.wtype == GGML_TYPE_COUNT) {
+ fprintf(stderr, "%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
+ return false;
+ }
- const size_t scale = model.hparams.f16 ? 1 : 2;
+ const size_t scale = model.hparams.ftype ? 1 : 2;
fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
- fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16);
+ fprintf(stderr, "%s: ftype = %d\n", __func__, model.hparams.ftype);
fprintf(stderr, "%s: type = %d\n", __func__, model.type);
// print memory requirements
{
// this is the total memory required to run the inference
const size_t mem_required =
- MEM_REQ_SCRATCH0.at (model.type) +
- MEM_REQ_SCRATCH1.at (model.type) +
- MEM_REQ_SCRATCH2.at (model.type) +
- MEM_REQ_SCRATCH3.at (model.type) +
- scale*MEM_REQ_MODEL.at (model.type) +
+ MEM_REQ_SCRATCH0.at(model.type) +
+ MEM_REQ_SCRATCH1.at(model.type) +
+ MEM_REQ_SCRATCH2.at(model.type) +
+ MEM_REQ_SCRATCH3.at(model.type) +
+ scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type) +
scale*MEM_REQ_KV_CROSS.at(model.type) +
scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type));
// always have at least one decoder
wctx.model.buf = new std::vector<uint8_t>();
- wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(model.type));
+ wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type));
// we skip initialization of the state until it is needed
// because it might be that state will always be provided externally.
size_t ctx_size = 0;
const ggml_type wtype = wctx.wtype;
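+ // note: the conv weights use vtype below, so they stay F16 (or F32) even when the rest of the weights are quantized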
+ const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type
{
const auto & hparams = model.hparams;
// encoder
{
- ctx_size += n_audio_ctx*n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_pe;
+ ctx_size += n_audio_ctx*n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_pe;
- ctx_size += 3*n_mels*n_audio_state*ggml_type_size(wtype); // e_conv_1_w
- ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_conv_1_b
+ ctx_size += 3*n_mels*n_audio_state*ggml_type_sizef(vtype); // e_conv_1_w
+ ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_1_b
- ctx_size += 3*n_audio_state*n_audio_state*ggml_type_size(wtype); // e_conv_2_w
- ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_conv_2_b
+ ctx_size += 3*n_audio_state*n_audio_state*ggml_type_sizef(vtype); // e_conv_2_w
+ ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_2_b
- ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_ln_w;
- ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_ln_b;
+ ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_w;
+ ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_b;
}
// decoder
{
- ctx_size += n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F32); // d_pe;
+ ctx_size += n_text_ctx*n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_pe;
- ctx_size += n_vocab*n_text_state*ggml_type_size(wtype); // d_te;
+ ctx_size += n_vocab*n_text_state*ggml_type_sizef(wtype); // d_te;
- ctx_size += n_text_state*ggml_type_size(GGML_TYPE_F32); // d_ln_w;
- ctx_size += n_text_state*ggml_type_size(GGML_TYPE_F32); // d_ln_b;
+ ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_w;
+ ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_b;
}
// encoder layers
{
- ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_w
- ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_b
+ ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
+ ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b
- ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_size(wtype)); // mlp_0_w
- ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_0_b
+ ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_0_w
+ ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b
- ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_size(wtype)); // mlp_1_w
- ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_1_b
+ ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_1_w
+ ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b
- ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_w
- ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_b
+ ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
+ ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b
- ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_q_w
- ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_q_b
+ ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_q_w
+ ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b
- ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_k_w
+ ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_k_w
- ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_v_w
- ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_v_b
+ ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_v_w
+ ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b
- ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_ln_1_w
- ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_1_b
+ ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_ln_1_w
+ ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
}
// decoder layers
{
- ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_w
- ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_b
+ ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w
+ ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b
- ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_size(wtype)); // mlp_0_w
- ctx_size += n_text_layer*( 4*n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_0_b
+ ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_0_w
+ ctx_size += n_text_layer*( 4*n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b
- ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_size(wtype)); // mlp_1_w
- ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_1_b
+ ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_1_w
+ ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b
- ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_w
- ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_b
+ ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w
+ ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_q_w
- ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_q_b
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_q_w
+ ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_k_w
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_k_w
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_v_w
- ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_v_b
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_v_w
+ ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_ln_1_w
- ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_1_b
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_ln_1_w
+ ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b
//
- ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_0_w
- ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_0_b
+ ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_w
+ ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_b
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_q_w
- ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_q_b
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_q_w
+ ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_q_b
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_k_w
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_k_w
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_v_w
- ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_v_b
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_v_w
+ ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_v_b
- ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_ln_1_w
- ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_1_b
+ ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_ln_1_w
+ ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b
}
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead
// encoder
{
- model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx);
+ model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx);
- model.e_conv_1_w = ggml_new_tensor_3d(ctx, wtype, 3, n_mels, n_audio_state);
+ model.e_conv_1_w = ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state);
model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
- model.e_conv_2_w = ggml_new_tensor_3d(ctx, wtype, 3, n_audio_state, n_audio_state);
+ model.e_conv_2_w = ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state);
model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state);
- model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
- model.e_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+ model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+ model.e_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
// map by name
model.tensors["encoder.positional_embedding"] = model.e_pe;
- model.tensors["encoder.conv1.weight"] = model.e_conv_1_w;
- model.tensors["encoder.conv1.bias"] = model.e_conv_1_b;
+ model.tensors["encoder.conv1.weight"] = model.e_conv_1_w;
+ model.tensors["encoder.conv1.bias"] = model.e_conv_1_b;
- model.tensors["encoder.conv2.weight"] = model.e_conv_2_w;
- model.tensors["encoder.conv2.bias"] = model.e_conv_2_b;
+ model.tensors["encoder.conv2.weight"] = model.e_conv_2_w;
+ model.tensors["encoder.conv2.bias"] = model.e_conv_2_b;
- model.tensors["encoder.ln_post.weight"] = model.e_ln_w;
- model.tensors["encoder.ln_post.bias"] = model.e_ln_b;
+ model.tensors["encoder.ln_post.weight"] = model.e_ln_w;
+ model.tensors["encoder.ln_post.bias"] = model.e_ln_b;
for (int i = 0; i < n_audio_layer; ++i) {
auto & layer = model.layers_encoder[i];
- layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
- layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+ layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+ layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
- layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state);
- layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state);
+ layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state);
+ layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state);
- layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state);
- layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+ layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state);
+ layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
- layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
- layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+ layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+ layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
- layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
- layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+ layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
+ layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
- layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
+ layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
- layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
- layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+ layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
+ layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
- layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
- layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
+ layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state);
+ layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state);
// map by name
- model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w;
- model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b;
+ model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w;
+ model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b;
- model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w;
- model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b;
+ model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w;
+ model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b;
- model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w;
- model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b;
+ model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w;
+ model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b;
- model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w;
- model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b;
+ model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w;
+ model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b;
model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w;
model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b;
- model.tensors["encoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w;
+ model.tensors["encoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w;
model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w;
model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b;
- model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w;
- model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b;
+ model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w;
+ model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b;
}
}
// decoder
{
- model.d_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx);
+ model.d_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx);
- model.d_te = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab);
+ model.d_te = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab);
model.d_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
model.d_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
// map by name
- model.tensors["decoder.positional_embedding"] = model.d_pe;
+ model.tensors["decoder.positional_embedding"] = model.d_pe;
model.tensors["decoder.token_embedding.weight"] = model.d_te;
- model.tensors["decoder.ln.weight"] = model.d_ln_w;
- model.tensors["decoder.ln.bias"] = model.d_ln_b;
+ model.tensors["decoder.ln.weight"] = model.d_ln_w;
+ model.tensors["decoder.ln.bias"] = model.d_ln_b;
for (int i = 0; i < n_text_layer; ++i) {
auto & layer = model.layers_decoder[i];
- layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
- layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+ layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+ layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
- layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state);
- layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state);
+ layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state);
+ layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state);
- layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state);
- layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+ layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state);
+ layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
- layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
- layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+ layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+ layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
- layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
- layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+ layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+ layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
- layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+ layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
- layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
- layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+ layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+ layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
- layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
- layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+ layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+ layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
- layer.cross_attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
- layer.cross_attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+ layer.cross_attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+ layer.cross_attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
- layer.cross_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
- layer.cross_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+ layer.cross_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+ layer.cross_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
- layer.cross_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+ layer.cross_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
- layer.cross_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
- layer.cross_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+ layer.cross_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+ layer.cross_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
- layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
- layer.cross_attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
+ layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state);
+ layer.cross_attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state);
// map by name
- model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w;
- model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b;
+ model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w;
+ model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b;
- model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w;
- model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b;
+ model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w;
+ model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b;
- model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w;
- model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b;
+ model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w;
+ model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b;
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w;
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b;
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w;
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b;
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w;
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b;
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w;
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b;
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w;
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w;
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w;
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b;
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w;
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b;
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w;
- model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b;
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w;
+ model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b;
- model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.weight"] = layer.cross_attn_ln_0_w;
- model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.bias"] = layer.cross_attn_ln_0_b;
+ model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.weight"] = layer.cross_attn_ln_0_w;
+ model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.bias"] = layer.cross_attn_ln_0_b;
model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.weight"] = layer.cross_attn_q_w;
model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.bias"] = layer.cross_attn_q_b;
- model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.key.weight"] = layer.cross_attn_k_w;
+ model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.key.weight"] = layer.cross_attn_k_w;
model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.weight"] = layer.cross_attn_v_w;
model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.bias"] = layer.cross_attn_v_b;
- model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.weight"] = layer.cross_attn_ln_1_w;
- model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.bias"] = layer.cross_attn_ln_1_b;
+ model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.weight"] = layer.cross_attn_ln_1_w;
+ model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.bias"] = layer.cross_attn_ln_1_b;
}
}
}
while (true) {
int32_t n_dims;
int32_t length;
- int32_t ftype;
+ int32_t ttype;
read_safe(loader, n_dims);
read_safe(loader, length);
- read_safe(loader, ftype);
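+ // per-tensor type is now a ggml_type value (previously a 0/1 f16 flag)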
+ read_safe(loader, ttype);
if (loader->eof(loader->context)) {
break;
}
- int64_t nelements = 1;
- int64_t ne[3] = { 1, 1, 1 };
+ int32_t nelements = 1;
+ int32_t ne[3] = { 1, 1, 1 };
for (int i = 0; i < n_dims; ++i) {
- int32_t ne_cur;
- read_safe(loader, ne_cur);
- ne[i] = ne_cur;
+ read_safe(loader, ne[i]);
nelements *= ne[i];
}
}
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) {
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld, %lld], expected [%lld, %lld, %lld]\n",
- __func__, name.data(), tensor->ne[0], tensor->ne[1], tensor->ne[2], ne[0], ne[1], ne[2]);
+ fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
+ __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], ne[0], ne[1], ne[2]);
return false;
}
- const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t);
+ const size_t bpe = ggml_type_size(ggml_type(ttype));
- if (nelements*bpe != ggml_nbytes(tensor)) {
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %llu\n",
+ if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
return false;
}
loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
BYTESWAP_TENSOR(tensor);
- //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
+ //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype), ggml_nbytes(tensor)/1024.0/1024.0);
total_size += ggml_nbytes(tensor);
model.n_loaded++;
}
struct ggml_tensor * cur;
- // convolution + gelu
- {
- wstate.use_buf(ctx0, 1);
+#ifndef WHISPER_USE_COREML
+ const bool use_coreml = false;
+#else
+ const bool use_coreml = wstate.ctx_coreml != nullptr;
+#endif
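+ // when a Core ML encoder is available, the ggml convolution + transformer graph below is skipped entirely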
- cur = ggml_conv_1d_1s(ctx0, model.e_conv_1_w, mel);
- cur = ggml_add(ctx0,
- ggml_repeat(ctx0,
- model.e_conv_1_b,
- cur),
- cur);
+ if (!use_coreml) {
+ // convolution + gelu
+ {
+ wstate.use_buf(ctx0, 1);
- cur = ggml_gelu(ctx0, cur);
+ cur = ggml_conv_1d_1s(ctx0, model.e_conv_1_w, mel);
+ cur = ggml_add(ctx0,
+ ggml_repeat(ctx0,
+ model.e_conv_1_b,
+ cur),
+ cur);
- wstate.use_buf(ctx0, 0);
+ cur = ggml_gelu(ctx0, cur);
- cur = ggml_conv_1d_2s(ctx0, model.e_conv_2_w, cur);
- cur = ggml_add(ctx0,
- ggml_repeat(ctx0,
- model.e_conv_2_b,
- cur),
- cur);
+ wstate.use_buf(ctx0, 0);
- cur = ggml_gelu(ctx0, cur);
- }
+ cur = ggml_conv_1d_2s(ctx0, model.e_conv_2_w, cur);
+ cur = ggml_add(ctx0,
+ ggml_repeat(ctx0,
+ model.e_conv_2_b,
+ cur),
+ cur);
- wstate.use_buf(ctx0, 3);
+ cur = ggml_gelu(ctx0, cur);
+ }
- // ===================================================================
- // NOTE: experimenting with partial evaluation of the encoder (ignore)
- //static int iter = -1;
- //const int n_iter = 1500/n_ctx;
+ wstate.use_buf(ctx0, 3);
- //iter = (iter + 1) % n_iter;
+ // ===================================================================
+ // NOTE: experimenting with partial evaluation of the encoder (ignore)
+ //static int iter = -1;
+ //const int n_iter = 1500/n_ctx;
- //if (iter == 0) {
- // memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k));
- // memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v));
- //}
+ //iter = (iter + 1) % n_iter;
- static int iter = 0;
+ //if (iter == 0) {
+ // memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k));
+ // memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v));
+ //}
- const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe);
- const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter;
+ static int iter = 0;
- struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
+ const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe);
+ const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter;
- cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur));
+ struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset);
- // ===================================================================
+ cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur));
- // original:
- //cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
+ // ===================================================================
- struct ggml_tensor * inpL = cur;
+ // original:
+ //cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur));
- for (int il = 0; il < n_layer; ++il) {
- const auto & layer = model.layers_encoder[il];
+ struct ggml_tensor * inpL = cur;
- // norm
- {
- wstate.use_buf(ctx0, 0);
+ for (int il = 0; il < n_layer; ++il) {
+ const auto & layer = model.layers_encoder[il];
- cur = ggml_norm(ctx0, inpL);
+ // norm
+ {
+ wstate.use_buf(ctx0, 0);
- // cur = ln_0_w*cur + ln_0_b
- cur = ggml_add(ctx0,
- ggml_mul(ctx0,
- ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
- cur),
- ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
- }
+ cur = ggml_norm(ctx0, inpL);
- // self-attention
- {
- wstate.use_buf(ctx0, 1);
+ // cur = ln_0_w*cur + ln_0_b
+ cur = ggml_add(ctx0,
+ ggml_mul(ctx0,
+ ggml_repeat(ctx0, layer.attn_ln_0_w, cur),
+ cur),
+ ggml_repeat(ctx0, layer.attn_ln_0_b, cur));
+ }
- struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
- layer.attn_q_w,
- cur);
+ // self-attention
+ {
+ wstate.use_buf(ctx0, 1);
- Qcur = ggml_add(ctx0,
- ggml_repeat(ctx0,
- layer.attn_q_b,
- Qcur),
- Qcur);
+ struct ggml_tensor * Qcur = ggml_mul_mat(ctx0,
+ layer.attn_q_w,
+ cur);
- //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+ Qcur = ggml_add(ctx0,
+ ggml_repeat(ctx0,
+ layer.attn_q_b,
+ Qcur),
+ Qcur);
- // note: no bias for Key
- struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
- layer.attn_k_w,
- cur);
+ //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
- //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
+ // note: no bias for Key
+ struct ggml_tensor * Kcur = ggml_mul_mat(ctx0,
+ layer.attn_k_w,
+ cur);
- struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
- layer.attn_v_w,
- cur);
+ //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25)));
- Vcur = ggml_add(ctx0,
- ggml_repeat(ctx0,
- layer.attn_v_b,
- Vcur),
- Vcur);
+ struct ggml_tensor * Vcur = ggml_mul_mat(ctx0,
+ layer.attn_v_w,
+ cur);
- // ------
+ Vcur = ggml_add(ctx0,
+ ggml_repeat(ctx0,
+ layer.attn_v_b,
+ Vcur),
+ Vcur);
- wstate.use_buf(ctx0, 0);
+ // ------
+
+ wstate.use_buf(ctx0, 0);
#ifdef WHISPER_USE_FLASH_ATTN
- struct ggml_tensor * Q =
- ggml_permute(ctx0,
- ggml_cpy(ctx0,
- Qcur,
- ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
- 0, 2, 1, 3);
+ struct ggml_tensor * Q =
+ ggml_permute(ctx0,
+ ggml_cpy(ctx0,
+ Qcur,
+ ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
+ 0, 2, 1, 3);
+
+ struct ggml_tensor * K =
+ ggml_permute(ctx0,
+ ggml_cpy(ctx0,
+ Kcur,
+ ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
+ 0, 2, 1, 3);
+
+ struct ggml_tensor * V =
+ ggml_cpy(ctx0,
+ ggml_permute(ctx0,
+ ggml_reshape_3d(ctx0,
+ Vcur,
+ n_state/n_head, n_head, n_ctx),
+ 1, 2, 0, 3),
+ ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head));
+
+ struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
+#else
+ struct ggml_tensor * Q =
+ ggml_permute(ctx0,
+ ggml_cpy(ctx0,
+ Qcur,
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
+ 0, 2, 1, 3);
+
+ struct ggml_tensor * K =
+ ggml_permute(ctx0,
+ ggml_cpy(ctx0,
+ Kcur,
+ ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)),
+ 0, 2, 1, 3);
+
+ // K * Q
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+
+ struct ggml_tensor * KQ_scaled =
+ ggml_scale(ctx0,
+ KQ,
+ ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
+ );
+
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
+
+ struct ggml_tensor * V =
+ ggml_cpy(ctx0,
+ ggml_permute(ctx0,
+ ggml_reshape_3d(ctx0,
+ Vcur,
+ n_state/n_head, n_head, n_ctx),
+ 1, 2, 0, 3),
+ ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)
+ );
+
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
+#endif
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
- struct ggml_tensor * K =
- ggml_permute(ctx0,
- ggml_cpy(ctx0,
- Kcur,
- ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
- 0, 2, 1, 3);
+ wstate.use_buf(ctx0, 1);
- struct ggml_tensor * V =
- ggml_cpy(ctx0,
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- Vcur,
- n_state/n_head, n_head, n_ctx),
- 1, 2, 0, 3),
- ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head));
-
- struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false);
-#else
- struct ggml_tensor * Q =
- ggml_permute(ctx0,
- ggml_cpy(ctx0,
- Qcur,
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)),
- 0, 2, 1, 3);
+ cur = ggml_cpy(ctx0,
+ KQV_merged,
+ ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx));
+ }
- struct ggml_tensor * K =
- ggml_permute(ctx0,
- ggml_cpy(ctx0,
- Kcur,
- ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
- 0, 2, 1, 3);
+ // projection
+ {
+ wstate.use_buf(ctx0, 0);
- // K * Q
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ cur = ggml_mul_mat(ctx0,
+ layer.attn_ln_1_w,
+ cur);
- struct ggml_tensor * KQ_scaled =
- ggml_scale(ctx0,
- KQ,
- ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head))
- );
+ wstate.use_buf(ctx0, 1);
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
+ cur = ggml_add(ctx0,
+ ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
+ cur);
+ }
- //struct ggml_tensor * V_trans =
- // ggml_permute(ctx0,
- // ggml_cpy(ctx0,
- // Vcur,
- // ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
- // 1, 2, 0, 3);
+ wstate.use_buf(ctx0, 2);
- //struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+ // add the input
+ cur = ggml_add(ctx0, cur, inpL);
- struct ggml_tensor * V =
- ggml_cpy(ctx0,
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- Vcur,
- n_state/n_head, n_head, n_ctx),
- 0, 2, 1, 3),
- ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_ctx, n_head)
- );
-
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, ggml_transpose(ctx0, V), KQ_soft_max);
-#endif
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ struct ggml_tensor * inpFF = cur;
- wstate.use_buf(ctx0, 1);
+ // feed-forward network
+ {
+ // norm
+ {
+ wstate.use_buf(ctx0, 0);
- cur = ggml_cpy(ctx0,
- KQV_merged,
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx));
- }
+ cur = ggml_norm(ctx0, inpFF);
- // projection
- {
- wstate.use_buf(ctx0, 0);
+ wstate.use_buf(ctx0, 1);
- cur = ggml_mul_mat(ctx0,
- layer.attn_ln_1_w,
- cur);
+ // cur = mlp_ln_w*cur + mlp_ln_b
+ cur = ggml_add(ctx0,
+ ggml_mul(ctx0,
+ ggml_repeat(ctx0, layer.mlp_ln_w, cur),
+ cur),
+ ggml_repeat(ctx0, layer.mlp_ln_b, cur));
+ }
- wstate.use_buf(ctx0, 1);
+#ifdef WHISPER_USE_FLASH_FF
+ wstate.use_buf(ctx0, 0);
- cur = ggml_add(ctx0,
- ggml_repeat(ctx0, layer.attn_ln_1_b, cur),
- cur);
- }
+ cur = ggml_flash_ff(ctx0,
+ ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wctx.itype, n_state, n_ctx)),
+ layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
+#else
+ wstate.use_buf(ctx0, 0);
- wstate.use_buf(ctx0, 2);
+ // fully connected
+ cur = ggml_mul_mat(ctx0,
+ layer.mlp_0_w,
+ cur);
- // add the input
- cur = ggml_add(ctx0, cur, inpL);
+ wstate.use_buf(ctx0, 1);
- struct ggml_tensor * inpFF = cur;
+ cur = ggml_add(ctx0,
+ ggml_repeat(ctx0, layer.mlp_0_b, cur),
+ cur);
- // feed-forward network
- {
- // norm
- {
wstate.use_buf(ctx0, 0);
- cur = ggml_norm(ctx0, inpFF);
+ // GELU activation
+ cur = ggml_gelu(ctx0, cur);
wstate.use_buf(ctx0, 1);
- // cur = mlp_ln_w*cur + mlp_ln_b
- cur = ggml_add(ctx0,
- ggml_mul(ctx0,
- ggml_repeat(ctx0, layer.mlp_ln_w, cur),
- cur),
- ggml_repeat(ctx0, layer.mlp_ln_b, cur));
- }
+ // projection
+ cur = ggml_mul_mat(ctx0,
+ layer.mlp_1_w,
+ cur);
-#ifdef WHISPER_USE_FLASH_FF
- wstate.use_buf(ctx0, 0);
+ wstate.use_buf(ctx0, 0);
- cur = ggml_flash_ff(ctx0,
- ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.wtype, n_state, n_ctx)),
- layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b);
-#else
- wstate.use_buf(ctx0, 0);
+ cur = ggml_add(ctx0,
+ ggml_repeat(ctx0, layer.mlp_1_b, cur),
+ cur);
+#endif
+ }
- // fully connected
- cur = ggml_mul_mat(ctx0,
- layer.mlp_0_w,
- cur);
+ wstate.use_buf(ctx0, 3);
- wstate.use_buf(ctx0, 1);
+ inpL = ggml_add(ctx0, cur, inpFF);
+ }
- cur = ggml_add(ctx0,
- ggml_repeat(ctx0, layer.mlp_0_b, cur),
- cur);
+ cur = inpL;
+ // norm
+ {
wstate.use_buf(ctx0, 0);
- // GELU activation
- cur = ggml_gelu(ctx0, cur);
+ cur = ggml_norm(ctx0, cur);
wstate.use_buf(ctx0, 1);
- // projection
- cur = ggml_mul_mat(ctx0,
- layer.mlp_1_w,
- cur);
-
- wstate.use_buf(ctx0, 0);
-
+ // cur = ln_f_g*cur + ln_f_b
cur = ggml_add(ctx0,
- ggml_repeat(ctx0, layer.mlp_1_b, cur),
- cur);
-#endif
+ ggml_mul(ctx0,
+ ggml_repeat(ctx0, model.e_ln_w, cur),
+ cur),
+ ggml_repeat(ctx0, model.e_ln_b, cur));
}
- wstate.use_buf(ctx0, 3);
+ wstate.use_buf(ctx0, -1);
- inpL = ggml_add(ctx0, cur, inpFF);
- }
-
- cur = inpL;
-
- // norm
- {
- wstate.use_buf(ctx0, 0);
-
- cur = ggml_norm(ctx0, cur);
+ // run the computation
+ {
+ struct ggml_cgraph gf = {};
+ gf.n_threads = n_threads;
- wstate.use_buf(ctx0, 1);
+ ggml_build_forward_expand(&gf, cur);
+ ggml_graph_compute(ctx0, &gf);
- // cur = ln_f_g*cur + ln_f_b
- cur = ggml_add(ctx0,
- ggml_mul(ctx0,
- ggml_repeat(ctx0, model.e_ln_w, cur),
- cur),
- ggml_repeat(ctx0, model.e_ln_b, cur));
+ //ggml_graph_print(&gf);
+ }
}
-
- wstate.use_buf(ctx0, -1);
-
- // run the computation
+#ifdef WHISPER_USE_COREML
+ else
{
- struct ggml_cgraph gf = {};
- gf.n_threads = n_threads;
+ wstate.use_buf(ctx0, -1);
- ggml_build_forward_expand(&gf, cur);
- ggml_graph_compute(ctx0, &gf);
+ cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx);
- //ggml_graph_print(&gf);
+ whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data);
}
+#endif
// cur
//{
}
}
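+// worker for the multi-threaded mel computation below: thread ith processes frames ith, ith + n_threads, ith + 2*n_threads, ...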
+static void log_mel_spectrogram_worker_thread(int ith, const std::vector<float> &hann, const float *samples,
+ int n_samples, int fft_size, int fft_step, int n_threads,
+ const whisper_filters &filters, bool speed_up, whisper_mel &mel) {
+ std::vector<float> fft_in(fft_size, 0.0);
+ std::vector<float> fft_out(2 * fft_size);
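+ // number of usable spectrum bins: DC + half the spectrum (halved again when speed_up averages bin pairs)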
+ int n_fft = 1 + (speed_up ? fft_size / 4 : fft_size / 2);
+
+ for (int i = ith; i < mel.n_len; i += n_threads) {
+ const int offset = i * fft_step;
+
+ // apply Hanning window
+ for (int j = 0; j < fft_size; j++) {
+ if (offset + j < n_samples) {
+ fft_in[j] = hann[j] * samples[offset + j];
+ } else {
+ fft_in[j] = 0.0;
+ }
+ }
+
+ // FFT -> mag^2
+ fft(fft_in, fft_out);
+
+ for (int j = 0; j < fft_size; j++) {
+ fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]);
+ }
+ for (int j = 1; j < fft_size / 2; j++) {
+ fft_out[j] += fft_out[fft_size - j];
+ }
+
+ if (speed_up) {
+ // scaling down in the frequency domain results in a speed-up in the time domain
+ for (int j = 0; j < n_fft; j++) {
+ fft_out[j] = 0.5 * (fft_out[2 * j] + fft_out[2 * j + 1]);
+ }
+ }
+
+ // mel spectrogram
+ for (int j = 0; j < mel.n_mel; j++) {
+ double sum = 0.0;
+
+ // unroll loop (suggested by GH user @lunixbochs)
+ int k = 0;
+ for (k = 0; k < n_fft - 3; k += 4) {
+ sum +=
+ fft_out[k + 0] * filters.data[j*n_fft + k + 0] +
+ fft_out[k + 1] * filters.data[j*n_fft + k + 1] +
+ fft_out[k + 2] * filters.data[j*n_fft + k + 2] +
+ fft_out[k + 3] * filters.data[j*n_fft + k + 3];
+ }
+
+ // handle n_fft remainder
+ for (; k < n_fft; k++) {
+ sum += fft_out[k] * filters.data[j * n_fft + k];
+ }
+
+ sum = log10(std::max(sum, 1e-10));
+
+ mel.data[j * mel.n_len + i] = sum;
+ }
+ }
+}
+
// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L92-L124
static bool log_mel_spectrogram(
whisper_state & wstate,
hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size)));
}
- mel.n_mel = n_mel;
- mel.n_len = (n_samples)/fft_step;
- mel.data.resize(mel.n_mel*mel.n_len);
-
- const int n_fft = 1 + (speed_up ? fft_size/4 : fft_size/2);
-
- //printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len);
- //printf("%s: recording length: %f s\n", __func__, (float) n_samples/sample_rate);
-
- std::vector<std::thread> workers(n_threads);
- for (int iw = 0; iw < n_threads; ++iw) {
- workers[iw] = std::thread([&](int ith) {
- std::vector<float> fft_in;
- fft_in.resize(fft_size);
- for (int i = 0; i < fft_size; i++) {
- fft_in[i] = 0.0;
- }
-
- std::vector<float> fft_out;
- fft_out.resize(2*fft_size);
+ mel.n_mel = n_mel;
+ mel.n_len = n_samples/fft_step;
+ mel.n_len_org = mel.n_len;
- for (int i = ith; i < mel.n_len; i += n_threads) {
- const int offset = i*fft_step;
+ std::vector<float> samples_padded;
- // apply Hanning window
- for (int j = 0; j < fft_size; j++) {
- if (offset + j < n_samples) {
- fft_in[j] = hann[j]*samples[offset + j];
- } else {
- fft_in[j] = 0.0;
- }
- }
+ // pad audio with at least one extra chunk of zeros
+ {
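+ // 100 frames per second: round n_len up to a multiple of half a chunk, then add half a chunk of zeros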
+ const int pad = (100*WHISPER_CHUNK_SIZE)/2;
- // FFT -> mag^2
- fft(fft_in, fft_out);
+ if (mel.n_len % pad != 0) {
+ mel.n_len = (mel.n_len/pad + 1)*pad;
+ }
+ mel.n_len += pad;
- for (int j = 0; j < fft_size; j++) {
- fft_out[j] = (fft_out[2*j + 0]*fft_out[2*j + 0] + fft_out[2*j + 1]*fft_out[2*j + 1]);
- }
- for (int j = 1; j < fft_size/2; j++) {
- //if (i == 0) {
- // printf("%d: %f %f\n", j, fft_out[j], fft_out[fft_size - j]);
- //}
- fft_out[j] += fft_out[fft_size - j];
- }
- if (i == 0) {
- //for (int j = 0; j < fft_size; j++) {
- // printf("%d: %e\n", j, fft_out[j]);
- //}
- }
+ samples_padded.resize(mel.n_len*fft_step);
+ memcpy(samples_padded.data(), samples, n_samples*sizeof(float));
+ memset(samples_padded.data() + n_samples, 0, (mel.n_len*fft_step - n_samples)*sizeof(float));
- if (speed_up) {
- // scale down in the frequency domain results in a speed up in the time domain
- for (int j = 0; j < n_fft; j++) {
- fft_out[j] = 0.5*(fft_out[2*j] + fft_out[2*j + 1]);
- }
- }
+ samples = samples_padded.data();
+ }
- // mel spectrogram
- for (int j = 0; j < mel.n_mel; j++) {
- double sum = 0.0;
+ mel.data.resize(mel.n_mel*mel.n_len);
- for (int k = 0; k < n_fft; k++) {
- sum += fft_out[k]*filters.data[j*n_fft + k];
- }
- if (sum < 1e-10) {
- sum = 1e-10;
- }
+ //printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len);
+ //printf("%s: recording length: %f s\n", __func__, (float) n_samples/sample_rate);
- sum = log10(sum);
+ {
+ std::vector<std::thread> workers(n_threads - 1);
+ for (int iw = 0; iw < n_threads - 1; ++iw) {
+ workers[iw] = std::thread(
+ log_mel_spectrogram_worker_thread, iw + 1, std::cref(hann), samples,
+ n_samples, fft_size, fft_step, n_threads,
+ std::cref(filters), speed_up, std::ref(mel));
+ }
- mel.data[j*mel.n_len + i] = sum;
- }
- }
- }, iw);
- }
+ // main thread
+ log_mel_spectrogram_worker_thread(0, hann, samples, n_samples, fft_size, fft_step, n_threads, filters, speed_up, mel);
- for (int iw = 0; iw < n_threads; ++iw) {
- workers[iw].join();
+ for (int iw = 0; iw < n_threads - 1; ++iw) {
+ workers[iw].join();
+ }
}
// clamping and normalization
wstate.t_mel_us += ggml_time_us() - t_start_us;
+ //printf("mel.n_len() = %d, divided by 1500: %f, n_samples / fft_step: %d\n", mel.n_len, mel.n_len / 1500.0, n_samples / fft_step);
+
return true;
}
int n = word.size();
while (i < n) {
int j = n;
+ bool found = false;
while (j > i) {
- auto it = vocab.token_to_id.find(word.substr(i, j-i));
+ auto sub = word.substr(i, j-i);
+ auto it = vocab.token_to_id.find(sub);
if (it != vocab.token_to_id.end()) {
tokens.push_back(it->second);
i = j;
+ found = true;
break;
}
--j;
}
- if (i == n) {
- break;
- }
- if (j == i) {
- auto sub = word.substr(i, 1);
- if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) {
- tokens.push_back(vocab.token_to_id.at(sub));
- } else {
- fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data());
- }
+ if (!found) {
+ fprintf(stderr, "unknown token \n");
++i;
}
}
// interface implementation
//
+#ifdef WHISPER_USE_COREML
+// replace .bin with -encoder.mlmodelc
+static std::string whisper_get_coreml_path_encoder(std::string path_bin) {
+ auto pos = path_bin.rfind('.');
+ if (pos != std::string::npos) {
+ path_bin = path_bin.substr(0, pos);
+ }
+
+ path_bin += "-encoder.mlmodelc";
+
+ return path_bin;
+}
+#endif
+
struct whisper_state * whisper_init_state(whisper_context * ctx) {
whisper_state * state = new whisper_state;
- const size_t scale = ctx->model.hparams.f16 ? 1 : 2;
+ const size_t scale = ctx->model.hparams.ftype ? 1 : 2;
- if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->wtype, ctx->model.hparams.n_text_ctx)) {
+ if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->itype, ctx->model.hparams.n_text_ctx)) {
fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
+ delete state;
return nullptr;
}
fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
}
- if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->wtype, ctx->model.hparams.n_audio_ctx)) {
+ if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
fprintf(stderr, "%s: kv_cache_init() failed for cross-attention cache\n", __func__);
+ delete state;
return nullptr;
}
fprintf(stderr, "%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
}
+#ifdef WHISPER_USE_COREML
+ const auto path_coreml = whisper_get_coreml_path_encoder(ctx->path_model);
+
+ fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
+ fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
+
+ state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
+ if (!state->ctx_coreml) {
+ fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
+#ifndef WHISPER_COREML_ALLOW_FALLBACK
+ delete state;
+ return nullptr;
+#endif
+ } else {
+ fprintf(stderr, "%s: Core ML model loaded\n", __func__);
+ }
+#endif
+
state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx);
state->logits_id.reserve(ctx->model.hparams.n_vocab);
}
struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
- whisper_model_loader loader = {};
fprintf(stderr, "%s: loading model from '%s'\n", __func__, path_model);
return nullptr;
}
+ whisper_model_loader loader = {};
+
loader.context = &fin;
+
loader.read = [](void * ctx, void * output, size_t read_size) {
std::ifstream * fin = (std::ifstream*)ctx;
fin->read((char *)output, read_size);
};
buf_context ctx = { reinterpret_cast<uint8_t*>(buffer), buffer_size, 0 };
- whisper_model_loader loader = {};
fprintf(stderr, "%s: loading model from buffer\n", __func__);
+ whisper_model_loader loader = {};
+
loader.context = &ctx;
loader.read = [](void * ctx, void * output, size_t read_size) {
kv_cache_free(state->decoders[i].kv_self);
}
+#ifdef WHISPER_USE_COREML
+ if (state->ctx_coreml != nullptr) {
+ whisper_coreml_free(state->ctx_coreml);
+ state->ctx_coreml = nullptr;
+ }
+#endif
+
delete state;
}
}
return -1;
}
- state->mel.n_len = n_len;
- state->mel.n_mel = n_mel;
+ state->mel.n_len = n_len;
+ state->mel.n_len_org = n_len;
+ state->mel.n_mel = n_mel;
state->mel.data.resize(n_len*n_mel);
memcpy(state->mel.data.data(), data, n_len*n_mel*sizeof(float));
fprintf(stderr, "%s: unknown language '%s'\n", __func__, lang);
return -1;
}
-
return g_lang.at(lang).first;
}
return -1;
}
- if (seek >= state->mel.n_len) {
- fprintf(stderr, "%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len*10);
+ if (seek >= state->mel.n_len_org) {
+ fprintf(stderr, "%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len_org*10);
return -2;
}
return ctx->model.hparams.n_mels;
}
-int whisper_model_f16(struct whisper_context * ctx) {
- return ctx->model.hparams.f16;
+int whisper_model_ftype(struct whisper_context * ctx) {
+ return ctx->model.hparams.ftype;
}
int whisper_model_type(struct whisper_context * ctx) {
}
int whisper_n_len_from_state(struct whisper_state * state) {
- return state->mel.n_len;
+ return state->mel.n_len_org;
}
int whisper_n_len(struct whisper_context * ctx) {
- return ctx->state->mel.n_len;
+ return ctx->state->mel.n_len_org;
}
int whisper_n_vocab(struct whisper_context * ctx) {
}
}
+static int whisper_has_coreml(void) {
+#ifdef WHISPER_USE_COREML
+ return 1;
+#else
+ return 0;
+#endif
+}
+
const char * whisper_print_system_info(void) {
static std::string s;
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+ s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
return s.c_str();
}
/*.max_initial_ts =*/ 1.0f,
/*.length_penalty =*/ -1.0f,
- /*.temperature_inc =*/ 0.0f, // TODO: temporary disabled until improve performance
+ /*.temperature_inc =*/ 0.4f,
/*.entropy_thold =*/ 2.4f,
/*.logprob_thold =*/ -1.0f,
/*.no_speech_thold =*/ 0.6f,
case WHISPER_SAMPLING_GREEDY:
{
result.greedy = {
- /*.best_of =*/ 1,
+ /*.best_of =*/ 2, // TODO: increase to 5 when we speed-up batch decoding
};
} break;
case WHISPER_SAMPLING_BEAM_SEARCH:
{
result.beam_search = {
- /*.beam_size =*/ 5,
+ /*.beam_size =*/ 2, // TODO: increase to 5 when we speed-up batch decoding
/*.patience =*/ -1.0f,
};
// trim from start (in place)
static inline void ltrim(std::string &s) {
- s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) {
- return !std::isspace(ch);
+ s.erase(s.begin(), std::find_if_not(s.begin(), s.end(), [](unsigned char ch) {
+ return std::isspace(ch);
}));
}
// trim from end (in place)
static inline void rtrim(std::string &s) {
- s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) {
- return !std::isspace(ch);
+ s.erase(std::find_if_not(s.rbegin(), s.rend(), [](unsigned char ch) {
+ return std::isspace(ch);
}).base(), s.end());
}
}
const int seek_start = params.offset_ms/10;
- const int seek_end = seek_start + (params.duration_ms == 0 ? whisper_n_len_from_state(state) : params.duration_ms/10);
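+ // if no duration is specified, process until the end of the spectrogram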
+ const int seek_end = params.duration_ms == 0 ? whisper_n_len_from_state(state) : seek_start + params.duration_ms/10;
// if length of spectrogram is less than 1s (100 samples), then return
// basically don't process anything that is less than 1s
prompt_past.clear();
}
- // initial prompt
- if (!params.prompt_tokens && params.initial_prompt) {
+ // prepare prompt
+ {
std::vector<whisper_token> prompt_tokens;
- prompt_tokens.resize(1024);
- prompt_tokens.resize(whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size()));
- params.prompt_tokens = prompt_tokens.data();
- params.prompt_n_tokens = prompt_tokens.size();
- }
- // prepend the prompt tokens to the prompt_past
- if (params.prompt_tokens && params.prompt_n_tokens > 0) {
- // parse tokens from the pointer
- for (int i = 0; i < params.prompt_n_tokens; i++) {
- prompt_past.push_back(params.prompt_tokens[i]);
+ // initial prompt
+ if (!params.prompt_tokens && params.initial_prompt) {
+ prompt_tokens.resize(1024);
+ prompt_tokens.resize(whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size()));
+ params.prompt_tokens = prompt_tokens.data();
+ params.prompt_n_tokens = prompt_tokens.size();
+ }
+
+ // prepend the prompt tokens to the prompt_past
+ if (params.prompt_tokens && params.prompt_n_tokens > 0) {
+ // parse tokens from the pointer
+ for (int i = 0; i < params.prompt_n_tokens; i++) {
+ prompt_past.push_back(params.prompt_tokens[i]);
+ }
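+ // move the freshly appended prompt tokens to the front of prompt_past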
+ std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end());
}
- std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end());
}
// overwrite audio_ctx, max allowed is hparams.n_audio_ctx
}
// was the decoding successful for the current temperature?
- {
+ // do fallback only if:
+ // - we are not at the last temperature
+ // - we are not at the end of the audio (3 sec)
+ if (it != (int) temperatures.size() - 1 &&
+ seek_end - seek > 10*WHISPER_CHUNK_SIZE) {
bool success = true;
const auto & decoder = state->decoders[best_decoder_id];
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
std::vector<char> buf(4llu*N_max*N_max*sizeof(float) + 4*256);
+ // fill the buffer with varied (deterministic) data so the multiplications are not all zeros
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
for (int j = 0; j < (int) sizes.size(); j++) {
+ int n_q4_0 = 0;
+ int n_q4_1 = 0;
int n_fp16 = 0;
int n_fp32 = 0;
// GFLOPS/s
+ double s_q4_0 = 0.0;
+ double s_q4_1 = 0.0;
double s_fp16 = 0.0;
double s_fp32 = 0.0;
const size_t N = sizes[j];
- for (int k = 0; k < 2; ++k) {
- const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32;
+ for (int k = 0; k < 4; ++k) {
+ const ggml_type wtype =
+ k == 0 ? GGML_TYPE_Q4_0 :
+ k == 1 ? GGML_TYPE_Q4_1 :
+ k == 2 ? GGML_TYPE_F16 :
+ GGML_TYPE_F32;
- double & s = k == 0 ? s_fp16 : s_fp32;
- int & n = k == 0 ? n_fp16 : n_fp32;
+ double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_fp16 : s_fp32;
+ int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_fp16 : n_fp32;
struct ggml_init_params gparams = {
/*.mem_size =*/ buf.size(),
s = ((2.0*N*N*N*n)/tsum)*1e-9;
}
- snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n",
- N, N, s_fp16, n_fp16, s_fp32, n_fp32);
+ snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) / Q4_1 %7.1f GFLOPS (%3d runs) / F16 %7.1f GFLOPS (%3d runs) / F32 %7.1f GFLOPS (%3d runs)\n",
+ N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_fp16, n_fp16, s_fp32, n_fp32);
s += strbuf;
}