From: Georgi Gerganov Date: Sun, 30 Apr 2023 16:03:35 +0000 (+0300) Subject: ggml, whisper : sync whisper.cpp (GGML_FTYPE + Q5 WASM SIMD) X-Git-Tag: upstream/0.0.1642~1502 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=cf6815bfd21d9b77785b3ddda30f3d58f47c8d4a;p=pkg%2Fggml%2Fsources%2Fggml ggml, whisper : sync whisper.cpp (GGML_FTYPE + Q5 WASM SIMD) --- diff --git a/examples/common-ggml.cpp b/examples/common-ggml.cpp index 5835dd70..226f2b14 100644 --- a/examples/common-ggml.cpp +++ b/examples/common-ggml.cpp @@ -1,6 +1,7 @@ #include "common-ggml.h" #include +#include static const std::map GGML_FTYPE_MAP = { {"q4_0", GGML_FTYPE_MOSTLY_Q4_0}, @@ -33,29 +34,6 @@ enum ggml_ftype ggml_parse_ftype(const char * str) { return ftype; } -enum ggml_type ggml_ftype_to_ggml_type(const enum ggml_ftype ftype) { - ggml_type wtype = GGML_TYPE_COUNT; - - switch (ftype) { - case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break; - case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break; - case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break; - case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break; - case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break; - case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break; - case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break; - case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break; - case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; - case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; - } - - if (wtype == GGML_TYPE_COUNT) { - fprintf(stderr, "%s: invalid model type %d\n", __func__, ftype); - } - - return wtype; -} - bool ggml_common_quantize_0( std::ifstream & finp, std::ofstream & fout, diff --git a/examples/common-ggml.h b/examples/common-ggml.h index 2eb30a34..477de341 100644 --- a/examples/common-ggml.h +++ b/examples/common-ggml.h @@ -2,31 +2,13 @@ #include "ggml.h" -#include #include #include #include -// model file types -enum ggml_ftype { - GGML_FTYPE_UNKNOWN = -1, - GGML_FTYPE_ALL_F32 = 0, - GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors - GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 - GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors - GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors - GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors - GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors -}; - -void ggml_print_ftypes(FILE * fp = stderr); - enum ggml_ftype ggml_parse_ftype(const char * str); -// TODO: temporary -enum ggml_type ggml_ftype_to_ggml_type(const enum ggml_ftype ftype); +void ggml_print_ftypes(FILE * fp = stderr); bool ggml_common_quantize_0( std::ifstream & finp, diff --git a/examples/whisper/main.cpp b/examples/whisper/main.cpp index 7b2885c7..3e8c5aaa 100644 --- a/examples/whisper/main.cpp +++ b/examples/whisper/main.cpp @@ -57,7 +57,7 @@ struct whisper_params { int32_t duration_ms = 0; int32_t max_context = -1; int32_t max_len = 0; - int32_t best_of = 5; + int32_t best_of = 2; int32_t beam_size = -1; float word_thold = 0.01f; @@ -75,6 +75,7 @@ struct whisper_params { bool output_wts = false; bool output_csv = false; bool output_jsn = false; + bool output_lrc = false; bool print_special = false; bool print_colors = false; bool print_progress = false; @@ -130,6 +131,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; } else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; } else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; } + else if (arg == "-olrc" || arg == "--output-lrc") { params.output_lrc = true; } else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; } else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; } else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; } @@ -178,6 +180,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -otxt, --output-txt [%-7s] output result in a text file\n", params.output_txt ? "true" : "false"); fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false"); fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false"); + fprintf(stderr, " -olrc, --output-lrc [%-7s] output result in a lrc file\n", params.output_lrc ? "true" : "false"); fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false"); fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str()); fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false"); @@ -208,8 +211,8 @@ void whisper_print_segment_callback(struct whisper_context * ctx, struct whisper std::string speaker = ""; - int64_t t0; - int64_t t1; + int64_t t0 = 0; + int64_t t1 = 0; // print the last n_new segments const int s0 = n_segments - n_new; @@ -349,30 +352,7 @@ bool output_srt(struct whisper_context * ctx, const char * fname, const whisper_ return true; } -bool output_csv(struct whisper_context * ctx, const char * fname) { - std::ofstream fout(fname); - if (!fout.is_open()) { - fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname); - return false; - } - - fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname); - - const int n_segments = whisper_full_n_segments(ctx); - fout << "start,end,text\n"; - for (int i = 0; i < n_segments; ++i) { - const char * text = whisper_full_get_segment_text(ctx, i); - const int64_t t0 = whisper_full_get_segment_t0(ctx, i); - const int64_t t1 = whisper_full_get_segment_t1(ctx, i); - - //need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds. - fout << 10 * t0 << "," << 10 * t1 << ",\"" << text << "\"\n"; - } - - return true; -} - -char *escape_double_quotes(const char *str) { +char *escape_double_quotes_and_backslashes(const char *str) { if (str == NULL) { return NULL; } @@ -380,7 +360,7 @@ char *escape_double_quotes(const char *str) { size_t escaped_length = strlen(str) + 1; for (size_t i = 0; str[i] != '\0'; i++) { - if (str[i] == '"') { + if (str[i] == '"' || str[i] == '\\') { escaped_length++; } } @@ -392,12 +372,10 @@ char *escape_double_quotes(const char *str) { size_t pos = 0; for (size_t i = 0; str[i] != '\0'; i++) { - if (str[i] == '"') { + if (str[i] == '"' || str[i] == '\\') { escaped[pos++] = '\\'; - escaped[pos++] = '"'; - } else { - escaped[pos++] = str[i]; } + escaped[pos++] = str[i]; } // no need to set zero due to calloc() being used prior @@ -405,6 +383,30 @@ char *escape_double_quotes(const char *str) { return escaped; } +bool output_csv(struct whisper_context * ctx, const char * fname) { + std::ofstream fout(fname); + if (!fout.is_open()) { + fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname); + return false; + } + + fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname); + + const int n_segments = whisper_full_n_segments(ctx); + fout << "start,end,text\n"; + for (int i = 0; i < n_segments; ++i) { + const char * text = whisper_full_get_segment_text(ctx, i); + const int64_t t0 = whisper_full_get_segment_t0(ctx, i); + const int64_t t1 = whisper_full_get_segment_t1(ctx, i); + char * text_escaped = escape_double_quotes_and_backslashes(text); + + //need to multiply times returned from whisper_full_get_segment_t{0,1}() by 10 to get milliseconds. + fout << 10 * t0 << "," << 10 * t1 << ",\"" << text_escaped << "\"\n"; + } + + return true; +} + bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) { std::ofstream fout(fname); int indent = 0; @@ -448,7 +450,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper auto value_s = [&](const char *name, const char *val, bool end = false) { start_value(name); - char * val_escaped = escape_double_quotes(val); + char * val_escaped = escape_double_quotes_and_backslashes(val); fout << "\"" << val_escaped << (end ? "\"\n" : "\",\n"); free(val_escaped); }; @@ -494,7 +496,7 @@ bool output_json(struct whisper_context * ctx, const char * fname, const whisper value_i("layer", whisper_model_n_text_layer(ctx), true); end_obj(); value_i("mels", whisper_model_n_mels(ctx)); - value_i("f16", whisper_model_f16(ctx), true); + value_i("ftype", whisper_model_ftype(ctx), true); end_obj(); start_obj("params"); value_s("model", params.model.c_str()); @@ -647,6 +649,39 @@ bool output_wts(struct whisper_context * ctx, const char * fname, const char * f return true; } +bool output_lrc(struct whisper_context * ctx, const char * fname) { + + std::ofstream fout(fname); + if (!fout.is_open()) { + fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname); + return false; + } + + fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname); + + fout << "[by:whisper.cpp]\n"; + + const int n_segments = whisper_full_n_segments(ctx); + for (int i = 0; i < n_segments; ++i) { + const char * text = whisper_full_get_segment_text(ctx, i); + const int64_t t = whisper_full_get_segment_t0(ctx, i); + + int64_t msec = t * 10; + int64_t min = msec / (1000 * 60); + msec = msec - min * (1000 * 60); + int64_t sec = msec / 1000; + msec = msec - sec * 1000; + + char buf[16]; + snprintf(buf, sizeof(buf), "%02d:%02d.%02d", (int) min, (int) sec, (int) ( msec / 10)); + std::string timestamp_lrc = std::string(buf); + + fout << '[' << timestamp_lrc << ']' << text << "\n"; + } + + return true; +} + int main(int argc, char ** argv) { whisper_params params; @@ -813,6 +848,12 @@ int main(int argc, char ** argv) { const auto fname_jsn = fname_out + ".json"; output_json(ctx, fname_jsn.c_str(), params); } + + // output to LRC file + if (params.output_lrc) { + const auto fname_lrc = fname_out + ".lrc"; + output_lrc(ctx, fname_lrc.c_str()); + } } } diff --git a/examples/whisper/quantize.cpp b/examples/whisper/quantize.cpp index 994fabd4..dc20e4ea 100644 --- a/examples/whisper/quantize.cpp +++ b/examples/whisper/quantize.cpp @@ -1,4 +1,4 @@ -#include "ggml/ggml.h" +#include "ggml.h" #include "common.h" #include "common-ggml.h" @@ -170,10 +170,7 @@ bool whisper_model_quantize(const std::string & fname_inp, const std::string & f int main(int argc, char ** argv) { if (argc != 4) { fprintf(stderr, "usage: %s model-f32.bin model-quant.bin type\n", argv[0]); - fprintf(stderr, " type = 2 - q4_0\n"); - fprintf(stderr, " type = 3 - q4_1\n"); - fprintf(stderr, " type = 5 - q4_2\n"); - fprintf(stderr, " type = 6 - q4_3\n"); + ggml_print_ftypes(stderr); return 1; } @@ -187,7 +184,7 @@ int main(int argc, char ** argv) { const std::string fname_inp = argv[1]; const std::string fname_out = argv[2]; - const int ftype = atoi(argv[3]); + const ggml_ftype ftype = ggml_parse_ftype(argv[3]); const int64_t t_main_start_us = ggml_time_us(); diff --git a/examples/whisper/whisper.cpp b/examples/whisper/whisper.cpp index 24b9e5d8..4f83dcd8 100644 --- a/examples/whisper/whisper.cpp +++ b/examples/whisper/whisper.cpp @@ -1,5 +1,7 @@ -#define WHISPER_BUILD #include "whisper.h" +#if WHISPER_USE_COREML +#include "coreml/whisper-encoder.h" +#endif #include "ggml.h" @@ -99,7 +101,7 @@ static void byteswap_tensor(ggml_tensor * tensor) { #define WHISPER_PRINT_DEBUG(...) #endif -#define WHISPER_USE_FLASH_ATTN +//#define WHISPER_USE_FLASH_ATTN //#define WHISPER_USE_FLASH_FF #define WHISPER_MAX_DECODERS 16 @@ -221,11 +223,11 @@ static const std::map> g_lang = { static const size_t MB = 1ull*1024*1024; static const std::map MEM_REQ_SCRATCH0 = { - { MODEL_TINY, 14ull*MB }, - { MODEL_BASE, 18ull*MB }, - { MODEL_SMALL, 28ull*MB }, - { MODEL_MEDIUM, 36ull*MB }, - { MODEL_LARGE, 44ull*MB }, + { MODEL_TINY, 62ull*MB }, + { MODEL_BASE, 80ull*MB }, + { MODEL_SMALL, 120ull*MB }, + { MODEL_MEDIUM, 158ull*MB }, + { MODEL_LARGE, 198ull*MB }, }; static const std::map MEM_REQ_SCRATCH1 = { @@ -252,12 +254,70 @@ static const std::map MEM_REQ_SCRATCH3 = { { MODEL_LARGE, 9ull*MB }, }; -static const std::map MEM_REQ_MODEL = { - { MODEL_TINY, 74ull*MB }, - { MODEL_BASE, 142ull*MB }, - { MODEL_SMALL, 466ull*MB }, - { MODEL_MEDIUM, 1464ull*MB }, - { MODEL_LARGE, 2952ull*MB }, +static const std::map> MEM_REQ_MODEL = { + { GGML_TYPE_F32, + { + { MODEL_TINY, 74ull*MB }, + { MODEL_BASE, 142ull*MB }, + { MODEL_SMALL, 466ull*MB }, + { MODEL_MEDIUM, 1464ull*MB }, + { MODEL_LARGE, 2952ull*MB }, + }, + }, + { GGML_TYPE_F16, + { + { MODEL_TINY, 74ull*MB }, + { MODEL_BASE, 142ull*MB }, + { MODEL_SMALL, 466ull*MB }, + { MODEL_MEDIUM, 1464ull*MB }, + { MODEL_LARGE, 2952ull*MB }, + }, + }, + { GGML_TYPE_Q4_0, + { + { MODEL_TINY, 26ull*MB }, + { MODEL_BASE, 50ull*MB }, + { MODEL_SMALL, 154ull*MB }, + { MODEL_MEDIUM, 470ull*MB }, + { MODEL_LARGE, 940ull*MB }, + }, + }, + { GGML_TYPE_Q4_1, + { + { MODEL_TINY, 31ull*MB }, + { MODEL_BASE, 57ull*MB }, + { MODEL_SMALL, 181ull*MB }, + { MODEL_MEDIUM, 559ull*MB }, + { MODEL_LARGE, 1122ull*MB }, + }, + }, + { GGML_TYPE_Q4_2, + { + { MODEL_TINY, 26ull*MB }, + { MODEL_BASE, 50ull*MB }, + { MODEL_SMALL, 154ull*MB }, + { MODEL_MEDIUM, 470ull*MB }, + { MODEL_LARGE, 940ull*MB }, + }, + }, + { GGML_TYPE_Q5_0, // TODO: fix + { + { MODEL_TINY, 31ull*MB }, + { MODEL_BASE, 57ull*MB }, + { MODEL_SMALL, 181ull*MB }, + { MODEL_MEDIUM, 559ull*MB }, + { MODEL_LARGE, 1122ull*MB }, + }, + }, + { GGML_TYPE_Q5_1, + { + { MODEL_TINY, 31ull*MB }, + { MODEL_BASE, 57ull*MB }, + { MODEL_SMALL, 181ull*MB }, + { MODEL_MEDIUM, 559ull*MB }, + { MODEL_LARGE, 1122ull*MB }, + }, + }, }; static const std::map MEM_REQ_KV_SELF = { @@ -277,11 +337,11 @@ static const std::map MEM_REQ_KV_CROSS = { }; static const std::map MEM_REQ_ENCODE = { - { MODEL_TINY, 6ull*MB }, - { MODEL_BASE, 8ull*MB }, - { MODEL_SMALL, 13ull*MB }, - { MODEL_MEDIUM, 22ull*MB }, - { MODEL_LARGE, 33ull*MB }, + { MODEL_TINY, 30ull*MB }, + { MODEL_BASE, 38ull*MB }, + { MODEL_SMALL, 56ull*MB }, + { MODEL_MEDIUM, 74ull*MB }, + { MODEL_LARGE, 94ull*MB }, }; static const std::map MEM_REQ_DECODE = { @@ -294,6 +354,7 @@ static const std::map MEM_REQ_DECODE = { struct whisper_mel { int n_len; + int n_len_org; int n_mel; std::vector data; @@ -366,7 +427,7 @@ struct whisper_hparams { int32_t n_text_head = 6; int32_t n_text_layer = 4; int32_t n_mels = 80; - int32_t f16 = 1; + int32_t ftype = 1; }; // audio encoding layer @@ -586,6 +647,11 @@ struct whisper_state { int lang_id = 0; // english by default + std::string path_model; // populated by whisper_init_from_file() +#ifdef WHISPER_USE_COREML + whisper_coreml_context * ctx_coreml = nullptr; +#endif + // [EXPERIMENTAL] token-level timestamps data int64_t t_beg = 0; int64_t t_last = 0; @@ -628,10 +694,11 @@ struct whisper_state { }; struct whisper_context { - int64_t t_load_us = 0; + int64_t t_load_us = 0; int64_t t_start_us = 0; - ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 or FP16) + ggml_type wtype = ggml_type::GGML_TYPE_F16; // weight type (FP32 / FP16 / QX) + ggml_type itype = ggml_type::GGML_TYPE_F16; // intermediate type (FP32 or FP16) whisper_model model; whisper_vocab vocab; @@ -688,7 +755,7 @@ static bool kv_cache_reinit(struct whisper_kv_cache & cache) { const ggml_type wtype = cache.k->type; WHISPER_ASSERT(wtype == cache.v->type); - WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_size(wtype)); + WHISPER_ASSERT(cache.buf.size() >= 2*n_elements*ggml_type_sizef(wtype)); struct ggml_init_params params = { /*.mem_size =*/ cache.buf.size(), @@ -761,7 +828,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con read_safe(loader, hparams.n_text_head); read_safe(loader, hparams.n_text_layer); read_safe(loader, hparams.n_mels); - read_safe(loader, hparams.f16); + read_safe(loader, hparams.ftype); assert(hparams.n_text_state == hparams.n_audio_state); @@ -785,11 +852,15 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con model.type = e_model::MODEL_LARGE; } - // for the big tensors, we have the option to store the data in 16-bit floats + // for the big tensors, we have the option to store the data in 16-bit floats or quantized // in order to save memory and also to speed up the computation - wctx.wtype = model.hparams.f16 ? GGML_TYPE_F16 : GGML_TYPE_F32; + wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wctx.wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype); + return false; + } - const size_t scale = model.hparams.f16 ? 1 : 2; + const size_t scale = model.hparams.ftype ? 1 : 2; fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab); fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx); @@ -801,18 +872,18 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head); fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer); fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels); - fprintf(stderr, "%s: f16 = %d\n", __func__, hparams.f16); + fprintf(stderr, "%s: ftype = %d\n", __func__, model.hparams.ftype); fprintf(stderr, "%s: type = %d\n", __func__, model.type); // print memory requirements { // this is the total memory required to run the inference const size_t mem_required = - MEM_REQ_SCRATCH0.at (model.type) + - MEM_REQ_SCRATCH1.at (model.type) + - MEM_REQ_SCRATCH2.at (model.type) + - MEM_REQ_SCRATCH3.at (model.type) + - scale*MEM_REQ_MODEL.at (model.type) + + MEM_REQ_SCRATCH0.at(model.type) + + MEM_REQ_SCRATCH1.at(model.type) + + MEM_REQ_SCRATCH2.at(model.type) + + MEM_REQ_SCRATCH3.at(model.type) + + scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type) + scale*MEM_REQ_KV_CROSS.at(model.type) + scale*std::max(MEM_REQ_ENCODE.at(model.type), MEM_REQ_DECODE.at(model.type)); @@ -828,7 +899,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con // always have at least one decoder wctx.model.buf = new std::vector(); - wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(model.type)); + wctx.model.buf->resize(scale*MEM_REQ_MODEL.at(wctx.wtype).at(model.type)); // we skip initialization of the state until it is needed // because it might be that state will always be provided externally. @@ -919,6 +990,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con size_t ctx_size = 0; const ggml_type wtype = wctx.wtype; + const ggml_type vtype = wctx.wtype == GGML_TYPE_F32 ? GGML_TYPE_F32 : GGML_TYPE_F16; // conv type { const auto & hparams = model.hparams; @@ -937,92 +1009,92 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con // encoder { - ctx_size += n_audio_ctx*n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_pe; + ctx_size += n_audio_ctx*n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_pe; - ctx_size += 3*n_mels*n_audio_state*ggml_type_size(wtype); // e_conv_1_w - ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_conv_1_b + ctx_size += 3*n_mels*n_audio_state*ggml_type_sizef(vtype); // e_conv_1_w + ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_1_b - ctx_size += 3*n_audio_state*n_audio_state*ggml_type_size(wtype); // e_conv_2_w - ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_conv_2_b + ctx_size += 3*n_audio_state*n_audio_state*ggml_type_sizef(vtype); // e_conv_2_w + ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_conv_2_b - ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_ln_w; - ctx_size += n_audio_state*ggml_type_size(GGML_TYPE_F32); // e_ln_b; + ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_w; + ctx_size += n_audio_state*ggml_type_sizef(GGML_TYPE_F32); // e_ln_b; } // decoder { - ctx_size += n_text_ctx*n_text_state*ggml_type_size(GGML_TYPE_F32); // d_pe; + ctx_size += n_text_ctx*n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_pe; - ctx_size += n_vocab*n_text_state*ggml_type_size(wtype); // d_te; + ctx_size += n_vocab*n_text_state*ggml_type_sizef(wtype); // d_te; - ctx_size += n_text_state*ggml_type_size(GGML_TYPE_F32); // d_ln_w; - ctx_size += n_text_state*ggml_type_size(GGML_TYPE_F32); // d_ln_b; + ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_w; + ctx_size += n_text_state*ggml_type_sizef(GGML_TYPE_F32); // d_ln_b; } // encoder layers { - ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_w - ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_b + ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w + ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b - ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_size(wtype)); // mlp_0_w - ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_0_b + ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_0_w + ctx_size += n_audio_layer*( 4*n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b - ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_size(wtype)); // mlp_1_w - ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // mlp_1_b + ctx_size += n_audio_layer*(4*n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // mlp_1_w + ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b - ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_w - ctx_size += n_audio_layer*(n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_b + ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w + ctx_size += n_audio_layer*(n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b - ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_q_w - ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_q_b + ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_q_w + ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b - ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_k_w + ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_k_w - ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_v_w - ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_v_b + ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_v_w + ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b - ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_size(wtype)); // attn_ln_1_w - ctx_size += n_audio_layer*( n_audio_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_1_b + ctx_size += n_audio_layer*(n_audio_state*n_audio_state*ggml_type_sizef(wtype)); // attn_ln_1_w + ctx_size += n_audio_layer*( n_audio_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b } // decoder layers { - ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_w - ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_ln_b + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_w + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_ln_b - ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_size(wtype)); // mlp_0_w - ctx_size += n_text_layer*( 4*n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_0_b + ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_0_w + ctx_size += n_text_layer*( 4*n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_0_b - ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_size(wtype)); // mlp_1_w - ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // mlp_1_b + ctx_size += n_text_layer*(4*n_text_state*n_text_state*ggml_type_sizef(wtype)); // mlp_1_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // mlp_1_b - ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_w - ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_0_b + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_w + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_0_b - ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_q_w - ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_q_b + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_q_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_q_b - ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_k_w + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_k_w - ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_v_w - ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_v_b + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_v_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_v_b - ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // attn_ln_1_w - ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // attn_ln_1_b + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // attn_ln_1_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // attn_ln_1_b // - ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_0_w - ctx_size += n_text_layer*(n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_0_b + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_w + ctx_size += n_text_layer*(n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_0_b - ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_q_w - ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_q_b + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_q_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_q_b - ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_k_w + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_k_w - ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_v_w - ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_v_b + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_v_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_v_b - ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_size(wtype)); // cross_attn_ln_1_w - ctx_size += n_text_layer*( n_text_state*ggml_type_size(GGML_TYPE_F32)); // cross_attn_ln_1_b + ctx_size += n_text_layer*(n_text_state*n_text_state*ggml_type_sizef(wtype)); // cross_attn_ln_1_w + ctx_size += n_text_layer*( n_text_state*ggml_type_sizef(GGML_TYPE_F32)); // cross_attn_ln_1_b } ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*256; // object overhead @@ -1068,175 +1140,175 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con // encoder { - model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx); + model.e_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_audio_state, n_audio_ctx); - model.e_conv_1_w = ggml_new_tensor_3d(ctx, wtype, 3, n_mels, n_audio_state); + model.e_conv_1_w = ggml_new_tensor_3d(ctx, vtype, 3, n_mels, n_audio_state); model.e_conv_1_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state); - model.e_conv_2_w = ggml_new_tensor_3d(ctx, wtype, 3, n_audio_state, n_audio_state); + model.e_conv_2_w = ggml_new_tensor_3d(ctx, vtype, 3, n_audio_state, n_audio_state); model.e_conv_2_b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 1, n_audio_state); - model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); - model.e_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + model.e_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + model.e_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); // map by name model.tensors["encoder.positional_embedding"] = model.e_pe; - model.tensors["encoder.conv1.weight"] = model.e_conv_1_w; - model.tensors["encoder.conv1.bias"] = model.e_conv_1_b; + model.tensors["encoder.conv1.weight"] = model.e_conv_1_w; + model.tensors["encoder.conv1.bias"] = model.e_conv_1_b; - model.tensors["encoder.conv2.weight"] = model.e_conv_2_w; - model.tensors["encoder.conv2.bias"] = model.e_conv_2_b; + model.tensors["encoder.conv2.weight"] = model.e_conv_2_w; + model.tensors["encoder.conv2.bias"] = model.e_conv_2_b; - model.tensors["encoder.ln_post.weight"] = model.e_ln_w; - model.tensors["encoder.ln_post.bias"] = model.e_ln_b; + model.tensors["encoder.ln_post.weight"] = model.e_ln_w; + model.tensors["encoder.ln_post.bias"] = model.e_ln_b; for (int i = 0; i < n_audio_layer; ++i) { auto & layer = model.layers_encoder[i]; - layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); - layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); - layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state); - layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state); + layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, 4*n_audio_state); + layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_audio_state); - layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state); - layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_audio_state, n_audio_state); + layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); - layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); - layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); - layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); - layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); + layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); - layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); + layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); - layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); - layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); + layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); - layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); - layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); + layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_audio_state, n_audio_state); + layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_audio_state); // map by name - model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w; - model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b; + model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w; + model.tensors["encoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b; - model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w; - model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b; + model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w; + model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b; - model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w; - model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b; + model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w; + model.tensors["encoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b; - model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w; - model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b; + model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w; + model.tensors["encoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b; model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w; model.tensors["encoder.blocks." + std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b; - model.tensors["encoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w; + model.tensors["encoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w; model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w; model.tensors["encoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b; - model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w; - model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b; + model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w; + model.tensors["encoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b; } } // decoder { - model.d_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx); + model.d_pe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_text_state, n_text_ctx); - model.d_te = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab); + model.d_te = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_vocab); model.d_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); model.d_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); // map by name - model.tensors["decoder.positional_embedding"] = model.d_pe; + model.tensors["decoder.positional_embedding"] = model.d_pe; model.tensors["decoder.token_embedding.weight"] = model.d_te; - model.tensors["decoder.ln.weight"] = model.d_ln_w; - model.tensors["decoder.ln.bias"] = model.d_ln_b; + model.tensors["decoder.ln.weight"] = model.d_ln_w; + model.tensors["decoder.ln.bias"] = model.d_ln_b; for (int i = 0; i < n_text_layer; ++i) { auto & layer = model.layers_decoder[i]; - layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.mlp_ln_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.mlp_ln_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state); - layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state); + layer.mlp_0_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, 4*n_text_state); + layer.mlp_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_text_state); - layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state); - layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.mlp_1_w = ggml_new_tensor_2d(ctx, wtype, 4*n_text_state, n_text_state); + layer.mlp_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); - layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); - layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); - layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); - layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.cross_attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.cross_attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.cross_attn_ln_0_w = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.cross_attn_ln_0_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.cross_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); - layer.cross_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.cross_attn_q_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.cross_attn_q_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.cross_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.cross_attn_k_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); - layer.cross_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); - layer.cross_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.cross_attn_v_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.cross_attn_v_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); - layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); - layer.cross_attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); + layer.cross_attn_ln_1_w = ggml_new_tensor_2d(ctx, wtype, n_text_state, n_text_state); + layer.cross_attn_ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_text_state); // map by name - model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w; - model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b; + model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.weight"] = layer.mlp_ln_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".mlp_ln.bias"] = layer.mlp_ln_b; - model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w; - model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b; + model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.weight"] = layer.mlp_0_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.0.bias"] = layer.mlp_0_b; - model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w; - model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b; + model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.weight"] = layer.mlp_1_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".mlp.2.bias"] = layer.mlp_1_b; - model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w; - model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b; + model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.weight"] = layer.attn_ln_0_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".attn_ln.bias"] = layer.attn_ln_0_b; - model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w; - model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b; + model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.weight"] = layer.attn_q_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".attn.query.bias"] = layer.attn_q_b; - model.tensors["decoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".attn.key.weight"] = layer.attn_k_w; - model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w; - model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b; + model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.weight"] = layer.attn_v_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".attn.value.bias"] = layer.attn_v_b; - model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w; - model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b; + model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.weight"] = layer.attn_ln_1_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".attn.out.bias"] = layer.attn_ln_1_b; - model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.weight"] = layer.cross_attn_ln_0_w; - model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.bias"] = layer.cross_attn_ln_0_b; + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.weight"] = layer.cross_attn_ln_0_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn_ln.bias"] = layer.cross_attn_ln_0_b; model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.weight"] = layer.cross_attn_q_w; model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.query.bias"] = layer.cross_attn_q_b; - model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.key.weight"] = layer.cross_attn_k_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.key.weight"] = layer.cross_attn_k_w; model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.weight"] = layer.cross_attn_v_w; model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.value.bias"] = layer.cross_attn_v_b; - model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.weight"] = layer.cross_attn_ln_1_w; - model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.bias"] = layer.cross_attn_ln_1_b; + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.weight"] = layer.cross_attn_ln_1_w; + model.tensors["decoder.blocks." + std::to_string(i) + ".cross_attn.out.bias"] = layer.cross_attn_ln_1_b; } } } @@ -1250,22 +1322,20 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con while (true) { int32_t n_dims; int32_t length; - int32_t ftype; + int32_t ttype; read_safe(loader, n_dims); read_safe(loader, length); - read_safe(loader, ftype); + read_safe(loader, ttype); if (loader->eof(loader->context)) { break; } - int64_t nelements = 1; - int64_t ne[3] = { 1, 1, 1 }; + int32_t nelements = 1; + int32_t ne[3] = { 1, 1, 1 }; for (int i = 0; i < n_dims; ++i) { - int32_t ne_cur; - read_safe(loader, ne_cur); - ne[i] = ne_cur; + read_safe(loader, ne[i]); nelements *= ne[i]; } @@ -1286,15 +1356,15 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con } if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%lld, %lld, %lld], expected [%lld, %lld, %lld]\n", - __func__, name.data(), tensor->ne[0], tensor->ne[1], tensor->ne[2], ne[0], ne[1], ne[2]); + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n", + __func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], ne[0], ne[1], ne[2]); return false; } - const size_t bpe = (ftype == 0) ? sizeof(float) : sizeof(ggml_fp16_t); + const size_t bpe = ggml_type_size(ggml_type(ttype)); - if (nelements*bpe != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %llu\n", + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", __func__, name.data(), ggml_nbytes(tensor), nelements*bpe); return false; } @@ -1302,7 +1372,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con loader->read(loader->context, tensor->data, ggml_nbytes(tensor)); BYTESWAP_TENSOR(tensor); - //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0); + //printf("%48s - [%5d, %5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype), ggml_nbytes(tensor)/1024.0/1024.0); total_size += ggml_nbytes(tensor); model.n_loaded++; } @@ -1380,311 +1450,320 @@ static bool whisper_encode_internal( struct ggml_tensor * cur; - // convolution + gelu - { - wstate.use_buf(ctx0, 1); +#ifndef WHISPER_USE_COREML + const bool use_coreml = false; +#else + const bool use_coreml = wstate.ctx_coreml != nullptr; +#endif - cur = ggml_conv_1d_1s(ctx0, model.e_conv_1_w, mel); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, - model.e_conv_1_b, - cur), - cur); + if (!use_coreml) { + // convolution + gelu + { + wstate.use_buf(ctx0, 1); - cur = ggml_gelu(ctx0, cur); + cur = ggml_conv_1d_1s(ctx0, model.e_conv_1_w, mel); + cur = ggml_add(ctx0, + ggml_repeat(ctx0, + model.e_conv_1_b, + cur), + cur); - wstate.use_buf(ctx0, 0); + cur = ggml_gelu(ctx0, cur); - cur = ggml_conv_1d_2s(ctx0, model.e_conv_2_w, cur); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, - model.e_conv_2_b, - cur), - cur); + wstate.use_buf(ctx0, 0); - cur = ggml_gelu(ctx0, cur); - } + cur = ggml_conv_1d_2s(ctx0, model.e_conv_2_w, cur); + cur = ggml_add(ctx0, + ggml_repeat(ctx0, + model.e_conv_2_b, + cur), + cur); - wstate.use_buf(ctx0, 3); + cur = ggml_gelu(ctx0, cur); + } - // =================================================================== - // NOTE: experimenting with partial evaluation of the encoder (ignore) - //static int iter = -1; - //const int n_iter = 1500/n_ctx; + wstate.use_buf(ctx0, 3); - //iter = (iter + 1) % n_iter; + // =================================================================== + // NOTE: experimenting with partial evaluation of the encoder (ignore) + //static int iter = -1; + //const int n_iter = 1500/n_ctx; - //if (iter == 0) { - // memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k)); - // memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v)); - //} + //iter = (iter + 1) % n_iter; - static int iter = 0; + //if (iter == 0) { + // memset(model.memory_cross_k->data, 0, ggml_nbytes(model.memory_cross_k)); + // memset(model.memory_cross_v->data, 0, ggml_nbytes(model.memory_cross_v)); + //} - const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe); - const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter; + static int iter = 0; - struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset); + const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe); + const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter; - cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur)); + struct ggml_tensor * e_pe = ggml_view_2d(ctx0, model.e_pe, model.e_pe->ne[0], n_ctx, e_pe_stride, e_pe_offset); - // =================================================================== + cur = ggml_add(ctx0, e_pe, ggml_transpose(ctx0, cur)); - // original: - //cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur)); + // =================================================================== - struct ggml_tensor * inpL = cur; + // original: + //cur = ggml_add(ctx0, model.e_pe, ggml_transpose(ctx0, cur)); - for (int il = 0; il < n_layer; ++il) { - const auto & layer = model.layers_encoder[il]; + struct ggml_tensor * inpL = cur; - // norm - { - wstate.use_buf(ctx0, 0); + for (int il = 0; il < n_layer; ++il) { + const auto & layer = model.layers_encoder[il]; - cur = ggml_norm(ctx0, inpL); + // norm + { + wstate.use_buf(ctx0, 0); - // cur = ln_0_w*cur + ln_0_b - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, layer.attn_ln_0_w, cur), - cur), - ggml_repeat(ctx0, layer.attn_ln_0_b, cur)); - } + cur = ggml_norm(ctx0, inpL); - // self-attention - { - wstate.use_buf(ctx0, 1); + // cur = ln_0_w*cur + ln_0_b + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, layer.attn_ln_0_w, cur), + cur), + ggml_repeat(ctx0, layer.attn_ln_0_b, cur)); + } - struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, - layer.attn_q_w, - cur); + // self-attention + { + wstate.use_buf(ctx0, 1); - Qcur = ggml_add(ctx0, - ggml_repeat(ctx0, - layer.attn_q_b, - Qcur), - Qcur); + struct ggml_tensor * Qcur = ggml_mul_mat(ctx0, + layer.attn_q_w, + cur); - //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25))); + Qcur = ggml_add(ctx0, + ggml_repeat(ctx0, + layer.attn_q_b, + Qcur), + Qcur); - // note: no bias for Key - struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, - layer.attn_k_w, - cur); + //Qcur = ggml_scale(ctx0, Qcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25))); - //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25))); + // note: no bias for Key + struct ggml_tensor * Kcur = ggml_mul_mat(ctx0, + layer.attn_k_w, + cur); - struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, - layer.attn_v_w, - cur); + //Kcur = ggml_scale(ctx0, Kcur, ggml_new_f32(ctx0, pow(float(n_state)/n_head, -0.25))); - Vcur = ggml_add(ctx0, - ggml_repeat(ctx0, - layer.attn_v_b, - Vcur), - Vcur); + struct ggml_tensor * Vcur = ggml_mul_mat(ctx0, + layer.attn_v_w, + cur); - // ------ + Vcur = ggml_add(ctx0, + ggml_repeat(ctx0, + layer.attn_v_b, + Vcur), + Vcur); - wstate.use_buf(ctx0, 0); + // ------ + + wstate.use_buf(ctx0, 0); #ifdef WHISPER_USE_FLASH_ATTN - struct ggml_tensor * Q = - ggml_permute(ctx0, - ggml_cpy(ctx0, - Qcur, - ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)), - 0, 2, 1, 3); + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)), + 0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Kcur, + ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)), + 0, 2, 1, 3); + + struct ggml_tensor * V = + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + Vcur, + n_state/n_head, n_head, n_ctx), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head)); + + struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false); +#else + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)), + 0, 2, 1, 3); + + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Kcur, + ggml_new_tensor_3d(ctx0, wctx.itype, n_state/n_head, n_head, n_ctx)), + 0, 2, 1, 3); + + // K * Q + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + + struct ggml_tensor * KQ_scaled = + ggml_scale(ctx0, + KQ, + ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head)) + ); + + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled); + + struct ggml_tensor * V = + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + Vcur, + n_state/n_head, n_head, n_ctx), + 1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, wctx.itype, n_ctx, n_state/n_head, n_head) + ); + + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max); +#endif + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_cpy(ctx0, - Kcur, - ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)), - 0, 2, 1, 3); + wstate.use_buf(ctx0, 1); - struct ggml_tensor * V = - ggml_cpy(ctx0, - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - Vcur, - n_state/n_head, n_head, n_ctx), - 1, 2, 0, 3), - ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head)); - - struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, false); -#else - struct ggml_tensor * Q = - ggml_permute(ctx0, - ggml_cpy(ctx0, - Qcur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_state/n_head, n_head, n_ctx)), - 0, 2, 1, 3); + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx)); + } - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_cpy(ctx0, - Kcur, - ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)), - 0, 2, 1, 3); + // projection + { + wstate.use_buf(ctx0, 0); - // K * Q - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + cur = ggml_mul_mat(ctx0, + layer.attn_ln_1_w, + cur); - struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, - KQ, - ggml_new_f32(ctx0, 1.0f/sqrt(float(n_state)/n_head)) - ); + wstate.use_buf(ctx0, 1); - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled); + cur = ggml_add(ctx0, + ggml_repeat(ctx0, layer.attn_ln_1_b, cur), + cur); + } - //struct ggml_tensor * V_trans = - // ggml_permute(ctx0, - // ggml_cpy(ctx0, - // Vcur, - // ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)), - // 1, 2, 0, 3); + wstate.use_buf(ctx0, 2); - //struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + // add the input + cur = ggml_add(ctx0, cur, inpL); - struct ggml_tensor * V = - ggml_cpy(ctx0, - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - Vcur, - n_state/n_head, n_head, n_ctx), - 0, 2, 1, 3), - ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_ctx, n_head) - ); - - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, ggml_transpose(ctx0, V), KQ_soft_max); -#endif - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + struct ggml_tensor * inpFF = cur; - wstate.use_buf(ctx0, 1); + // feed-forward network + { + // norm + { + wstate.use_buf(ctx0, 0); - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx)); - } + cur = ggml_norm(ctx0, inpFF); - // projection - { - wstate.use_buf(ctx0, 0); + wstate.use_buf(ctx0, 1); - cur = ggml_mul_mat(ctx0, - layer.attn_ln_1_w, - cur); + // cur = mlp_ln_w*cur + mlp_ln_b + cur = ggml_add(ctx0, + ggml_mul(ctx0, + ggml_repeat(ctx0, layer.mlp_ln_w, cur), + cur), + ggml_repeat(ctx0, layer.mlp_ln_b, cur)); + } - wstate.use_buf(ctx0, 1); +#ifdef WHISPER_USE_FLASH_FF + wstate.use_buf(ctx0, 0); - cur = ggml_add(ctx0, - ggml_repeat(ctx0, layer.attn_ln_1_b, cur), - cur); - } + cur = ggml_flash_ff(ctx0, + ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.itype, n_state, n_ctx)), + layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b); +#else + wstate.use_buf(ctx0, 0); - wstate.use_buf(ctx0, 2); + // fully connected + cur = ggml_mul_mat(ctx0, + layer.mlp_0_w, + cur); - // add the input - cur = ggml_add(ctx0, cur, inpL); + wstate.use_buf(ctx0, 1); - struct ggml_tensor * inpFF = cur; + cur = ggml_add(ctx0, + ggml_repeat(ctx0, layer.mlp_0_b, cur), + cur); - // feed-forward network - { - // norm - { wstate.use_buf(ctx0, 0); - cur = ggml_norm(ctx0, inpFF); + // GELU activation + cur = ggml_gelu(ctx0, cur); wstate.use_buf(ctx0, 1); - // cur = mlp_ln_w*cur + mlp_ln_b - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, layer.mlp_ln_w, cur), - cur), - ggml_repeat(ctx0, layer.mlp_ln_b, cur)); - } + // projection + cur = ggml_mul_mat(ctx0, + layer.mlp_1_w, + cur); -#ifdef WHISPER_USE_FLASH_FF - wstate.use_buf(ctx0, 0); + wstate.use_buf(ctx0, 0); - cur = ggml_flash_ff(ctx0, - ggml_cpy(ctx0, cur, ggml_new_tensor_2d(ctx0, wstate.wtype, n_state, n_ctx)), - layer.mlp_0_w, layer.mlp_0_b, layer.mlp_1_w, layer.mlp_1_b); -#else - wstate.use_buf(ctx0, 0); + cur = ggml_add(ctx0, + ggml_repeat(ctx0, layer.mlp_1_b, cur), + cur); +#endif + } - // fully connected - cur = ggml_mul_mat(ctx0, - layer.mlp_0_w, - cur); + wstate.use_buf(ctx0, 3); - wstate.use_buf(ctx0, 1); + inpL = ggml_add(ctx0, cur, inpFF); + } - cur = ggml_add(ctx0, - ggml_repeat(ctx0, layer.mlp_0_b, cur), - cur); + cur = inpL; + // norm + { wstate.use_buf(ctx0, 0); - // GELU activation - cur = ggml_gelu(ctx0, cur); + cur = ggml_norm(ctx0, cur); wstate.use_buf(ctx0, 1); - // projection - cur = ggml_mul_mat(ctx0, - layer.mlp_1_w, - cur); - - wstate.use_buf(ctx0, 0); - + // cur = ln_f_g*cur + ln_f_b cur = ggml_add(ctx0, - ggml_repeat(ctx0, layer.mlp_1_b, cur), - cur); -#endif + ggml_mul(ctx0, + ggml_repeat(ctx0, model.e_ln_w, cur), + cur), + ggml_repeat(ctx0, model.e_ln_b, cur)); } - wstate.use_buf(ctx0, 3); + wstate.use_buf(ctx0, -1); - inpL = ggml_add(ctx0, cur, inpFF); - } - - cur = inpL; - - // norm - { - wstate.use_buf(ctx0, 0); - - cur = ggml_norm(ctx0, cur); + // run the computation + { + struct ggml_cgraph gf = {}; + gf.n_threads = n_threads; - wstate.use_buf(ctx0, 1); + ggml_build_forward_expand(&gf, cur); + ggml_graph_compute(ctx0, &gf); - // cur = ln_f_g*cur + ln_f_b - cur = ggml_add(ctx0, - ggml_mul(ctx0, - ggml_repeat(ctx0, model.e_ln_w, cur), - cur), - ggml_repeat(ctx0, model.e_ln_b, cur)); + //ggml_graph_print(&gf); + } } - - wstate.use_buf(ctx0, -1); - - // run the computation +#ifdef WHISPER_USE_COREML + else { - struct ggml_cgraph gf = {}; - gf.n_threads = n_threads; + wstate.use_buf(ctx0, -1); - ggml_build_forward_expand(&gf, cur); - ggml_graph_compute(ctx0, &gf); + cur = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_state, n_ctx); - //ggml_graph_print(&gf); + whisper_coreml_encode(wstate.ctx_coreml, (float *) mel->data, (float *) cur->data); } +#endif // cur //{ @@ -2286,6 +2365,68 @@ static void fft(const std::vector & in, std::vector & out) { } } +static void log_mel_spectrogram_worker_thread(int ith, const std::vector &hann, const float *samples, + int n_samples, int fft_size, int fft_step, int n_threads, + const whisper_filters &filters, bool speed_up, whisper_mel &mel) { + std::vector fft_in(fft_size, 0.0); + std::vector fft_out(2 * fft_size); + int n_fft = 1 + (speed_up ? fft_size / 4 : fft_size / 2); + + for (int i = ith; i < mel.n_len; i += n_threads) { + const int offset = i * fft_step; + + // apply Hanning window + for (int j = 0; j < fft_size; j++) { + if (offset + j < n_samples) { + fft_in[j] = hann[j] * samples[offset + j]; + } else { + fft_in[j] = 0.0; + } + } + + // FFT -> mag^2 + fft(fft_in, fft_out); + + for (int j = 0; j < fft_size; j++) { + fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]); + } + for (int j = 1; j < fft_size / 2; j++) { + fft_out[j] += fft_out[fft_size - j]; + } + + if (speed_up) { + // scale down in the frequency domain results in a speed up in the time domain + for (int j = 0; j < n_fft; j++) { + fft_out[j] = 0.5 * (fft_out[2 * j] + fft_out[2 * j + 1]); + } + } + + // mel spectrogram + for (int j = 0; j < mel.n_mel; j++) { + double sum = 0.0; + + // unroll loop (suggested by GH user @lunixbochs) + int k = 0; + for (k = 0; k < n_fft - 3; k += 4) { + sum += + fft_out[k + 0] * filters.data[j*n_fft + k + 0] + + fft_out[k + 1] * filters.data[j*n_fft + k + 1] + + fft_out[k + 2] * filters.data[j*n_fft + k + 2] + + fft_out[k + 3] * filters.data[j*n_fft + k + 3]; + } + + // handle n_fft remainder + for (; k < n_fft; k++) { + sum += fft_out[k] * filters.data[j * n_fft + k]; + } + + sum = log10(std::max(sum, 1e-10)); + + mel.data[j * mel.n_len + i] = sum; + } + } +} + // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L92-L124 static bool log_mel_spectrogram( whisper_state & wstate, @@ -2308,85 +2449,48 @@ static bool log_mel_spectrogram( hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size))); } - mel.n_mel = n_mel; - mel.n_len = (n_samples)/fft_step; - mel.data.resize(mel.n_mel*mel.n_len); - - const int n_fft = 1 + (speed_up ? fft_size/4 : fft_size/2); - - //printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len); - //printf("%s: recording length: %f s\n", __func__, (float) n_samples/sample_rate); - - std::vector workers(n_threads); - for (int iw = 0; iw < n_threads; ++iw) { - workers[iw] = std::thread([&](int ith) { - std::vector fft_in; - fft_in.resize(fft_size); - for (int i = 0; i < fft_size; i++) { - fft_in[i] = 0.0; - } - - std::vector fft_out; - fft_out.resize(2*fft_size); + mel.n_mel = n_mel; + mel.n_len = n_samples/fft_step; + mel.n_len_org = mel.n_len; - for (int i = ith; i < mel.n_len; i += n_threads) { - const int offset = i*fft_step; + std::vector samples_padded; - // apply Hanning window - for (int j = 0; j < fft_size; j++) { - if (offset + j < n_samples) { - fft_in[j] = hann[j]*samples[offset + j]; - } else { - fft_in[j] = 0.0; - } - } + // pad audio with at least one extra chunk of zeros + { + const int pad = (100*WHISPER_CHUNK_SIZE)/2; - // FFT -> mag^2 - fft(fft_in, fft_out); + if (mel.n_len % pad != 0) { + mel.n_len = (mel.n_len/pad + 1)*pad; + } + mel.n_len += pad; - for (int j = 0; j < fft_size; j++) { - fft_out[j] = (fft_out[2*j + 0]*fft_out[2*j + 0] + fft_out[2*j + 1]*fft_out[2*j + 1]); - } - for (int j = 1; j < fft_size/2; j++) { - //if (i == 0) { - // printf("%d: %f %f\n", j, fft_out[j], fft_out[fft_size - j]); - //} - fft_out[j] += fft_out[fft_size - j]; - } - if (i == 0) { - //for (int j = 0; j < fft_size; j++) { - // printf("%d: %e\n", j, fft_out[j]); - //} - } + samples_padded.resize(mel.n_len*fft_step); + memcpy(samples_padded.data(), samples, n_samples*sizeof(float)); + memset(samples_padded.data() + n_samples, 0, (mel.n_len*fft_step - n_samples)*sizeof(float)); - if (speed_up) { - // scale down in the frequency domain results in a speed up in the time domain - for (int j = 0; j < n_fft; j++) { - fft_out[j] = 0.5*(fft_out[2*j] + fft_out[2*j + 1]); - } - } + samples = samples_padded.data(); + } - // mel spectrogram - for (int j = 0; j < mel.n_mel; j++) { - double sum = 0.0; + mel.data.resize(mel.n_mel*mel.n_len); - for (int k = 0; k < n_fft; k++) { - sum += fft_out[k]*filters.data[j*n_fft + k]; - } - if (sum < 1e-10) { - sum = 1e-10; - } + //printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len); + //printf("%s: recording length: %f s\n", __func__, (float) n_samples/sample_rate); - sum = log10(sum); + { + std::vector workers(n_threads - 1); + for (int iw = 0; iw < n_threads - 1; ++iw) { + workers[iw] = std::thread( + log_mel_spectrogram_worker_thread, iw + 1, std::cref(hann), samples, + n_samples, fft_size, fft_step, n_threads, + std::cref(filters), speed_up, std::ref(mel)); + } - mel.data[j*mel.n_len + i] = sum; - } - } - }, iw); - } + // main thread + log_mel_spectrogram_worker_thread(0, hann, samples, n_samples, fft_size, fft_step, n_threads, filters, speed_up, mel); - for (int iw = 0; iw < n_threads; ++iw) { - workers[iw].join(); + for (int iw = 0; iw < n_threads - 1; ++iw) { + workers[iw].join(); + } } // clamping and normalization @@ -2410,6 +2514,8 @@ static bool log_mel_spectrogram( wstate.t_mel_us += ggml_time_us() - t_start_us; + //printf("mel.n_len() = %d, divided by 1500: %f, n_samples / fft_step: %d\n", mel.n_len, mel.n_len / 1500.0, n_samples / fft_step); + return true; } @@ -2451,25 +2557,20 @@ static std::vector tokenize(const whisper_vocab & vocab, cons int n = word.size(); while (i < n) { int j = n; + bool found = false; while (j > i) { - auto it = vocab.token_to_id.find(word.substr(i, j-i)); + auto sub = word.substr(i, j-i); + auto it = vocab.token_to_id.find(sub); if (it != vocab.token_to_id.end()) { tokens.push_back(it->second); i = j; + found = true; break; } --j; } - if (i == n) { - break; - } - if (j == i) { - auto sub = word.substr(i, 1); - if (vocab.token_to_id.find(sub) != vocab.token_to_id.end()) { - tokens.push_back(vocab.token_to_id.at(sub)); - } else { - fprintf(stderr, "%s: unknown token '%s'\n", __func__, sub.data()); - } + if (!found) { + fprintf(stderr, "unknown token \n"); ++i; } } @@ -2482,13 +2583,28 @@ static std::vector tokenize(const whisper_vocab & vocab, cons // interface implementation // +#ifdef WHISPER_USE_COREML +// replace .bin with -encoder.mlmodelc +static std::string whisper_get_coreml_path_encoder(std::string path_bin) { + auto pos = path_bin.rfind('.'); + if (pos != std::string::npos) { + path_bin = path_bin.substr(0, pos); + } + + path_bin += "-encoder.mlmodelc"; + + return path_bin; +} +#endif + struct whisper_state * whisper_init_state(whisper_context * ctx) { whisper_state * state = new whisper_state; - const size_t scale = ctx->model.hparams.f16 ? 1 : 2; + const size_t scale = ctx->model.hparams.ftype ? 1 : 2; - if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->wtype, ctx->model.hparams.n_text_ctx)) { + if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->itype, ctx->model.hparams.n_text_ctx)) { fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__); + delete state; return nullptr; } @@ -2497,8 +2613,9 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } - if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->wtype, ctx->model.hparams.n_audio_ctx)) { + if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->itype, ctx->model.hparams.n_audio_ctx)) { fprintf(stderr, "%s: kv_cache_init() failed for cross-attention cache\n", __func__); + delete state; return nullptr; } @@ -2507,6 +2624,23 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { fprintf(stderr, "%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0); } +#ifdef WHISPER_USE_COREML + const auto path_coreml = whisper_get_coreml_path_encoder(ctx->path_model); + + fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str()); + fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__); + + state->ctx_coreml = whisper_coreml_init(path_coreml.c_str()); + if (!state->ctx_coreml) { + fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str()); +#ifndef WHISPER_COREML_ALLOW_FALLBACK + return nullptr; +#endif + } else { + fprintf(stderr, "%s: Core ML model loaded\n", __func__); + } +#endif + state->logits.reserve(ctx->vocab.n_vocab * ctx->model.hparams.n_text_ctx); state->logits_id.reserve(ctx->model.hparams.n_vocab); @@ -2530,7 +2664,6 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { } struct whisper_context * whisper_init_from_file_no_state(const char * path_model) { - whisper_model_loader loader = {}; fprintf(stderr, "%s: loading model from '%s'\n", __func__, path_model); @@ -2540,7 +2673,10 @@ struct whisper_context * whisper_init_from_file_no_state(const char * path_model return nullptr; } + whisper_model_loader loader = {}; + loader.context = &fin; + loader.read = [](void * ctx, void * output, size_t read_size) { std::ifstream * fin = (std::ifstream*)ctx; fin->read((char *)output, read_size); @@ -2574,10 +2710,11 @@ struct whisper_context * whisper_init_from_buffer_no_state(void * buffer, size_t }; buf_context ctx = { reinterpret_cast(buffer), buffer_size, 0 }; - whisper_model_loader loader = {}; fprintf(stderr, "%s: loading model from buffer\n", __func__); + whisper_model_loader loader = {}; + loader.context = &ctx; loader.read = [](void * ctx, void * output, size_t read_size) { @@ -2673,6 +2810,13 @@ void whisper_free_state(struct whisper_state * state) kv_cache_free(state->decoders[i].kv_self); } +#ifdef WHISPER_USE_COREML + if (state->ctx_coreml != nullptr) { + whisper_coreml_free(state->ctx_coreml); + state->ctx_coreml = nullptr; + } +#endif + delete state; } } @@ -2731,8 +2875,9 @@ int whisper_set_mel_with_state( return -1; } - state->mel.n_len = n_len; - state->mel.n_mel = n_mel; + state->mel.n_len = n_len; + state->mel.n_len_org = n_len; + state->mel.n_mel = n_mel; state->mel.data.resize(n_len*n_mel); memcpy(state->mel.data.data(), data, n_len*n_mel*sizeof(float)); @@ -2830,7 +2975,6 @@ int whisper_lang_id(const char * lang) { fprintf(stderr, "%s: unknown language '%s'\n", __func__, lang); return -1; } - return g_lang.at(lang).first; } @@ -2858,8 +3002,8 @@ int whisper_lang_auto_detect_with_state( return -1; } - if (seek >= state->mel.n_len) { - fprintf(stderr, "%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len*10); + if (seek >= state->mel.n_len_org) { + fprintf(stderr, "%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len_org*10); return -2; } @@ -2968,8 +3112,8 @@ int whisper_model_n_mels(struct whisper_context * ctx) { return ctx->model.hparams.n_mels; } -int whisper_model_f16(struct whisper_context * ctx) { - return ctx->model.hparams.f16; +int whisper_model_ftype(struct whisper_context * ctx) { + return ctx->model.hparams.ftype; } int whisper_model_type(struct whisper_context * ctx) { @@ -2994,11 +3138,11 @@ const char *whisper_model_type_readable(struct whisper_context * ctx) { } int whisper_n_len_from_state(struct whisper_state * state) { - return state->mel.n_len; + return state->mel.n_len_org; } int whisper_n_len(struct whisper_context * ctx) { - return ctx->state->mel.n_len; + return ctx->state->mel.n_len_org; } int whisper_n_vocab(struct whisper_context * ctx) { @@ -3094,6 +3238,14 @@ void whisper_reset_timings(struct whisper_context * ctx) { } } +static int whisper_has_coreml(void) { +#ifdef WHISPER_USE_COREML + return 1; +#else + return 0; +#endif +} + const char * whisper_print_system_info(void) { static std::string s; @@ -3110,6 +3262,7 @@ const char * whisper_print_system_info(void) { s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | "; s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | "; s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | "; + s += "COREML = " + std::to_string(whisper_has_coreml()) + " | "; return s.c_str(); } @@ -3156,7 +3309,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str /*.max_initial_ts =*/ 1.0f, /*.length_penalty =*/ -1.0f, - /*.temperature_inc =*/ 0.0f, // TODO: temporary disabled until improve performance + /*.temperature_inc =*/ 0.4f, /*.entropy_thold =*/ 2.4f, /*.logprob_thold =*/ -1.0f, /*.no_speech_thold =*/ 0.6f, @@ -3188,13 +3341,13 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str case WHISPER_SAMPLING_GREEDY: { result.greedy = { - /*.best_of =*/ 1, + /*.best_of =*/ 2, // TODO: increase to 5 when we speed-up batch decoding }; } break; case WHISPER_SAMPLING_BEAM_SEARCH: { result.beam_search = { - /*.beam_size =*/ 5, + /*.beam_size =*/ 2, // TODO: increase to 5 when we speed-up batch decoding /*.patience =*/ -1.0f, }; @@ -3215,15 +3368,15 @@ static void whisper_exp_compute_token_level_timestamps( // trim from start (in place) static inline void ltrim(std::string &s) { - s.erase(s.begin(), std::find_if(s.begin(), s.end(), [](unsigned char ch) { - return !std::isspace(ch); + s.erase(s.begin(), std::find_if_not(s.begin(), s.end(), [](unsigned char ch) { + return std::isspace(ch); })); } // trim from end (in place) static inline void rtrim(std::string &s) { - s.erase(std::find_if(s.rbegin(), s.rend(), [](unsigned char ch) { - return !std::isspace(ch); + s.erase(std::find_if_not(s.rbegin(), s.rend(), [](unsigned char ch) { + return std::isspace(ch); }).base(), s.end()); } @@ -3756,7 +3909,7 @@ int whisper_full_with_state( } const int seek_start = params.offset_ms/10; - const int seek_end = seek_start + (params.duration_ms == 0 ? whisper_n_len_from_state(state) : params.duration_ms/10); + const int seek_end = params.duration_ms == 0 ? whisper_n_len_from_state(state) : seek_start + params.duration_ms/10; // if length of spectrogram is less than 1s (100 samples), then return // basically don't process anything that is less than 1s @@ -3819,22 +3972,26 @@ int whisper_full_with_state( prompt_past.clear(); } - // initial prompt - if (!params.prompt_tokens && params.initial_prompt) { + // prepare prompt + { std::vector prompt_tokens; - prompt_tokens.resize(1024); - prompt_tokens.resize(whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size())); - params.prompt_tokens = prompt_tokens.data(); - params.prompt_n_tokens = prompt_tokens.size(); - } - // prepend the prompt tokens to the prompt_past - if (params.prompt_tokens && params.prompt_n_tokens > 0) { - // parse tokens from the pointer - for (int i = 0; i < params.prompt_n_tokens; i++) { - prompt_past.push_back(params.prompt_tokens[i]); + // initial prompt + if (!params.prompt_tokens && params.initial_prompt) { + prompt_tokens.resize(1024); + prompt_tokens.resize(whisper_tokenize(ctx, params.initial_prompt, prompt_tokens.data(), prompt_tokens.size())); + params.prompt_tokens = prompt_tokens.data(); + params.prompt_n_tokens = prompt_tokens.size(); + } + + // prepend the prompt tokens to the prompt_past + if (params.prompt_tokens && params.prompt_n_tokens > 0) { + // parse tokens from the pointer + for (int i = 0; i < params.prompt_n_tokens; i++) { + prompt_past.push_back(params.prompt_tokens[i]); + } + std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end()); } - std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end()); } // overwrite audio_ctx, max allowed is hparams.n_audio_ctx @@ -4286,7 +4443,11 @@ int whisper_full_with_state( } // was the decoding successful for the current temperature? - { + // do fallback only if: + // - we are not at the last temperature + // - we are not at the end of the audio (3 sec) + if (it != (int) temperatures.size() - 1 && + seek_end - seek > 10*WHISPER_CHUNK_SIZE) { bool success = true; const auto & decoder = state->decoders[best_decoder_id]; @@ -4727,23 +4888,32 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) { // when F16 is used, there is an extra work buffer of size N*N*sizeof(float) std::vector buf(4llu*N_max*N_max*sizeof(float) + 4*256); + // put a bunch of random data in the buffer for (size_t i = 0; i < buf.size(); i++) buf[i] = i; for (int j = 0; j < (int) sizes.size(); j++) { + int n_q4_0 = 0; + int n_q4_1 = 0; int n_fp16 = 0; int n_fp32 = 0; // GFLOPS/s + double s_q4_0 = 0.0; + double s_q4_1 = 0.0; double s_fp16 = 0.0; double s_fp32 = 0.0; const size_t N = sizes[j]; - for (int k = 0; k < 2; ++k) { - const ggml_type wtype = k == 0 ? GGML_TYPE_F16 : GGML_TYPE_F32; + for (int k = 0; k < 4; ++k) { + const ggml_type wtype = + k == 0 ? GGML_TYPE_Q4_0 : + k == 1 ? GGML_TYPE_Q4_1 : + k == 2 ? GGML_TYPE_F16 : + GGML_TYPE_F32; - double & s = k == 0 ? s_fp16 : s_fp32; - int & n = k == 0 ? n_fp16 : n_fp32; + double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_fp16 : s_fp32; + int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_fp16 : n_fp32; struct ggml_init_params gparams = { /*.mem_size =*/ buf.size(), @@ -4787,8 +4957,8 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) { s = ((2.0*N*N*N*n)/tsum)*1e-9; } - snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %5zu x %5zu: F16 %8.1f GFLOPS (%3d runs) / F32 %8.1f GFLOPS (%3d runs)\n", - N, N, s_fp16, n_fp16, s_fp32, n_fp32); + snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) / Q4_1 %7.1f GFLOPS (%3d runs) / F16 %7.1f GFLOPS (%3d runs) / F32 %7.1f GFLOPS (%3d runs)\n", + N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_fp16, n_fp16, s_fp32, n_fp32); s += strbuf; } diff --git a/examples/whisper/whisper.h b/examples/whisper/whisper.h index a96c96c9..3d689a4c 100644 --- a/examples/whisper/whisper.h +++ b/examples/whisper/whisper.h @@ -226,7 +226,7 @@ extern "C" { // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first // Returns the top language id or negative on failure // If not null, fills the lang_probs array with the probabilities of all languages - // The array must be whispe_lang_max_id() + 1 in size + // The array must be whisper_lang_max_id() + 1 in size // ref: https://github.com/openai/whisper/blob/main/whisper/decoding.py#L18-L69 WHISPER_API int whisper_lang_auto_detect( struct whisper_context * ctx, @@ -258,7 +258,7 @@ extern "C" { WHISPER_API int whisper_model_n_text_head (struct whisper_context * ctx); WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx); WHISPER_API int whisper_model_n_mels (struct whisper_context * ctx); - WHISPER_API int whisper_model_f16 (struct whisper_context * ctx); + WHISPER_API int whisper_model_ftype (struct whisper_context * ctx); WHISPER_API int whisper_model_type (struct whisper_context * ctx); // Token logits obtained from the last call to whisper_decode() @@ -297,7 +297,7 @@ extern "C" { // Available sampling strategies enum whisper_sampling_strategy { - WHISPER_SAMPLING_GREEDY, // similar to OpenAI's GreefyDecoder + WHISPER_SAMPLING_GREEDY, // similar to OpenAI's GreedyDecoder WHISPER_SAMPLING_BEAM_SEARCH, // similar to OpenAI's BeamSearchDecoder }; diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h index c1c5495c..cbaea3ed 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -232,6 +232,20 @@ extern "C" { GGML_TYPE_COUNT, }; + // model file types + enum ggml_ftype { + GGML_FTYPE_UNKNOWN = -1, + GGML_FTYPE_ALL_F32 = 0, + GGML_FTYPE_MOSTLY_F16 = 1, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_0 = 2, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1 = 3, // except 1d tensors + GGML_FTYPE_MOSTLY_Q4_1_SOME_F16 = 4, // tok_embeddings.weight and output.weight are F16 + GGML_FTYPE_MOSTLY_Q4_2 = 5, // except 1d tensors + GGML_FTYPE_MOSTLY_Q8_0 = 7, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_0 = 8, // except 1d tensors + GGML_FTYPE_MOSTLY_Q5_1 = 9, // except 1d tensors + }; + // available tensor operations: enum ggml_op { GGML_OP_NONE = 0, @@ -385,6 +399,8 @@ extern "C" { GGML_API bool ggml_is_quantized(enum ggml_type type); + GGML_API enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype); + // main GGML_API struct ggml_context * ggml_init(struct ggml_init_params params); diff --git a/scripts/sync-whisper.sh b/scripts/sync-whisper.sh index 34ddb4af..66858761 100755 --- a/scripts/sync-whisper.sh +++ b/scripts/sync-whisper.sh @@ -1,6 +1,12 @@ #!/bin/bash -cp -rpv ../whisper.cpp/ggml.c src/ggml.c -cp -rpv ../whisper.cpp/ggml-cuda.cu src/ggml-cuda.cu -cp -rpv ../whisper.cpp/ggml-cuda.h src/ggml-cuda.h -cp -rpv ../whisper.cpp/ggml.h include/ggml/ggml.h +cp -rpv ../whisper.cpp/ggml.c src/ggml.c +cp -rpv ../whisper.cpp/ggml-cuda.cu src/ggml-cuda.cu +cp -rpv ../whisper.cpp/ggml-cuda.h src/ggml-cuda.h +cp -rpv ../whisper.cpp/ggml.h include/ggml/ggml.h +cp -rpv ../whisper.cpp/examples/common-ggml.h examples/common-ggml.h +cp -rpv ../whisper.cpp/examples/common-ggml.cpp examples/common-ggml.cpp +cp -rpv ../whisper.cpp/whisper.h examples/whisper/whisper.h +cp -rpv ../whisper.cpp/whisper.cpp examples/whisper/whisper.cpp +cp -rpv ../whisper.cpp/examples/main/main.cpp examples/whisper/main.cpp +cp -rpv ../whisper.cpp/examples/quantize/quantize.cpp examples/whisper/quantize.cpp diff --git a/src/ggml.c b/src/ggml.c index 50685f66..17c03ad4 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -330,7 +330,7 @@ static ggml_fp16_t table_exp_f16[1 << 16]; // precomputed f32 table for f16 (256 KB) static float table_f32_f16[1 << 16]; -#if defined(__ARM_NEON) +#if defined(__ARM_NEON) || defined(__wasm_simd128__) #define B1(c,s,n) 0x ## n ## c , 0x ## n ## s #define B2(c,s,n) B1(c,s,n ## c), B1(c,s,n ## s) #define B3(c,s,n) B2(c,s,n ## c), B2(c,s,n ## s) @@ -1087,7 +1087,7 @@ static void quantize_row_q4_0(const float * restrict x, void * restrict vy, int const v128_t v = wasm_f32x4_mul(srcv[l], wasm_f32x4_splat(id)); const v128_t vf = wasm_f32x4_add(v, wasm_f32x4_splat(8.5f)); const v128_t vi = wasm_i32x4_trunc_sat_f32x4(vf); - const v128_t vc = wasm_i32x4_min_u(vi, wasm_i32x4_splat(15)); + const v128_t vc = wasm_i32x4_min(vi, wasm_i32x4_splat(15)); y[i].qs[2*l + 0] = wasm_i32x4_extract_lane(vc, 0) | (wasm_i32x4_extract_lane(vc, 1) << 4); y[i].qs[2*l + 1] = wasm_i32x4_extract_lane(vc, 2) | (wasm_i32x4_extract_lane(vc, 3) << 4); @@ -3180,6 +3180,72 @@ static void ggml_vec_dot_q5_0_q8_0(const int n, float * restrict s, const void * } *s = vaddvq_f32(sumv); +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); + + uint64_t tmp[4]; + + for (int i = 0; i < nb; ++i) { + const block_q5_0 * restrict x0 = &x[i]; + const block_q8_0 * restrict y0 = &y[i]; + + const v128_t m4b = wasm_i8x16_splat(0x0F); + const v128_t s16b = wasm_i8x16_splat(0x10); + + // extract the 5th bit + uint32_t qh; + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_u[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_u[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_u[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_u[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + // interleave + const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + + // add high bit and sub 16 + const v128_t v0lf = wasm_i8x16_sub(wasm_v128_or(v0lz, qhl), s16b); + const v128_t v0hf = wasm_i8x16_sub(wasm_v128_or(v0hz, qhh), s16b); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + const float x0d = GGML_FP16_TO_FP32(x0->d); + + // dot product + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( + wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d))); + } + + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3); #elif defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); @@ -3311,6 +3377,77 @@ static void ggml_vec_dot_q5_1_q8_1(const int n, float * restrict s, const void * } *s = vaddvq_f32(sumv) + summs; +#elif defined(__wasm_simd128__) + v128_t sumv = wasm_f32x4_splat(0.0f); + + float summs = 0.0f; + + uint64_t tmp[4]; + + for (int i = 0; i < nb; ++i) { + const block_q5_1 * restrict x0 = &x[i]; + const block_q8_1 * restrict y0 = &y[i]; + + summs += GGML_FP16_TO_FP32(x0->m) * (y0->s0 + y0->s1); + + const v128_t m4b = wasm_i8x16_splat(0x0F); + + // extract the 5th bit + uint32_t qh; + memcpy(&qh, x0->qh, sizeof(qh)); + + tmp[0] = table_b2b_u[(qh >> 0) & 0xFF]; + tmp[1] = table_b2b_u[(qh >> 8) & 0xFF]; + tmp[2] = table_b2b_u[(qh >> 16) & 0xFF]; + tmp[3] = table_b2b_u[(qh >> 24) ]; + + const v128_t qhl = wasm_v128_load(tmp + 0); + const v128_t qhh = wasm_v128_load(tmp + 2); + + const v128_t v0 = wasm_v128_load(x0->qs); + + // 4-bit -> 8-bit + const v128_t v0l = wasm_v128_and (v0, m4b); + const v128_t v0h = wasm_u8x16_shr(v0, 4); + + static bool x = true; + + // interleave + const v128_t v0lz = wasm_v8x16_shuffle(v0l, v0h, 0, 16, 1, 17, 2, 18, 3, 19, 4, 20, 5, 21, 6, 22, 7, 23); + const v128_t v0hz = wasm_v8x16_shuffle(v0l, v0h, 8, 24, 9, 25, 10, 26, 11, 27, 12, 28, 13, 29, 14, 30, 15, 31); + + // add high bit + const v128_t v0lf = wasm_v128_or(v0lz, qhl); + const v128_t v0hf = wasm_v128_or(v0hz, qhh); + + // load y + const v128_t v1l = wasm_v128_load(y0->qs); + const v128_t v1h = wasm_v128_load(y0->qs + 16); + + // int8x16 -> int16x8 + const v128_t v0lfl = wasm_i16x8_extend_low_i8x16 (v0lf); + const v128_t v0lfh = wasm_i16x8_extend_high_i8x16(v0lf); + const v128_t v0hfl = wasm_i16x8_extend_low_i8x16 (v0hf); + const v128_t v0hfh = wasm_i16x8_extend_high_i8x16(v0hf); + + const v128_t v1ll = wasm_i16x8_extend_low_i8x16 (v1l); + const v128_t v1lh = wasm_i16x8_extend_high_i8x16(v1l); + const v128_t v1hl = wasm_i16x8_extend_low_i8x16 (v1h); + const v128_t v1hh = wasm_i16x8_extend_high_i8x16(v1h); + + const float x0d = GGML_FP16_TO_FP32(x0->d); + + // dot product + sumv = wasm_f32x4_add(sumv, wasm_f32x4_mul(wasm_f32x4_convert_i32x4( + wasm_i32x4_add( + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0lfl, v1ll), + wasm_i32x4_dot_i16x8(v0lfh, v1lh)), + wasm_i32x4_add(wasm_i32x4_dot_i16x8(v0hfl, v1hl), + wasm_i32x4_dot_i16x8(v0hfh, v1hh)))), wasm_f32x4_splat(x0d*y0->d))); + } + + *s = wasm_f32x4_extract_lane(sumv, 0) + wasm_f32x4_extract_lane(sumv, 1) + + wasm_f32x4_extract_lane(sumv, 2) + wasm_f32x4_extract_lane(sumv, 3) + summs; #elif defined(__AVX2__) // Initialize accumulator with zeros __m256 acc = _mm256_setzero_ps(); @@ -4057,6 +4194,27 @@ bool ggml_is_quantized(enum ggml_type type) { return GGML_IS_QUANTIZED[type]; } +enum ggml_type ggml_ftype_to_ggml_type(enum ggml_ftype ftype) { + enum ggml_type wtype = GGML_TYPE_COUNT; + + switch (ftype) { + case GGML_FTYPE_ALL_F32: wtype = GGML_TYPE_F32; break; + case GGML_FTYPE_MOSTLY_F16: wtype = GGML_TYPE_F16; break; + case GGML_FTYPE_MOSTLY_Q4_0: wtype = GGML_TYPE_Q4_0; break; + case GGML_FTYPE_MOSTLY_Q4_1: wtype = GGML_TYPE_Q4_1; break; + case GGML_FTYPE_MOSTLY_Q4_2: wtype = GGML_TYPE_Q4_2; break; + case GGML_FTYPE_MOSTLY_Q5_0: wtype = GGML_TYPE_Q5_0; break; + case GGML_FTYPE_MOSTLY_Q5_1: wtype = GGML_TYPE_Q5_1; break; + case GGML_FTYPE_MOSTLY_Q8_0: wtype = GGML_TYPE_Q8_0; break; + case GGML_FTYPE_UNKNOWN: wtype = GGML_TYPE_COUNT; break; + case GGML_FTYPE_MOSTLY_Q4_1_SOME_F16: wtype = GGML_TYPE_COUNT; break; + } + + GGML_ASSERT(wtype != GGML_TYPE_COUNT); + + return wtype; +} + static inline bool ggml_is_transposed(const struct ggml_tensor * tensor) { return tensor->nb[0] > tensor->nb[1]; }