From: Georgi Gerganov Date: Tue, 2 May 2023 18:28:21 +0000 (+0300) Subject: whisper : sync with latest X-Git-Tag: upstream/0.0.1642~1497 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=b9f8a5345320f4a553f3114efba37d28e205c454;p=pkg%2Fggml%2Fsources%2Fggml whisper : sync with latest --- diff --git a/examples/common-ggml.cpp b/examples/common-ggml.cpp index 226f2b14..141c6a2c 100644 --- a/examples/common-ggml.cpp +++ b/examples/common-ggml.cpp @@ -90,7 +90,7 @@ bool ggml_common_quantize_0( } int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; + int32_t ne[4] = { 1, 1, 1, 1 }; for (int i = 0; i < n_dims; ++i) { finp.read (reinterpret_cast<char *>(&ne[i]), sizeof(ne[i])); nelements *= ne[i]; @@ -99,7 +99,7 @@ bool ggml_common_quantize_0( std::string name(length, 0); finp.read (&name[0], length); - printf("%64s - [%5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ggml_type_name((ggml_type) ttype)); + printf("%64s - [%5d, %5d, %5d], type = %6s ", name.data(), ne[0], ne[1], ne[2], ggml_type_name((ggml_type) ttype)); bool quantize = false; @@ -204,11 +204,11 @@ bool ggml_common_quantize_0( total_size_new += cur_size; printf("size = %8.2f MB -> %8.2f MB | hist: ", nelements * sizeof(float)/1024.0/1024.0, cur_size/1024.0/1024.0); - for (int i = 0; i < hist_cur.size(); ++i) { + for (int i = 0; i < (int) hist_cur.size(); ++i) { hist_all[i] += hist_cur[i]; } - for (int i = 0; i < hist_cur.size(); ++i) { + for (int i = 0; i < (int) hist_cur.size(); ++i) { printf("%5.3f ", hist_cur[i] / (float)nelements); } printf("\n"); @@ -226,12 +226,12 @@ bool ggml_common_quantize_0( { int64_t sum_all = 0; - for (int i = 0; i < hist_all.size(); ++i) { + for (int i = 0; i < (int) hist_all.size(); ++i) { sum_all += hist_all[i]; } printf("%s: hist: ", __func__); - for (int i = 0; i < hist_all.size(); ++i) { + for (int i = 0; i < (int) hist_all.size(); ++i) { printf("%5.3f ", hist_all[i] / (float)sum_all); } printf("\n"); diff --git a/examples/whisper/main.cpp b/examples/whisper/main.cpp index 
3e8c5aaa..c6bf32ed 100644 --- a/examples/whisper/main.cpp +++ b/examples/whisper/main.cpp @@ -66,6 +66,7 @@ struct whisper_params { bool speed_up = false; bool translate = false; + bool detect_language= false; bool diarize = false; bool split_on_word = false; bool no_fallback = false; @@ -141,6 +142,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) { else if (arg == "-pp" || arg == "--print-progress") { params.print_progress = true; } else if (arg == "-nt" || arg == "--no-timestamps") { params.no_timestamps = true; } else if (arg == "-l" || arg == "--language") { params.language = argv[++i]; } + else if (arg == "-dl" || arg == "--detect-language"){ params.detect_language= true; } else if ( arg == "--prompt") { params.prompt = argv[++i]; } else if (arg == "-m" || arg == "--model") { params.model = argv[++i]; } else if (arg == "-f" || arg == "--file") { params.fname_inp.emplace_back(argv[++i]); } @@ -191,6 +193,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para fprintf(stderr, " -pp, --print-progress [%-7s] print progress\n", params.print_progress ? "true" : "false"); fprintf(stderr, " -nt, --no-timestamps [%-7s] do not print timestamps\n", params.no_timestamps ? "false" : "true"); fprintf(stderr, " -l LANG, --language LANG [%-7s] spoken language ('auto' for auto-detect)\n", params.language.c_str()); + fprintf(stderr, " -dl, --detect-language [%-7s] exit after automatically detecting language\n", params.detect_language ? 
"true" : "false"); fprintf(stderr, " --prompt PROMPT [%-7s] initial prompt\n", params.prompt.c_str()); fprintf(stderr, " -m FNAME, --model FNAME [%-7s] model path\n", params.model.c_str()); fprintf(stderr, " -f FNAME, --file FNAME [%-7s] input WAV file path\n", ""); @@ -739,6 +742,9 @@ int main(int argc, char ** argv) { fprintf(stderr, "%s: WARNING: model is not multilingual, ignoring language and translation options\n", __func__); } } + if (params.detect_language) { + params.language = "auto"; + } fprintf(stderr, "%s: processing '%s' (%d samples, %.1f sec), %d threads, %d processors, lang = %s, task = %s, timestamps = %d ...\n", __func__, fname_inp.c_str(), int(pcmf32.size()), float(pcmf32.size())/WHISPER_SAMPLE_RATE, params.n_threads, params.n_processors, @@ -761,6 +767,7 @@ int main(int argc, char ** argv) { wparams.print_special = params.print_special; wparams.translate = params.translate; wparams.language = params.language.c_str(); + wparams.detect_language = params.detect_language; wparams.n_threads = params.n_threads; wparams.n_max_text_ctx = params.max_context >= 0 ? 
params.max_context : wparams.n_max_text_ctx; wparams.offset_ms = params.offset_t_ms; diff --git a/examples/whisper/whisper.cpp b/examples/whisper/whisper.cpp index 4f83dcd8..158aa0b9 100644 --- a/examples/whisper/whisper.cpp +++ b/examples/whisper/whisper.cpp @@ -284,11 +284,11 @@ static const std::map> MEM_REQ_MODEL = { }, { GGML_TYPE_Q4_1, { - { MODEL_TINY, 31ull*MB }, - { MODEL_BASE, 57ull*MB }, - { MODEL_SMALL, 181ull*MB }, - { MODEL_MEDIUM, 559ull*MB }, - { MODEL_LARGE, 1122ull*MB }, + { MODEL_TINY, 32ull*MB }, + { MODEL_BASE, 58ull*MB }, + { MODEL_SMALL, 182ull*MB }, + { MODEL_MEDIUM, 562ull*MB }, + { MODEL_LARGE, 1124ull*MB }, }, }, { GGML_TYPE_Q4_2, @@ -300,22 +300,31 @@ static const std::map> MEM_REQ_MODEL = { { MODEL_LARGE, 940ull*MB }, }, }, - { GGML_TYPE_Q5_0, // TODO: fix + { GGML_TYPE_Q5_0, { - { MODEL_TINY, 31ull*MB }, - { MODEL_BASE, 57ull*MB }, - { MODEL_SMALL, 181ull*MB }, - { MODEL_MEDIUM, 559ull*MB }, - { MODEL_LARGE, 1122ull*MB }, + { MODEL_TINY, 30ull*MB }, + { MODEL_BASE, 54ull*MB }, + { MODEL_SMALL, 170ull*MB }, + { MODEL_MEDIUM, 516ull*MB }, + { MODEL_LARGE, 1034ull*MB }, }, }, { GGML_TYPE_Q5_1, { - { MODEL_TINY, 31ull*MB }, - { MODEL_BASE, 57ull*MB }, - { MODEL_SMALL, 181ull*MB }, - { MODEL_MEDIUM, 559ull*MB }, - { MODEL_LARGE, 1122ull*MB }, + { MODEL_TINY, 32ull*MB }, + { MODEL_BASE, 58ull*MB }, + { MODEL_SMALL, 182ull*MB }, + { MODEL_MEDIUM, 562ull*MB }, + { MODEL_LARGE, 1124ull*MB }, + }, + }, + { GGML_TYPE_Q8_0, + { + { MODEL_TINY, 45ull*MB }, + { MODEL_BASE, 84ull*MB }, + { MODEL_SMALL, 268ull*MB }, + { MODEL_MEDIUM, 834ull*MB }, + { MODEL_LARGE, 1674ull*MB }, }, }, }; @@ -1333,7 +1342,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con } int32_t nelements = 1; - int32_t ne[3] = { 1, 1, 1 }; + int32_t ne[4] = { 1, 1, 1, 1 }; for (int i = 0; i < n_dims; ++i) { read_safe(loader, ne[i]); nelements *= ne[i]; @@ -1352,6 +1361,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, 
whisper_con auto tensor = model.tensors[name.data()]; if (ggml_nelements(tensor) != nelements) { fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data()); + fprintf(stderr, "%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n", + __func__, ne[0], ne[1], ne[2], (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2]); return false; } @@ -3301,6 +3312,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str /*.prompt_n_tokens =*/ 0, /*.language =*/ "en", + /*.detect_language =*/ false, /*.suppress_blank =*/ true, /*.suppress_non_speech_tokens =*/ false, @@ -3887,7 +3899,7 @@ int whisper_full_with_state( } // auto-detect language if not specified - if (params.language == nullptr || strlen(params.language) == 0 || strcmp(params.language, "auto") == 0) { + if (params.language == nullptr || strlen(params.language) == 0 || strcmp(params.language, "auto") == 0 || params.detect_language) { std::vector<float> probs(whisper_lang_max_id() + 1, 0.0f); const auto lang_id = whisper_lang_auto_detect_with_state(ctx, state, 0, params.n_threads, probs.data()); @@ -3899,6 +3911,9 @@ int whisper_full_with_state( params.language = whisper_lang_str(lang_id); fprintf(stderr, "%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]); + if (params.detect_language) { + return 0; + } } if (params.token_timestamps) { @@ -4816,49 +4831,51 @@ WHISPER_API const char * whisper_bench_memcpy_str(int n_threads) { ggml_time_init(); - size_t n = 50; - size_t arr = n_threads > 0 ? 1024 : n_threads; // trick to avoid compiler optimizations + size_t n = 20; + size_t arr = n_threads > 0 ? 
1024llu : n_threads; // trick to avoid compiler optimizations - // 1 GB array + // 1GB MB array const size_t size = arr*1024llu*1024llu; - char * src = (char *) malloc(size); - char * dst = (char *) malloc(size); + // single-thread + { + char * src = (char *) malloc(size); + char * dst = (char *) malloc(size); - for (size_t i = 0; i < size; i++) src[i] = i; + for (size_t i = 0; i < size; i++) src[i] = i; - memcpy(dst, src, size); // heat-up + memcpy(dst, src, size); // heat-up - double tsum = 0.0; + double tsum = 0.0; + double sum = 0.0; - for (size_t i = 0; i < n; i++) { - const int64_t t0 = ggml_time_us(); + for (size_t i = 0; i < n; i++) { + const int64_t t0 = ggml_time_us(); - memcpy(dst, src, size); + memcpy(dst, src, size); - const int64_t t1 = ggml_time_us(); + const int64_t t1 = ggml_time_us(); - tsum += (t1 - t0)*1e-6; + tsum += (t1 - t0)*1e-6; - src[0] = rand(); - } + src[rand() % size] = rand() % 256; + } - snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu)); - s += strbuf; + snprintf(strbuf, sizeof(strbuf), "memcpy: %.2f GB/s (1 thread)\n", (double) (n*size)/(tsum*1024llu*1024llu*1024llu)); + s += strbuf; - // needed to prevent the compile from optimizing the memcpy away - { - double sum = 0.0; + // needed to prevent the compiler from optimizing the memcpy away + { + for (size_t i = 0; i < size; i++) sum += dst[i]; - for (size_t i = 0; i < size; i++) sum += dst[i]; + snprintf(strbuf, sizeof(strbuf), "sum: %f\n", sum); + s += strbuf; + } - snprintf(strbuf, sizeof(strbuf), "sum: %s %f\n", sum == -536870910.00 ? 
"ok" : "error", sum); - s += strbuf; + free(src); + free(dst); } - free(src); - free(dst); - return s.c_str(); } @@ -4894,26 +4911,37 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) { for (int j = 0; j < (int) sizes.size(); j++) { int n_q4_0 = 0; int n_q4_1 = 0; + int n_q4_2 = 0; + int n_q5_0 = 0; + int n_q5_1 = 0; + int n_q8_0 = 0; int n_fp16 = 0; int n_fp32 = 0; // GFLOPS/s double s_q4_0 = 0.0; double s_q4_1 = 0.0; + double s_q4_2 = 0.0; + double s_q5_0 = 0.0; + double s_q5_1 = 0.0; + double s_q8_0 = 0.0; double s_fp16 = 0.0; double s_fp32 = 0.0; const size_t N = sizes[j]; - for (int k = 0; k < 4; ++k) { + for (int k = 0; k < 8; ++k) { const ggml_type wtype = k == 0 ? GGML_TYPE_Q4_0 : k == 1 ? GGML_TYPE_Q4_1 : - k == 2 ? GGML_TYPE_F16 : - GGML_TYPE_F32; + k == 2 ? GGML_TYPE_Q4_2 : + k == 3 ? GGML_TYPE_Q5_0 : + k == 4 ? GGML_TYPE_Q5_1 : + k == 5 ? GGML_TYPE_Q8_0 : + k == 6 ? GGML_TYPE_F16 : GGML_TYPE_F32; - double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_fp16 : s_fp32; - int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_fp16 : n_fp32; + double & s = k == 0 ? s_q4_0 : k == 1 ? s_q4_1 : k == 2 ? s_q4_2 : k == 3 ? s_q5_0 : k == 4 ? s_q5_1 : k == 5 ? s_q8_0 : k == 6 ? s_fp16 : /*k == 7*/ s_fp32; + int & n = k == 0 ? n_q4_0 : k == 1 ? n_q4_1 : k == 2 ? n_q4_2 : k == 3 ? n_q5_0 : k == 4 ? n_q5_1 : k == 5 ? n_q8_0 : k == 6 ? 
n_fp16 : /*k == 7*/ n_fp32; struct ggml_init_params gparams = { /*.mem_size =*/ buf.size(), @@ -4957,8 +4985,19 @@ WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) { s = ((2.0*N*N*N*n)/tsum)*1e-9; } - snprintf(strbuf, sizeof(strbuf), "ggml_mul_mat: %4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) / Q4_1 %7.1f GFLOPS (%3d runs) / F16 %7.1f GFLOPS (%3d runs) / F32 %7.1f GFLOPS (%3d runs)\n", - N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_fp16, n_fp16, s_fp32, n_fp32); + // Q4_0 | Q4_1 | Q4_2 + snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q4_0 %7.1f GFLOPS (%3d runs) | Q4_1 %7.1f GFLOPS (%3d runs) | Q4_2 %7.1f GFLOPS (%3d runs)\n", + N, N, s_q4_0, n_q4_0, s_q4_1, n_q4_1, s_q4_2, n_q4_2); + s += strbuf; + + // Q5_0 | Q5_1 | Q8_0 + snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: Q5_0 %7.1f GFLOPS (%3d runs) | Q5_1 %7.1f GFLOPS (%3d runs) | Q8_0 %7.1f GFLOPS (%3d runs)\n", + N, N, s_q5_0, n_q5_0, s_q5_1, n_q5_1, s_q8_0, n_q8_0); + s += strbuf; + + // F16 | F32 + snprintf(strbuf, sizeof(strbuf), "%4zu x %4zu: F16 %7.1f GFLOPS (%3d runs) | F32 %7.1f GFLOPS (%3d runs)\n", + N, N, s_fp16, n_fp16, s_fp32, n_fp32); s += strbuf; } diff --git a/examples/whisper/whisper.h b/examples/whisper/whisper.h index 3d689a4c..2d5b3eb9 100644 --- a/examples/whisper/whisper.h +++ b/examples/whisper/whisper.h @@ -365,6 +365,7 @@ extern "C" { // for auto-detection, set to nullptr, "" or "auto" const char * language; + bool detect_language; // common decoding parameters: bool suppress_blank; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89