git.djapps.eu Git - pkg/ggml/sources/whisper.cpp/commitdiff
Adding sanitizer tests
author Georgi Gerganov <redacted>
Sat, 8 Oct 2022 07:56:59 +0000 (10:56 +0300)
committer Georgi Gerganov <redacted>
Sat, 8 Oct 2022 08:43:42 +0000 (11:43 +0300)
.github/workflows/build.yml
.gitignore
tests/CMakeLists.txt
whisper.cpp
whisper.h

index effa8db3226b058c4c587e8165eacd18e2a015ce..f1b63272d089397394e76e7d3e7de364879d8799 100644 (file)
@@ -61,7 +61,7 @@ jobs:
             - name: Build
               run: |
                 make
-                ctest --output-on-failure
+                ctest -L gh --output-on-failure
 
     ubuntu-latest-clang:
         runs-on: ubuntu-latest
@@ -87,7 +87,7 @@ jobs:
             - name: Build
               run: |
                 make
-                ctest --output-on-failure
+                ctest -L gh --output-on-failure
 
     ubuntu-latest-gcc-sanitized:
         runs-on: ubuntu-latest
@@ -112,4 +112,4 @@ jobs:
             - name: Build
               run: |
                 make
-                ctest --output-on-failure
+                ctest -L gh --output-on-failure
index 23b28c178d8d24caf473c02a8a66dfe68058a910..860e0d915e87e5d1fc1441c9c76ff84f8d75d708 100644 (file)
@@ -4,3 +4,4 @@ stream
 *.o
 .cache
 build/
+compile_commands.json
index e69de29bb2d1d6434b8b29ae775ad8c2e48c5391..b51fbbb6b56f9649363c80622be512cda03be93b 100644 (file)
@@ -0,0 +1,62 @@
+set(TEST_TARGET test-main-tiny)
+add_test(NAME ${TEST_TARGET}
+    COMMAND $<TARGET_FILE:main>
+    -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-tiny.bin
+    -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
+set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "tiny;gh")
+
+set(TEST_TARGET test-main-tiny.en)
+add_test(NAME ${TEST_TARGET}
+    COMMAND $<TARGET_FILE:main>
+    -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-tiny.en.bin
+    -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
+set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "tiny;en;gh")
+
+set(TEST_TARGET test-main-base)
+add_test(NAME ${TEST_TARGET}
+    COMMAND $<TARGET_FILE:main>
+    -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-base.bin
+    -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
+set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "base")
+
+set(TEST_TARGET test-main-base.en)
+add_test(NAME ${TEST_TARGET}
+    COMMAND $<TARGET_FILE:main>
+    -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-base.en.bin
+    -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
+set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "base;en")
+
+set(TEST_TARGET test-main-small)
+add_test(NAME ${TEST_TARGET}
+    COMMAND $<TARGET_FILE:main>
+    -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-small.bin
+    -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
+set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "small")
+
+set(TEST_TARGET test-main-small.en)
+add_test(NAME ${TEST_TARGET}
+    COMMAND $<TARGET_FILE:main>
+    -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-small.en.bin
+    -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
+set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "small;en")
+
+set(TEST_TARGET test-main-medium)
+add_test(NAME ${TEST_TARGET}
+    COMMAND $<TARGET_FILE:main>
+    -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-medium.bin
+    -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
+set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "medium")
+
+set(TEST_TARGET test-main-medium.en)
+add_test(NAME ${TEST_TARGET}
+    COMMAND $<TARGET_FILE:main>
+    -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-medium.en.bin
+    -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
+set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "medium;en")
+
+set(TEST_TARGET test-main-large)
+add_test(NAME ${TEST_TARGET}
+    COMMAND $<TARGET_FILE:main>
+    -m ${PROJECT_SOURCE_DIR}/models/for-tests-ggml-large.bin
+    -f ${PROJECT_SOURCE_DIR}/samples/jfk.wav)
+set_tests_properties(${TEST_TARGET} PROPERTIES LABELS "large")
index cb15b986844db941755579e9da8c473e9edbc06e..cdf76beb19f7016ef24df97386b1b8630993b4fa 100644 (file)
@@ -950,6 +950,7 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
 
     // load weights
     {
+        int n_loaded = 0;
         size_t total_size = 0;
 
         while (true) {
@@ -1004,9 +1005,17 @@ bool whisper_model_load(const std::string & fname, whisper_context & wctx) {
 
             //printf("%24s - [%5d, %5d], type = %6s, %6.2f MB\n", name.data(), ne[0], ne[1], ftype == 0 ? "float" : "f16", ggml_nbytes(tensor)/1024.0/1024.0);
             total_size += ggml_nbytes(tensor);
+            n_loaded++;
         }
 
         printf("%s: model size  = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
+
+        if (n_loaded == 0) {
+            printf("%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
+        } else if (n_loaded != model.tensors.size()) {
+            fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), n_loaded);
+            return false;
+        }
     }
 
     fin.close();
@@ -1772,8 +1781,6 @@ bool whisper_decode(
 }
 
 // the most basic sampling scheme - select the top token
-// TODO: beam search
-// TODO: temperature
 whisper_vocab::id whisper_sample_best(
         const whisper_vocab & vocab,
         const float * probs, bool need_timestamp) {
index 79df0e04a26376864ca246bf9ad7b03de5b888db..3f8ddc978e16e91af76af2bb111a7ff08ec9ac3f 100644 (file)
--- a/whisper.h
+++ b/whisper.h
@@ -71,11 +71,12 @@ extern "C" {
     // return the id of the specified language, returns -1 if not found
     WHISPER_API int whisper_lang_id(const char * lang);
 
-    WHISPER_API int     whisper_n_len          (struct whisper_context * ctx); // mel length
-    WHISPER_API int     whisper_n_vocab        (struct whisper_context * ctx);
-    WHISPER_API int     whisper_n_text_ctx     (struct whisper_context * ctx);
-    WHISPER_API int     whisper_is_multilingual(struct whisper_context * ctx);
-    WHISPER_API float * whisper_get_probs      (struct whisper_context * ctx);
+    WHISPER_API int whisper_n_len          (struct whisper_context * ctx); // mel length
+    WHISPER_API int whisper_n_vocab        (struct whisper_context * ctx);
+    WHISPER_API int whisper_n_text_ctx     (struct whisper_context * ctx);
+    WHISPER_API int whisper_is_multilingual(struct whisper_context * ctx);
+
+    WHISPER_API float * whisper_get_probs(struct whisper_context * ctx);
 
     WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);