From: Georgi Gerganov
Date: Sun, 14 May 2023 07:06:19 +0000 (+0300)
Subject: whisper : sync whisper.cpp minor changes
X-Git-Tag: upstream/0.0.1642~1479
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=fa8665fbe7894e236423b7f67f64252b84d2cccf;p=pkg%2Fggml%2Fsources%2Fggml

whisper : sync whisper.cpp minor changes
---

diff --git a/examples/common.cpp b/examples/common.cpp
index 75c443e2..ba0e9522 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -62,7 +62,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     return true;
 }
 
-void gpt_print_usage(int argc, char ** argv, const gpt_params & params) {
+void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
@@ -208,7 +208,7 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
     return result;
 }
 
-void gpt_vocab::add_special_token(const std::string &token) {
+void gpt_vocab::add_special_token(const std::string & token) {
     special_tokens.push_back(token);
 }
 
@@ -216,7 +216,6 @@ void gpt_vocab::add_special_token(const std::string &token) {
 std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
     std::vector<std::string> words;
 
-    // first split the text into words
     {
         std::string str = text;
 
@@ -225,7 +224,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
         // Generate the subpattern from the special_tokens vector if it's not empty
         if (!vocab.special_tokens.empty()) {
             std::string special_tokens_subpattern;
-            for (const auto &token : vocab.special_tokens) {
+            for (const auto & token : vocab.special_tokens) {
                 if (!special_tokens_subpattern.empty()) {
                     special_tokens_subpattern += "|";
                 }
@@ -515,3 +514,27 @@ bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float
 
     return true;
 }
+
+float similarity(const std::string & s0, const std::string & s1) {
+    const size_t len0 = s0.size() + 1;
+    const size_t len1 = s1.size() + 1;
+
+    std::vector<int> col(len1, 0);
+    std::vector<int> prevCol(len1, 0);
+
+    for (size_t i = 0; i < len1; i++) {
+        prevCol[i] = i;
+    }
+
+    for (size_t i = 0; i < len0; i++) {
+        col[0] = i;
+        for (size_t j = 1; j < len1; j++) {
+            col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (i > 0 && s0[i - 1] == s1[j - 1] ? 0 : 1));
+        }
+        col.swap(prevCol);
+    }
+
+    const float dist = prevCol[len1 - 1];
+
+    return 1.0f - (dist / std::max(s0.size(), s1.size()));
+}
diff --git a/examples/common.h b/examples/common.h
index bd66f09a..29d0792a 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -55,7 +55,7 @@ struct gpt_vocab {
     std::map<id, token> id_to_token;
     std::vector<std::string> special_tokens;
 
-    void add_special_token(const std::string &token);
+    void add_special_token(const std::string & token);
 };
 
 // poor-man's JSON parsing
@@ -121,3 +121,5 @@ bool vad_simple(
         float freq_thold,
         bool  verbose);
 
+// compute similarity between two strings using Levenshtein distance
+float similarity(const std::string & s0, const std::string & s1);
diff --git a/scripts/sync-whisper.sh b/scripts/sync-whisper.sh
index 1878c9e6..df695138 100755
--- a/scripts/sync-whisper.sh
+++ b/scripts/sync-whisper.sh
@@ -6,6 +6,8 @@ cp -rpv ../whisper.cpp/ggml-cuda.cu src/ggml-cuda.cu
 cp -rpv ../whisper.cpp/ggml-opencl.h src/ggml-opencl.h
 cp -rpv ../whisper.cpp/ggml-opencl.c src/ggml-opencl.c
 cp -rpv ../whisper.cpp/ggml.h include/ggml/ggml.h
+cp -rpv ../whisper.cpp/examples/common.h examples/common.h
+cp -rpv ../whisper.cpp/examples/common.cpp examples/common.cpp
 cp -rpv ../whisper.cpp/examples/common-ggml.h examples/common-ggml.h
 cp -rpv ../whisper.cpp/examples/common-ggml.cpp examples/common-ggml.cpp
 cp -rpv ../whisper.cpp/whisper.h examples/whisper/whisper.h
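
For context, a minimal usage sketch of the new similarity() helper (illustrative only, not part of the commit): it scores two strings in [0, 1] from their Levenshtein distance, which is useful for fuzzy-matching a noisy transcription against a list of known phrases. The include of examples/common.h is assumed to provide the declaration added above; the example strings and the command-matching scenario are made up for illustration.

// Hypothetical example: pick the known command closest to a transcription.
#include <cstdio>
#include <string>
#include <vector>

#include "common.h" // declares: float similarity(const std::string &, const std::string &)

int main() {
    const std::string heard = "turn on the lihgt"; // e.g. a noisy transcription
    const std::vector<std::string> commands = { "turn on the light", "turn off the light", "stop" };

    std::string best;
    float best_sim = 0.0f;

    for (const auto & cmd : commands) {
        const float sim = similarity(heard, cmd); // 1.0 = identical, 0.0 = completely different
        if (sim > best_sim) {
            best_sim = sim;
            best     = cmd;
        }
    }

    printf("best match: '%s' (similarity %.2f)\n", best.c_str(), best_sim);

    return 0;
}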