From: Georgi Gerganov
Date: Sun, 14 May 2023 07:06:19 +0000 (+0300)
Subject: whisper : sync whisper.cpp minor changes
X-Git-Tag: upstream/0.0.1642~1479
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=fa8665fbe7894e236423b7f67f64252b84d2cccf;p=pkg%2Fggml%2Fsources%2Fggml

whisper : sync whisper.cpp minor changes
---

diff --git a/examples/common.cpp b/examples/common.cpp
index 75c443e2..ba0e9522 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -62,7 +62,7 @@ bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
     return true;
 }
 
-void gpt_print_usage(int argc, char ** argv, const gpt_params & params) {
+void gpt_print_usage(int /*argc*/, char ** argv, const gpt_params & params) {
     fprintf(stderr, "usage: %s [options]\n", argv[0]);
     fprintf(stderr, "\n");
     fprintf(stderr, "options:\n");
@@ -208,7 +208,7 @@ std::map<std::string, int32_t> json_parse(const std::string & fname) {
     return result;
 }
 
-void gpt_vocab::add_special_token(const std::string &token) {
+void gpt_vocab::add_special_token(const std::string & token) {
     special_tokens.push_back(token);
 }
 
@@ -216,7 +216,6 @@ void gpt_vocab::add_special_token(const std::string &token) {
 std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
     std::vector<std::string> words;
 
-    // first split the text into words
     {
         std::string str = text;
 
@@ -225,7 +224,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
         // Generate the subpattern from the special_tokens vector if it's not empty
         if (!vocab.special_tokens.empty()) {
             std::string special_tokens_subpattern;
-            for (const auto &token : vocab.special_tokens) {
+            for (const auto & token : vocab.special_tokens) {
                 if (!special_tokens_subpattern.empty()) {
                     special_tokens_subpattern += "|";
                 }
@@ -515,3 +514,27 @@ bool vad_simple(std::vector<float> & pcmf32, int sample_rate, int last_ms, float
 
     return true;
 }
+
+float similarity(const std::string & s0, const std::string & s1) {
+    const size_t len0 = s0.size() + 1;
+    const size_t len1 = s1.size() + 1;
+
+    std::vector<int> col(len1, 0);
+    std::vector<int> prevCol(len1, 0);
+
+    for (size_t i = 0; i < len1; i++) {
+        prevCol[i] = i;
+    }
+
+    for (size_t i = 0; i < len0; i++) {
+        col[0] = i;
+        for (size_t j = 1; j < len1; j++) {
+            col[j] = std::min(std::min(1 + col[j - 1], 1 + prevCol[j]), prevCol[j - 1] + (i > 0 && s0[i - 1] == s1[j - 1] ? 0 : 1));
+        }
+        col.swap(prevCol);
+    }
+
+    const float dist = prevCol[len1 - 1];
+
+    return 1.0f - (dist / std::max(s0.size(), s1.size()));
+}
diff --git a/examples/common.h b/examples/common.h
index bd66f09a..29d0792a 100644
--- a/examples/common.h
+++ b/examples/common.h
@@ -55,7 +55,7 @@ struct gpt_vocab {
     std::map<id, token> id_to_token;
     std::vector<std::string> special_tokens;
 
-    void add_special_token(const std::string &token);
+    void add_special_token(const std::string & token);
 };
 
 // poor-man's JSON parsing
@@ -121,3 +121,5 @@ bool vad_simple(
         float freq_thold,
         bool  verbose);
 
+// compute similarity between two strings using Levenshtein distance
+float similarity(const std::string & s0, const std::string & s1);
diff --git a/scripts/sync-whisper.sh b/scripts/sync-whisper.sh
index 1878c9e6..df695138 100755
--- a/scripts/sync-whisper.sh
+++ b/scripts/sync-whisper.sh
@@ -6,6 +6,8 @@ cp -rpv ../whisper.cpp/ggml-cuda.cu src/ggml-cuda.cu
 cp -rpv ../whisper.cpp/ggml-opencl.h src/ggml-opencl.h
 cp -rpv ../whisper.cpp/ggml-opencl.c src/ggml-opencl.c
 cp -rpv ../whisper.cpp/ggml.h include/ggml/ggml.h
+cp -rpv ../whisper.cpp/examples/common.h examples/common.h
+cp -rpv ../whisper.cpp/examples/common.cpp examples/common.cpp
 cp -rpv ../whisper.cpp/examples/common-ggml.h examples/common-ggml.h
 cp -rpv ../whisper.cpp/examples/common-ggml.cpp examples/common-ggml.cpp
 cp -rpv ../whisper.cpp/whisper.h examples/whisper/whisper.h
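
For context, a minimal usage sketch of the new similarity() helper (illustrative only, not part of the commit): it scores two strings in [0, 1] from their Levenshtein distance, which is useful for fuzzy-matching a noisy transcription against a list of known phrases. The include of examples/common.h is assumed to provide the declaration added above; the example strings and the command-matching scenario are made up for illustration.

// Hypothetical example: pick the known command closest to a transcription.
#include <cstdio>
#include <string>
#include <vector>

#include "common.h" // declares: float similarity(const std::string &, const std::string &)

int main() {
    const std::string heard = "turn on the lihgt"; // e.g. a noisy transcription
    const std::vector<std::string> commands = { "turn on the light", "turn off the light", "stop" };

    std::string best;
    float best_sim = 0.0f;

    for (const auto & cmd : commands) {
        const float sim = similarity(heard, cmd); // 1.0 = identical, 0.0 = completely different
        if (sim > best_sim) {
            best_sim = sim;
            best     = cmd;
        }
    }

    printf("best match: '%s' (similarity %.2f)\n", best.c_str(), best_sim);

    return 0;
}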