From: Georgi Gerganov Date: Sun, 25 Jun 2023 20:51:01 +0000 (+0300) Subject: whisper : `split_on_word` no longer trims (#1046) X-Git-Tag: upstream/1.7.4~1396 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=72deb41eb26300f71c50febe29db8ffcce09256c;p=pkg%2Fggml%2Fsources%2Fwhisper.cpp whisper : `split_on_word` no longer trims (#1046) --- diff --git a/whisper.cpp b/whisper.cpp index 74cfd7b2..5f3888c7 100644 --- a/whisper.cpp +++ b/whisper.cpp @@ -3401,26 +3401,6 @@ static void whisper_exp_compute_token_level_timestamps( float thold_pt, float thold_ptsum); -// trim from start (in place) -static inline void ltrim(std::string &s) { - s.erase(s.begin(), std::find_if_not(s.begin(), s.end(), [](unsigned char ch) { - return std::isspace(ch); - })); -} - -// trim from end (in place) -static inline void rtrim(std::string &s) { - s.erase(std::find_if_not(s.rbegin(), s.rend(), [](unsigned char ch) { - return std::isspace(ch); - }).base(), s.end()); -} - -// trim from both ends (in place) -static inline void trim(std::string &s) { - rtrim(s); - ltrim(s); -} - static inline bool should_split_on_word(const char * txt, bool split_on_word) { if (!split_on_word) return true; @@ -3447,11 +3427,6 @@ static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_sta const int cur = strlen(txt); if (acc + cur > max_len && i > 0 && should_split_on_word(txt, split_on_word)) { - // split here - if (split_on_word) { - trim(text); - } - state.result_all.back().text = std::move(text); state.result_all.back().t1 = token.t0; state.result_all.back().tokens.resize(i); @@ -3479,9 +3454,6 @@ static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_sta } } - if (split_on_word) { - trim(text); - } state.result_all.back().text = std::move(text); return res;