From: Yshtola
Date: Fri, 16 Jan 2026 12:16:05 +0000 (+0800)
Subject: whisper : Fix UTF-8 character boundary issue in segment wrapping (max_len) (#3592)
X-Git-Tag: upstream/1.8.3+155~154
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=f53dc74843e97f19f94a79241357f74ad5b691a6;p=pkg%2Fggml%2Fsources%2Fwhisper.cpp

whisper : Fix UTF-8 character boundary issue in segment wrapping (max_len) (#3592)

The current implementation in `whisper_wrap_segment()` uses `strlen()` to count bytes, not UTF-8 characters. When splitting segments at `max_len`, this can break multi-byte UTF-8 characters, resulting in invalid sequences displayed as `�` (U+FFFD replacement character).
---

diff --git a/src/whisper.cpp b/src/whisper.cpp
index 5b6e4b4b..796bccfb 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -6026,6 +6026,19 @@ static inline bool should_split_on_word(const char * txt, bool split_on_word) {
     return txt[0] == ' ';
 }
 
+// Count UTF-8 characters (not bytes) in a string
+static int utf8_len(const char * str) {
+    int count = 0;
+    while (*str) {
+        // Skip continuation bytes (10xxxxxx)
+        if ((*str & 0xC0) != 0x80) {
+            count++;
+        }
+        str++;
+    }
+    return count;
+}
+
 static void whisper_exp_compute_token_level_timestamps_dtw(
             struct whisper_context * ctx,
               struct whisper_state * state,
@@ -6054,7 +6067,7 @@ static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_sta
         }
 
         const auto txt = whisper_token_to_str(&ctx, token.id);
-        const int cur = strlen(txt);
+        const int cur = utf8_len(txt);  // Use UTF-8 character count instead of byte count
 
         if (acc + cur > max_len && i > 0 && should_split_on_word(txt, split_on_word)) {
             state.result_all.back().text = std::move(text);