From: Yshtola <redacted>
Date: Fri, 16 Jan 2026 12:16:05 +0000 (+0800)
Subject: whisper : Fix UTF-8 character boundary issue in segment wrapping (max_len) (#3592)
X-Git-Tag: upstream/1.8.3+155~154
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=f53dc74843e97f19f94a79241357f74ad5b691a6;p=pkg%2Fggml%2Fsources%2Fwhisper.cpp

whisper : Fix UTF-8 character boundary issue in segment wrapping (max_len) (#3592)

The current implementation in `whisper_wrap_segment()` uses `strlen()` to count bytes, not UTF-8 characters. When splitting segments at `max_len`, this can break multi-byte UTF-8 characters, resulting in invalid sequences displayed as `�` (U+FFFD replacement character).
---

diff --git a/src/whisper.cpp b/src/whisper.cpp
index 5b6e4b4b..796bccfb 100644
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -6026,6 +6026,19 @@ static inline bool should_split_on_word(const char * txt, bool split_on_word) {
     return txt[0] == ' ';
 }
 
+// Count UTF-8 code points (not bytes) in a NUL-terminated string; the input is not validated
+static int utf8_len(const char * str) {
+    int count = 0;
+    while (*str) {
+        // Continuation bytes (10xxxxxx) are skipped, so only the first byte of each sequence is counted
+        if ((*str & 0xC0) != 0x80) {
+            count++;
+        }
+        str++;
+    }
+    return count;
+}
+
 static void whisper_exp_compute_token_level_timestamps_dtw(
             struct whisper_context * ctx,
               struct whisper_state * state,
@@ -6054,7 +6067,7 @@ static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_sta
         }
 
         const auto txt = whisper_token_to_str(&ctx, token.id);
-        const int cur = strlen(txt);
+        const int cur = utf8_len(txt);  // Use UTF-8 character count instead of byte count
 
         if (acc + cur > max_len && i > 0 && should_split_on_word(txt, split_on_word)) {
             state.result_all.back().text = std::move(text);