whisper : Fix UTF-8 character boundary issue in segment wrapping (max_len) (#3592)

author Yshtola <redacted>

Fri, 16 Jan 2026 12:16:05 +0000 (20:16 +0800)

committer GitHub <redacted>

Fri, 16 Jan 2026 12:16:05 +0000 (14:16 +0200)
author Yshtola <redacted>
Fri, 16 Jan 2026 12:16:05 +0000 (20:16 +0800)
committer GitHub <redacted>
Fri, 16 Jan 2026 12:16:05 +0000 (14:16 +0200)
diff --git a/src/whisper.cpp b/src/whisper.cpp

index 5b6e4b4be486de7759c2154afb4631f679f250b0..796bccfb45d560624b272f4bf4369a6a04e4dc66 100644 (file)
--- a/src/whisper.cpp
+++ b/src/whisper.cpp
@@ -6026,6 +6026,19 @@ static inline bool should_split_on_word(const char * txt, bool split_on_word) {
      return txt[0] == ' ';
  }
  
+// Count the bytes of NUL-terminated `str` that are not UTF-8 continuation bytes (= code points, not bytes, for well-formed UTF-8)
+static int utf8_len(const char * str) {
+    int count = 0;
+    while (*str) {  // stop at the terminating NUL
+        // Continuation bytes (10xxxxxx) extend the preceding lead byte, so only the other bytes are counted
+        if ((*str & 0xC0) != 0x80) {
+            count++;
+        }
+        str++;
+    }
+    return count;
+}
+
  static void whisper_exp_compute_token_level_timestamps_dtw(
              struct whisper_context * ctx,
                struct whisper_state * state,
@@ -6054,7 +6067,7 @@ static int whisper_wrap_segment(struct whisper_context & ctx, struct whisper_sta
          }
  
          const auto txt = whisper_token_to_str(&ctx, token.id);
-        const int cur = strlen(txt);
+        const int cur = utf8_len(txt);  // Use UTF-8 character count instead of byte count
  
          if (acc + cur > max_len && i > 0 && should_split_on_word(txt, split_on_word)) {
              state.result_all.back().text = std::move(text);
author	Yshtola <redacted>
	Fri, 16 Jan 2026 12:16:05 +0000 (20:16 +0800)
committer	GitHub <redacted>
	Fri, 16 Jan 2026 12:16:05 +0000 (14:16 +0200)