return txt[0] == ' ';
}
+// Count UTF-8 characters (code points, not bytes) in the NUL-terminated string `str`; `str` is dereferenced without a null check
+static int utf8_len(const char * str) {
+ int count = 0; // number of non-continuation bytes seen so far
+ while (*str) { // walk byte by byte until the terminating NUL
+ // Continuation bytes have the form 10xxxxxx; every other byte is counted as the start of one character
+ if ((*str & 0xC0) != 0x80) {
+ count++;
+ }
+ str++;
+ }
+ return count; // no validation is performed: malformed input is still counted by the rule above
+}
+
static void whisper_exp_compute_token_level_timestamps_dtw(
struct whisper_context * ctx,
struct whisper_state * state,
}
const auto txt = whisper_token_to_str(&ctx, token.id);
- const int cur = strlen(txt);
+ const int cur = utf8_len(txt); // Use UTF-8 character count instead of byte count
if (acc + cur > max_len && i > 0 && should_split_on_word(txt, split_on_word)) {
state.result_all.back().text = std::move(text);