]> git.djapps.eu Git - pkg/ggml/sources/whisper.cpp/commitdiff
whisper : add single-timestamp logic (#2629)
authorKarthick <redacted>
Tue, 17 Dec 2024 17:07:08 +0000 (22:37 +0530)
committerGitHub <redacted>
Tue, 17 Dec 2024 17:07:08 +0000 (19:07 +0200)
* Fix hallucinations during silence

When the predicted tokens end with a single timestamp, the entire 30-second segment should be considered done, to avoid hallucinations for the remaining part of the segment.
This behaviour is consistent with OpenAI's whisper. Refer to the logic related to `single_timestamp_ending` in https://github.com/openai/whisper/blob/main/whisper/transcribe.py

* Accept review comments related to formatting.

Co-authored-by: Georgi Gerganov <redacted>
---------

Co-authored-by: Georgi Gerganov <redacted>
src/whisper.cpp

index ddeecc5e098a05ea55f1d3ebfe33135e5ac65f5b..810a8d267aba2520673d18f0e21e06842ba047f7 100644 (file)
@@ -6060,7 +6060,7 @@ int whisper_full_with_state(
         {
             const auto & best_decoder = state->decoders[best_decoder_id];
 
-            const auto seek_delta = best_decoder.seek_delta;
+            auto seek_delta = best_decoder.seek_delta;
             const auto result_len = best_decoder.sequence.result_len;
 
             const auto & tokens_cur = best_decoder.sequence.tokens;
@@ -6201,6 +6201,15 @@ int whisper_full_with_state(
                 }
             }
 
+            // ref: https://github.com/ggerganov/whisper.cpp/pull/2629
+            const bool single_timestamp_ending = tokens_cur.size() > 1 &&
+                tokens_cur[tokens_cur.size() - 2].id < whisper_token_beg(ctx) &&
+                tokens_cur[tokens_cur.size() - 1].id > whisper_token_beg(ctx);
+            if (single_timestamp_ending) {
+                WHISPER_LOG_DEBUG("single timestamp ending - skip entire chunk\n");
+                seek_delta = std::min(seek_end - seek, WHISPER_CHUNK_SIZE * 100);
+            }
+
             // update audio window
             seek += seek_delta;