git.djapps.eu Git - pkg/ggml/sources/whisper.cpp/commitdiff
stream : add "max_tokens" parameter
author Georgi Gerganov <redacted>
Sun, 20 Nov 2022 18:52:24 +0000 (20:52 +0200)
committer Georgi Gerganov <redacted>
Sun, 20 Nov 2022 19:22:41 +0000 (21:22 +0200)
Used to limit the number of tokens in a segment.
Useful to combat word repetition when using a partial encoder context.

examples/stream/stream.cpp
whisper.cpp
whisper.h

index d2db0b899342ba4dcc5c2ea306e0b249df759dbb..040ba9ebf4b67a3c0c87cd71efe2e505bbac6d35 100644 (file)
@@ -322,6 +322,7 @@ int main(int argc, char ** argv) {
         {
             whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
 
+            wparams.max_tokens           = 32;
             wparams.print_progress       = false;
             wparams.print_special_tokens = params.print_special_tokens;
             wparams.print_realtime       = false;
index 95579ec38fe81aa91ea9a9a0b6bfe0f2d3ce01e5..48f93ebd89e8e0f7efe130c0632181207568944d 100644 (file)
@@ -2402,6 +2402,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
                     /*.thold_pt             =*/ 0.01f,
                     /*.thold_ptsum          =*/ 0.01f,
                     /*.max_len              =*/ 0,
+                    /*.max_tokens           =*/ 0,
 
                     /*.speed_up             =*/ false,
 
@@ -2443,6 +2444,7 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
                     /*.thold_pt             =*/ 0.01f,
                     /*.thold_ptsum          =*/ 0.01f,
                     /*.max_len              =*/ 0,
+                    /*.max_tokens           =*/ 0,
 
                     /*.speed_up             =*/ false,
 
@@ -2685,7 +2687,7 @@ int whisper_full(
                 //}
 
                 // end of text token
-                if (token.id == whisper_token_eot(ctx) || (i > WHISPER_EXPERIMENT_MAX_TOKENS_PER_SEGMENT)) {
+                if (token.id == whisper_token_eot(ctx) || (params.max_tokens > 0 && i > params.max_tokens)) {
                     if (result_len == 0) {
                         if (seek + seek_delta + 100 >= seek_end) {
                             result_len = i + 1;
index ec4b1fb6c53ec60ee6097e5ef21ac7d3c27a6117..0211995dcb8d5922110eaea35f9cda1cbdc9072c 100644 (file)
--- a/whisper.h
+++ b/whisper.h
@@ -25,7 +25,6 @@
 #define WHISPER_CHUNK_SIZE  30
 
 #define WHISPER_EXPERIMENT_AUDIO_CTX 512
-#define WHISPER_EXPERIMENT_MAX_TOKENS_PER_SEGMENT 32
 
 #ifdef __cplusplus
 extern "C" {
@@ -205,6 +204,7 @@ extern "C" {
         float thold_pt;         // timestamp token probability threshold (~0.01)
         float thold_ptsum;      // timestamp token sum probability threshold (~0.01)
         int   max_len;          // max segment length in characters
+        int   max_tokens;       // max tokens per segment (0 = no limit)
 
         // [EXPERIMENTAL] speed-up techniques
         bool speed_up; // speed-up the audio by 2x using Phase Vocoder