whisper : remove `speed_up` and `phase_vocoder*` functions (#2198)

author Borislav Stanimirov <redacted>
Fri, 31 May 2024 08:37:29 +0000 (11:37 +0300)
committer GitHub <redacted>
Fri, 31 May 2024 08:37:29 +0000 (11:37 +0300)
diff --git a/bindings/go/examples/go-whisper/flags.go b/bindings/go/examples/go-whisper/flags.go

index ea204455c80a35d4d6f7fafc2f2acfed8486fdff..766c92f1827e3af7f4cb514920ae347baacb530b 100644 (file)
--- a/bindings/go/examples/go-whisper/flags.go
+++ b/bindings/go/examples/go-whisper/flags.go
@@ -68,10 +68,6 @@ func (flags *Flags) GetOut() string {
         return strings.ToLower(flags.Lookup("out").Value.String())
  }
  
-func (flags *Flags) IsSpeedup() bool {
-       return flags.Lookup("speedup").Value.String() == "true"
-}
-
  func (flags *Flags) IsTokens() bool {
         return flags.Lookup("tokens").Value.String() == "true"
  }
@@ -111,10 +107,6 @@ func (flags *Flags) SetParams(context whisper.Context) error {
                 fmt.Fprintf(flags.Output(), "Setting duration to %v\n", duration)
                 context.SetDuration(duration)
         }
-       if flags.IsSpeedup() {
-               fmt.Fprintf(flags.Output(), "Setting speedup to true\n")
-               context.SetSpeedup(true)
-       }
         if threads := flags.GetThreads(); threads != 0 {
                 fmt.Fprintf(flags.Output(), "Setting threads to %d\n", threads)
                 context.SetThreads(threads)
@@ -146,7 +138,6 @@ func registerFlags(flag *Flags) {
         flag.Duration("offset", 0, "Time offset")
         flag.Duration("duration", 0, "Duration of audio to process")
         flag.Uint("threads", 0, "Number of threads to use")
-       flag.Bool("speedup", false, "Enable speedup")
         flag.Uint("max-len", 0, "Maximum segment length in characters")
         flag.Uint("max-tokens", 0, "Maximum tokens per segment")
         flag.Float64("word-thold", 0, "Maximum segment score")
diff --git a/bindings/go/params.go b/bindings/go/params.go

index 5931bb0b199fa789a21415a8bef5cb09c779720f..4b4da032d62e1fee2f618fc7eccf277660de6458 100644 (file)
--- a/bindings/go/params.go
+++ b/bindings/go/params.go
@@ -47,10 +47,6 @@ func (p *Params) SetPrintTimestamps(v bool) {
         p.print_timestamps = toBool(v)
  }
  
-func (p *Params) SetSpeedup(v bool) {
-       p.speed_up = toBool(v)
-}
-
  // Set language id
  func (p *Params) SetLanguage(lang int) error {
         if lang == -1 {
@@ -177,9 +173,6 @@ func (p *Params) String() string {
         if p.token_timestamps {
                 str += " token_timestamps"
         }
-       if p.speed_up {
-               str += " speed_up"
-       }
  
         return str + ">"
  }
diff --git a/bindings/go/pkg/whisper/context.go b/bindings/go/pkg/whisper/context.go

index 0863ef6bb164a1d463aa1b04ed1a445d794f10ea..ead92648f3e79952b7de458605e40881e78066df 100644 (file)
--- a/bindings/go/pkg/whisper/context.go
+++ b/bindings/go/pkg/whisper/context.go
@@ -76,11 +76,6 @@ func (context *context) SetTranslate(v bool) {
         context.params.SetTranslate(v)
  }
  
-// Set speedup flag
-func (context *context) SetSpeedup(v bool) {
-       context.params.SetSpeedup(v)
-}
-
  func (context *context) SetSplitOnWord(v bool) {
         context.params.SetSplitOnWord(v)
  }
diff --git a/bindings/go/pkg/whisper/interface.go b/bindings/go/pkg/whisper/interface.go

index 4339e16f847bd3921819479a6e84f8926ea765bd..b430e7ce8538f4072eeb3ec79b190dcd4e73456b 100644 (file)
--- a/bindings/go/pkg/whisper/interface.go
+++ b/bindings/go/pkg/whisper/interface.go
@@ -41,7 +41,6 @@ type Context interface {
         SetOffset(time.Duration)        // Set offset
         SetDuration(time.Duration)      // Set duration
         SetThreads(uint)                // Set number of threads to use
-       SetSpeedup(bool)                // Set speedup flag
         SetSplitOnWord(bool)            // Set split on word flag
         SetTokenThreshold(float32)      // Set timestamp token probability threshold
         SetTokenSumThreshold(float32)   // Set timestamp token sum probability threshold
diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java

index 56a37380136999952bea8853ccb59ecdc28ffd6a..1a73cee1181c40c289b2edfd8076cc1931293f8b 100644 (file)
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java
@@ -20,7 +20,7 @@ public interface WhisperCppJnaLibrary extends Library {
       * @return Whisper context on success, null on failure\r
       */\r
      Pointer whisper_init_from_file(String path_model);\r
-    \r
+\r
      /**\r
       * Provides default params which can be used with `whisper_init_from_file_with_params()` etc.\r
       * Because this function allocates memory for the params, the caller must call either:\r
@@ -304,14 +304,6 @@ public interface WhisperCppJnaLibrary extends Library {
      /** Language id associated with the provided state */\r
      int whisper_full_lang_id_from_state(Pointer state);\r
  \r
-    /**\r
-     * Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.\r
-     * The resulting spectrogram is stored inside the default state of the provided whisper context.\r
-     * @return 0 on success\r
-     */\r
-    int whisper_pcm_to_mel_phase_vocoder(Pointer ctx, final float[] samples, int n_samples, int n_threads);\r
-\r
-    int whisper_pcm_to_mel_phase_vocoder_with_state(Pointer ctx, Pointer state, final float[] samples, int n_samples, int n_threads);\r
  \r
      /** Get the start time of the specified segment. */\r
      long whisper_full_get_segment_t0(Pointer ctx, int i_segment);\r
diff --git a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java

index 60d8334b9350355c718beb2c056a5fdaf2980333..90d8c15767c9763bc243f87c12a7ae21b14a8043 100644 (file)
--- a/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
+++ b/bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java
@@ -129,14 +129,6 @@ public class WhisperFullParams extends Structure {
      /** Maximum tokens per segment (0, default = no limit) */\r
      public int max_tokens;\r
  \r
-    /** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */\r
-    public CBool speed_up;\r
-\r
-    /** Flag to speed up the audio by 2x using Phase Vocoder. (default = false) */\r
-    public void speedUp(boolean enable) {\r
-        speed_up = enable ? CBool.TRUE : CBool.FALSE;\r
-    }\r
-\r
      /** Overwrite the audio context size (0 = use default). */\r
      public int audio_ctx;\r
  \r
@@ -321,7 +313,7 @@ public class WhisperFullParams extends Structure {
          return Arrays.asList("strategy", "n_threads", "n_max_text_ctx", "offset_ms", "duration_ms", "translate",\r
                  "no_context", "single_segment", "no_timestamps",\r
                  "print_special", "print_progress", "print_realtime", "print_timestamps",  "token_timestamps",\r
-                "thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "speed_up", "audio_ctx",\r
+                "thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "audio_ctx",\r
                  "tdrz_enable", "suppress_regex", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",\r
                  "suppress_blank", "suppress_non_speech_tokens", "temperature", "max_initial_ts", "length_penalty",\r
                  "temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",\r
diff --git a/bindings/ruby/ext/ruby_whisper.cpp b/bindings/ruby/ext/ruby_whisper.cpp

index 86af9391e2c31e3695e08c649021909279432a67..9d9334539b89219e76360857255d5ef7a8794d60 100644 (file)
--- a/bindings/ruby/ext/ruby_whisper.cpp
+++ b/bindings/ruby/ext/ruby_whisper.cpp
@@ -311,12 +311,6 @@ static VALUE ruby_whisper_params_get_split_on_word(VALUE self) {
  static VALUE ruby_whisper_params_set_split_on_word(VALUE self, VALUE value) {
    BOOL_PARAMS_SETTER(self, split_on_word, value)
  }
-static VALUE ruby_whisper_params_get_speed_up(VALUE self) {
-  BOOL_PARAMS_GETTER(self, speed_up)
-}
-static VALUE ruby_whisper_params_set_speed_up(VALUE self, VALUE value) {
-  BOOL_PARAMS_SETTER(self, speed_up, value)
-}
  static VALUE ruby_whisper_params_get_diarize(VALUE self) {
    ruby_whisper_params *rwp;
    Data_Get_Struct(self, ruby_whisper_params, rwp);
@@ -408,8 +402,6 @@ void Init_whisper() {
    rb_define_method(cParams, "token_timestamps=", ruby_whisper_params_set_token_timestamps, 1);
    rb_define_method(cParams, "split_on_word", ruby_whisper_params_get_split_on_word, 0);
    rb_define_method(cParams, "split_on_word=", ruby_whisper_params_set_split_on_word, 1);
-  rb_define_method(cParams, "speed_up", ruby_whisper_params_get_speed_up, 0);
-  rb_define_method(cParams, "speed_up=", ruby_whisper_params_set_speed_up, 1);
    rb_define_method(cParams, "diarize", ruby_whisper_params_get_diarize, 0);
    rb_define_method(cParams, "diarize=", ruby_whisper_params_set_diarize, 1);
  
diff --git a/bindings/ruby/tests/test_whisper.rb b/bindings/ruby/tests/test_whisper.rb

index fa6a3e2d4e8f8802600673bbb0b0c0b1005f8e88..3700671bce6e15694a7f6daad8ab075a8e274641 100644 (file)
--- a/bindings/ruby/tests/test_whisper.rb
+++ b/bindings/ruby/tests/test_whisper.rb
@@ -117,13 +117,6 @@ class TestWhisper < Test::Unit::TestCase
      assert !@params.split_on_word
    end
  
-  def test_speed_up
-    @params.speed_up = true
-    assert @params.speed_up
-    @params.speed_up = false
-    assert !@params.speed_up
-  end
-
    def test_whisper
      @whisper = Whisper::Context.new(File.join(TOPDIR, '..', '..', 'models', 'ggml-base.en.bin'))
      params  = Whisper::Params.new
diff --git a/examples/addon.node/addon.cpp b/examples/addon.node/addon.cpp

index 53bf1abb5a3ed07704fd5a8a3374712788c3968a..4ada6ca508489e42729caf8fe01ebe1f38abbc01 100644 (file)
--- a/examples/addon.node/addon.cpp
+++ b/examples/addon.node/addon.cpp
@@ -25,7 +25,6 @@ struct whisper_params {
      float entropy_thold = 2.4f;
      float logprob_thold = -1.0f;
  
-    bool speed_up       = false;
      bool translate      = false;
      bool diarize        = false;
      bool output_txt     = false;
@@ -232,8 +231,6 @@ int run(whisper_params &params, std::vector<std::vector<std::string>> &result) {
              wparams.max_len          = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
              wparams.audio_ctx        = params.audio_ctx;
  
-            wparams.speed_up         = params.speed_up;
-
              wparams.greedy.best_of        = params.best_of;
              wparams.beam_search.beam_size = params.beam_size;
  
diff --git a/examples/command/command.cpp b/examples/command/command.cpp

index cd6cc02399456fcdd861d7bb15805bee877dce1b..84424d4331b2aeca3be4331963523dfd0625c15f 100644 (file)
--- a/examples/command/command.cpp
+++ b/examples/command/command.cpp
@@ -38,7 +38,6 @@ struct whisper_params {
  
      grammar_parser::parse_state grammar_parsed;
  
-    bool speed_up      = false;
      bool translate     = false;
      bool print_special = false;
      bool print_energy  = false;
@@ -76,7 +75,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
          else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
          else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
          else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
          else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
          else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
          else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
@@ -115,7 +113,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
      fprintf(stderr, "  -ac N,      --audio-ctx N    [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
      fprintf(stderr, "  -vth N,     --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
      fprintf(stderr, "  -fth N,     --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,        --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
      fprintf(stderr, "  -tr,        --translate      [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
      fprintf(stderr, "  -ps,        --print-special  [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
      fprintf(stderr, "  -pe,        --print-energy   [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
@@ -165,7 +162,6 @@ std::string transcribe(
      wparams.n_threads        = params.n_threads;
  
      wparams.audio_ctx = params.audio_ctx;
-    wparams.speed_up  = params.speed_up;
  
      wparams.temperature     = 0.4f;
      wparams.temperature_inc = 1.0f;
@@ -371,7 +367,6 @@ int process_command_list(struct whisper_context * ctx, audio_async &audio, const
              wparams.n_threads        = params.n_threads;
  
              wparams.audio_ctx        = params.audio_ctx;
-            wparams.speed_up         = params.speed_up;
  
              wparams.prompt_tokens    = k_tokens.data();
              wparams.prompt_n_tokens  = k_tokens.size();
diff --git a/examples/common.h b/examples/common.h

index 2ed91ca9aa80dc2b3a152a278c71d197314201ed..de895858ab066bc8b88859578f74a863392fd6da 100644 (file)
--- a/examples/common.h
+++ b/examples/common.h
@@ -185,7 +185,7 @@ private:
      // It is assumed that PCM data is normalized to a range from -1 to 1
      bool write_audio(const float * data, size_t length) {
          for (size_t i = 0; i < length; ++i) {
-            const int16_t intSample = data[i] * 32767;
+            const int16_t intSample = int16_t(data[i] * 32767);
              file.write(reinterpret_cast<const char *>(&intSample), sizeof(int16_t));
              dataSize += sizeof(int16_t);
          }
diff --git a/examples/lsp/lsp.cpp b/examples/lsp/lsp.cpp

index 3df54266a251d6d316d68f0e611f1f85ec0b2b46..8cca87151bf9fda39213e90b6449b43da79f20f1 100644 (file)
--- a/examples/lsp/lsp.cpp
+++ b/examples/lsp/lsp.cpp
@@ -26,7 +26,6 @@ struct whisper_params {
      float vad_thold    = 0.6f;
      float freq_thold   = 100.0f;
  
-    bool speed_up      = false;
      bool translate     = false;
      bool print_special = false;
      bool print_energy  = false;
@@ -70,7 +69,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
          else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
          else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
          else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
          else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
          else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
          else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
@@ -102,7 +100,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
      fprintf(stderr, "  -ac N,      --audio-ctx N    [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
      fprintf(stderr, "  -vth N,     --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
      fprintf(stderr, "  -fth N,     --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,        --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
      fprintf(stderr, "  -tr,        --translate      [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
      fprintf(stderr, "  -ps,        --print-special  [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
      fprintf(stderr, "  -pe,        --print-energy   [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
@@ -184,7 +181,6 @@ json unguided_transcription(struct whisper_context * ctx, audio_async &audio, js
      wparams.n_threads        = params.n_threads;
  
      wparams.audio_ctx        = params.audio_ctx;
-    wparams.speed_up         = params.speed_up;
      wparams.suppress_non_speech_tokens = true;
      // run the transformer and a single decoding pass
      if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
@@ -223,7 +219,6 @@ json guided_transcription(struct whisper_context * ctx, audio_async &audio, cons
      wparams.n_threads        = params.n_threads;
  
      wparams.audio_ctx        = params.audio_ctx;
-    wparams.speed_up         = params.speed_up;
  
      // TODO: Do some time testing. Does an overly long prompt slow down processing?
      // Set up command sets/precompute prompts
diff --git a/examples/main/main.cpp b/examples/main/main.cpp

index 45eb17fe7f327aada7bce233a60b695b04599bd2..bb9b7b79ce5e68a314138ae89a2547f64d82081a 100644 (file)
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -47,7 +47,6 @@ struct whisper_params {
      float temperature     = 0.0f;
      float temperature_inc = 0.2f;
  
-    bool speed_up        = false;
      bool debug_mode      = false;
      bool translate       = false;
      bool detect_language = false;
@@ -138,7 +137,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
          else if (arg == "-lpt"  || arg == "--logprob-thold")   { params.logprob_thold   = std::stof(argv[++i]); }
          else if (arg == "-tp"   || arg == "--temperature")     { params.temperature     = std::stof(argv[++i]); }
          else if (arg == "-tpi"  || arg == "--temperature-inc") { params.temperature_inc = std::stof(argv[++i]); }
-        // else if (arg == "-su"   || arg == "--speed-up")        { params.speed_up        = true; }
          else if (arg == "-debug"|| arg == "--debug-mode")      { params.debug_mode      = true; }
          else if (arg == "-tr"   || arg == "--translate")       { params.translate       = true; }
          else if (arg == "-di"   || arg == "--diarize")         { params.diarize         = true; }
@@ -206,7 +204,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
      fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
      fprintf(stderr, "  -tp,       --temperature N     [%-7.2f] The sampling temperature, between 0 and 1\n",    params.temperature);
      fprintf(stderr, "  -tpi,      --temperature-inc N [%-7.2f] The increment of temperature, between 0 and 1\n",params.temperature_inc);
-    // fprintf(stderr, "  -su,       --speed-up          [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
      fprintf(stderr, "  -debug,    --debug-mode        [%-7s] enable debug mode (eg. dump log_mel)\n",           params.debug_mode ? "true" : "false");
      fprintf(stderr, "  -tr,       --translate         [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
      fprintf(stderr, "  -di,       --diarize           [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
@@ -1106,7 +1103,6 @@ int main(int argc, char ** argv) {
              wparams.split_on_word    = params.split_on_word;
              wparams.audio_ctx        = params.audio_ctx;
  
-            wparams.speed_up         = params.speed_up;
              wparams.debug_mode       = params.debug_mode;
  
              wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index 2efa4c7a0202a4312032753e92556d3d0589f162..10aae9c04d3d6e9c353f227aad4349569e369ebe 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -61,7 +61,6 @@ struct whisper_params {
      float temperature     =  0.00f;
      float temperature_inc =  0.20f;
  
-    bool speed_up        = false;
      bool debug_mode      = false;
      bool translate       = false;
      bool detect_language = false;
@@ -112,7 +111,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
      fprintf(stderr, "  -wt N,     --word-thold N      [%-7.2f] word timestamp probability threshold\n",         params.word_thold);
      fprintf(stderr, "  -et N,     --entropy-thold N   [%-7.2f] entropy threshold for decoder fail\n",           params.entropy_thold);
      fprintf(stderr, "  -lpt N,    --logprob-thold N   [%-7.2f] log probability threshold for decoder fail\n",   params.logprob_thold);
-    // fprintf(stderr, "  -su,       --speed-up          [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
      fprintf(stderr, "  -debug,    --debug-mode        [%-7s] enable debug mode (eg. dump log_mel)\n",           params.debug_mode ? "true" : "false");
      fprintf(stderr, "  -tr,       --translate         [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
      fprintf(stderr, "  -di,       --diarize           [%-7s] stereo audio diarization\n",                       params.diarize ? "true" : "false");
@@ -159,7 +157,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
          else if (arg == "-wt"   || arg == "--word-thold")      { params.word_thold      = std::stof(argv[++i]); }
          else if (arg == "-et"   || arg == "--entropy-thold")   { params.entropy_thold   = std::stof(argv[++i]); }
          else if (arg == "-lpt"  || arg == "--logprob-thold")   { params.logprob_thold   = std::stof(argv[++i]); }
-        // else if (arg == "-su"   || arg == "--speed-up")        { params.speed_up        = true; }
          else if (arg == "-debug"|| arg == "--debug-mode")      { params.debug_mode      = true; }
          else if (arg == "-tr"   || arg == "--translate")       { params.translate       = true; }
          else if (arg == "-di"   || arg == "--diarize")         { params.diarize         = true; }
@@ -768,7 +765,6 @@ int main(int argc, char ** argv) {
              wparams.split_on_word    = params.split_on_word;
              wparams.audio_ctx        = params.audio_ctx;
  
-            wparams.speed_up         = params.speed_up;
              wparams.debug_mode       = params.debug_mode;
  
              wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
diff --git a/examples/stream/stream.cpp b/examples/stream/stream.cpp

index 60c1b0894e44edbb09f32eb13927b61f4df22716..50797e96daa5aaff8a5e3742342f527373ed703f 100644 (file)
--- a/examples/stream/stream.cpp
+++ b/examples/stream/stream.cpp
@@ -27,7 +27,6 @@ struct whisper_params {
      float vad_thold    = 0.6f;
      float freq_thold   = 100.0f;
  
-    bool speed_up      = false;
      bool translate     = false;
      bool no_fallback   = false;
      bool print_special = false;
@@ -62,7 +61,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
          else if (arg == "-ac"   || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
          else if (arg == "-vth"  || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
          else if (arg == "-fth"  || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-su"   || arg == "--speed-up")      { params.speed_up      = true; }
          else if (arg == "-tr"   || arg == "--translate")     { params.translate     = true; }
          else if (arg == "-nf"   || arg == "--no-fallback")   { params.no_fallback   = true; }
          else if (arg == "-ps"   || arg == "--print-special") { params.print_special = true; }
@@ -100,7 +98,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
      fprintf(stderr, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n",                   params.audio_ctx);
      fprintf(stderr, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n",           params.vad_thold);
      fprintf(stderr, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n",                   params.freq_thold);
-    fprintf(stderr, "  -su,      --speed-up      [%-7s] speed up audio by x2 (reduced accuracy)\n",        params.speed_up ? "true" : "false");
      fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",      params.translate ? "true" : "false");
      fprintf(stderr, "  -nf,      --no-fallback   [%-7s] do not use temperature fallback while decoding\n", params.no_fallback ? "true" : "false");
      fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
@@ -314,7 +311,6 @@ int main(int argc, char ** argv) {
              wparams.n_threads        = params.n_threads;
  
              wparams.audio_ctx        = params.audio_ctx;
-            wparams.speed_up         = params.speed_up;
  
              wparams.tdrz_enable      = params.tinydiarize; // [TDRZ]
  
diff --git a/examples/talk-llama/talk-llama.cpp b/examples/talk-llama/talk-llama.cpp

index 4aab62b9a6f1cfa4872ea986eff4e4ab2fb58ac1..b15be0b2789e05b698d962c3e10e03f9e8e25a92 100644 (file)
--- a/examples/talk-llama/talk-llama.cpp
+++ b/examples/talk-llama/talk-llama.cpp
@@ -59,7 +59,6 @@ struct whisper_params {
      float vad_thold  = 0.6f;
      float freq_thold = 100.0f;
  
-    bool speed_up       = false;
      bool translate      = false;
      bool print_special  = false;
      bool print_energy   = false;
@@ -100,7 +99,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
          else if (arg == "-ngl" || arg == "--n-gpu-layers")   { params.n_gpu_layers   = std::stoi(argv[++i]); }
          else if (arg == "-vth" || arg == "--vad-thold")      { params.vad_thold      = std::stof(argv[++i]); }
          else if (arg == "-fth" || arg == "--freq-thold")     { params.freq_thold     = std::stof(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")       { params.speed_up       = true; }
          else if (arg == "-tr"  || arg == "--translate")      { params.translate      = true; }
          else if (arg == "-ps"  || arg == "--print-special")  { params.print_special  = true; }
          else if (arg == "-pe"  || arg == "--print-energy")   { params.print_energy   = true; }
@@ -149,7 +147,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
      fprintf(stderr, "  -ngl N,   --n-gpu-layers N [%-7d] number of layers to store in VRAM\n",           params.n_gpu_layers);
      fprintf(stderr, "  -vth N,   --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
      fprintf(stderr, "  -fth N,   --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,      --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
      fprintf(stderr, "  -tr,      --translate      [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
      fprintf(stderr, "  -ps,      --print-special  [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
      fprintf(stderr, "  -pe,      --print-energy   [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
@@ -205,7 +202,6 @@ std::string transcribe(
      wparams.prompt_n_tokens  = prompt_tokens.empty() ? 0       : prompt_tokens.size();
  
      wparams.audio_ctx        = params.audio_ctx;
-    wparams.speed_up         = params.speed_up;
  
      if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
          return "";
diff --git a/examples/talk/talk.cpp b/examples/talk/talk.cpp

index 3e34e5724ff33420ae24d7bedeaf184f6c806dc0..b34fad6c2bb55504cbc4e44e9ec52510d4cbe14a 100644 (file)
--- a/examples/talk/talk.cpp
+++ b/examples/talk/talk.cpp
@@ -26,7 +26,6 @@ struct whisper_params {
      float vad_thold    = 0.6f;
      float freq_thold   = 100.0f;
  
-    bool speed_up      = false;
      bool translate     = false;
      bool print_special = false;
      bool print_energy  = false;
@@ -60,7 +59,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
          else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
          else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
          else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
          else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
          else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
          else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
@@ -96,7 +94,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
      fprintf(stderr, "  -ac N,    --audio-ctx N   [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
      fprintf(stderr, "  -vth N,   --vad-thold N   [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
      fprintf(stderr, "  -fth N,   --freq-thold N  [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,      --speed-up      [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
      fprintf(stderr, "  -tr,      --translate     [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
      fprintf(stderr, "  -ps,      --print-special [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
      fprintf(stderr, "  -pe,      --print-energy  [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
@@ -132,7 +129,6 @@ std::string transcribe(whisper_context * ctx, const whisper_params & params, con
      wparams.n_threads        = params.n_threads;
  
      wparams.audio_ctx        = params.audio_ctx;
-    wparams.speed_up         = params.speed_up;
  
      if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
          return "";
diff --git a/examples/wchess/wchess.cmd/wchess.cmd.cpp b/examples/wchess/wchess.cmd/wchess.cmd.cpp

index 09e53f13172b8c5a4c7213bff5c57f89c499f336..4d04997631510d58189aaaded59f9357f0359ce0 100644 (file)
--- a/examples/wchess/wchess.cmd/wchess.cmd.cpp
+++ b/examples/wchess/wchess.cmd/wchess.cmd.cpp
@@ -26,7 +26,6 @@ struct whisper_params {
  
      float grammar_penalty = 100.0f;
  
-    bool speed_up      = false;
      bool translate     = false;
      bool print_special = false;
      bool print_energy  = false;
@@ -57,7 +56,6 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
      fprintf(stderr, "  -ac N,      --audio-ctx N    [%-7d] audio context size (0 - all)\n",                params.audio_ctx);
      fprintf(stderr, "  -vth N,     --vad-thold N    [%-7.2f] voice activity detection threshold\n",        params.vad_thold);
      fprintf(stderr, "  -fth N,     --freq-thold N   [%-7.2f] high-pass frequency cutoff\n",                params.freq_thold);
-    fprintf(stderr, "  -su,        --speed-up       [%-7s] speed up audio by x2 (reduced accuracy)\n",     params.speed_up ? "true" : "false");
      fprintf(stderr, "  -tr,        --translate      [%-7s] translate from source language to english\n",   params.translate ? "true" : "false");
      fprintf(stderr, "  -ps,        --print-special  [%-7s] print special tokens\n",                        params.print_special ? "true" : "false");
      fprintf(stderr, "  -pe,        --print-energy   [%-7s] print sound energy (for debugging)\n",          params.print_energy ? "true" : "false");
@@ -89,7 +87,6 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
          else if (arg == "-ac"  || arg == "--audio-ctx")     { params.audio_ctx     = std::stoi(argv[++i]); }
          else if (arg == "-vth" || arg == "--vad-thold")     { params.vad_thold     = std::stof(argv[++i]); }
          else if (arg == "-fth" || arg == "--freq-thold")    { params.freq_thold    = std::stof(argv[++i]); }
-        else if (arg == "-su"  || arg == "--speed-up")      { params.speed_up      = true; }
          else if (arg == "-tr"  || arg == "--translate")     { params.translate     = true; }
          else if (arg == "-ps"  || arg == "--print-special") { params.print_special = true; }
          else if (arg == "-pe"  || arg == "--print-energy")  { params.print_energy  = true; }
diff --git a/whisper.cpp b/whisper.cpp

index a22da8896bb055e09e1e9732ed6b47a9924422d2..dbb235e9f43b88c81aa968e5963d0940ce29f2b4 100644 (file)
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -2868,13 +2868,10 @@ struct whisper_global_cache {
      // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html
      // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147
      float hann_window[WHISPER_N_FFT];
-    float hann_window2x[WHISPER_N_FFT * 2];
  
      whisper_global_cache() {
          fill_sin_cos_table();
-#define FILL_HANN_WINDOW(arr) fill_hann_window(sizeof(arr) / sizeof(arr[0]), true, arr)
-        FILL_HANN_WINDOW(hann_window);
-        FILL_HANN_WINDOW(hann_window2x);
+        fill_hann_window(sizeof(hann_window)/sizeof(hann_window[0]), true, hann_window);
      }
  
      void fill_sin_cos_table() {
@@ -2885,7 +2882,7 @@ struct whisper_global_cache {
          }
      }
  
-    void fill_hann_window(int length, bool periodic, float* output) {
+    void fill_hann_window(int length, bool periodic, float * output) {
          int offset = -1;
          if (periodic) {
              offset = 0;
@@ -3061,15 +3058,8 @@ static bool log_mel_spectrogram(
      const int64_t t_start_us = ggml_time_us();
  
      // Hann window
-    const float * hann = nullptr;
-    if (frame_size == WHISPER_N_FFT) {
-        hann = global_cache.hann_window;
-    } else if (frame_size == 2 * WHISPER_N_FFT) {
-        hann = global_cache.hann_window2x;
-    } else {
-        WHISPER_ASSERT(false && "Unsupported frame_size");
-        return false;
-    }
+    WHISPER_ASSERT(frame_size == WHISPER_N_FFT && "Unsupported frame_size");
+    const float * hann = global_cache.hann_window;
  
      // Calculate the length of padding
      int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30;
@@ -3752,30 +3742,6 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int
      return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads);
  }
  
-// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
-int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
-        WHISPER_LOG_ERROR("%s: failed to compute mel spectrogram\n", __func__);
-        return -1;
-    }
-
-    return 0;
-}
-
-// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
-int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
-    return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads);
-}
-
-// same as whisper_pcm_to_mel, but applies WSOLA to speed up the audio x2
-// TODO
-
-// same as whisper_pcm_to_mel, but applies HPTSM to speed up the audio x2
-// TODO
-
-// same as whisper_pcm_to_mel, but applies PV (with phase lock) to speed up the audio x2
-// TODO
-
  int whisper_set_mel_with_state(
          struct whisper_context * ctx,
            struct whisper_state * state,
@@ -4676,7 +4642,6 @@ struct whisper_full_params whisper_full_default_params(enum whisper_sampling_str
          /*.split_on_word     =*/ false,
          /*.max_tokens        =*/ 0,
  
-        /*.speed_up          =*/ false,
          /*.debug_mode        =*/ false,
          /*.audio_ctx         =*/ 0,
  
@@ -5350,15 +5315,9 @@ int whisper_full_with_state(
  
      if (n_samples > 0) {
          // compute log mel spectrogram
-        if (params.speed_up) {
-            // TODO: Replace PV with more advanced algorithm
+        if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
              WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__);
-            return -1;
-        } else {
-            if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
-                WHISPER_LOG_ERROR("%s: failed to compute log mel spectrogram\n", __func__);
-                return -2;
-            }
+            return -2;
          }
      }
  
@@ -5395,7 +5354,7 @@ int whisper_full_with_state(
      // if length of spectrogram is less than 1.0s (100 frames), then return
      // basically don't process anything that is less than 1.0s
      // see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
-    if (seek_end < seek_start + (params.speed_up ? 50 : 100)) {
+    if (seek_end < seek_start + 100) {
          WHISPER_LOG_WARN("%s: input is too short - %d ms < 1000 ms. consider padding the input audio with silence\n", __func__, (seek_end - seek_start)*10);
          return 0;
      }
@@ -6107,8 +6066,8 @@ int whisper_full_with_state(
                          const auto t1 = seek + 2*(tokens_cur[i].tid - whisper_token_beg(ctx));
  
                          if (!text.empty()) {
-                            const auto tt0 = params.speed_up ? 2*t0 : t0;
-                            const auto tt1 = params.speed_up ? 2*t1 : t1;
+                            const auto tt0 = t0;
+                            const auto tt1 = t1;
  
                              if (params.print_realtime) {
                                  if (params.print_timestamps) {
@@ -6154,8 +6113,8 @@ int whisper_full_with_state(
                  if (!text.empty()) {
                      const auto t1 = seek + seek_delta;
  
-                    const auto tt0 = params.speed_up ? 2*t0 : t0;
-                    const auto tt1 = params.speed_up ? 2*t1 : t1;
+                    const auto tt0 = t0;
+                    const auto tt1 = t1;
  
                      if (params.print_realtime) {
                          if (params.print_timestamps) {
diff --git a/whisper.h b/whisper.h

index 9c7c58d874b0c6be7fd7bb23ff1ba79cd0b9dc4b..2b3d5e574cbc67aa23e502fc4b0b2cee83707eda 100644 (file)
--- a/whisper.h
+++ b/whisper.h
@@ -266,22 +266,6 @@ extern "C" {
                                 int   n_samples,
                                 int   n_threads);
  
-    // Convert RAW PCM audio to log mel spectrogram but applies a Phase Vocoder to speed up the audio x2.
-    // The resulting spectrogram is stored inside the default state of the provided whisper context.
-    // Returns 0 on success
-    WHISPER_API int whisper_pcm_to_mel_phase_vocoder(
-        struct whisper_context * ctx,
-                   const float * samples,
-                           int   n_samples,
-                           int   n_threads);
-
-    WHISPER_API int whisper_pcm_to_mel_phase_vocoder_with_state(
-        struct whisper_context * ctx,
-          struct whisper_state * state,
-                   const float * samples,
-                           int   n_samples,
-                           int   n_threads);
-
      // This can be used to set a custom log mel spectrogram inside the default state of the provided whisper context.
      // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
      // n_mel must be 80
@@ -499,7 +483,6 @@ extern "C" {
  
          // [EXPERIMENTAL] speed-up techniques
          // note: these can significantly reduce the quality of the output
-        bool speed_up;          // speed-up the audio by 2x using Phase Vocoder
          bool debug_mode;        // enable debug_mode provides extra info (eg. Dump log_mel)
          int  audio_ctx;         // overwrite the audio context size (0 = use default)
author	Borislav Stanimirov <redacted>
	Fri, 31 May 2024 08:37:29 +0000 (11:37 +0300)
committer	GitHub <redacted>
	Fri, 31 May 2024 08:37:29 +0000 (11:37 +0300)
bindings/go/examples/go-whisper/flags.go		patch | blob | history
bindings/go/params.go		patch | blob | history
bindings/go/pkg/whisper/context.go		patch | blob | history
bindings/go/pkg/whisper/interface.go		patch | blob | history
bindings/java/src/main/java/io/github/ggerganov/whispercpp/WhisperCppJnaLibrary.java		patch | blob | history
bindings/java/src/main/java/io/github/ggerganov/whispercpp/params/WhisperFullParams.java		patch | blob | history
bindings/ruby/ext/ruby_whisper.cpp		patch | blob | history
bindings/ruby/tests/test_whisper.rb		patch | blob | history
examples/addon.node/addon.cpp		patch | blob | history
examples/command/command.cpp		patch | blob | history
examples/common.h		patch | blob | history
examples/lsp/lsp.cpp		patch | blob | history
examples/main/main.cpp		patch | blob | history
examples/server/server.cpp		patch | blob | history
examples/stream/stream.cpp		patch | blob | history
examples/talk-llama/talk-llama.cpp		patch | blob | history
examples/talk/talk.cpp		patch | blob | history
examples/wchess/wchess.cmd/wchess.cmd.cpp		patch | blob | history
whisper.cpp		patch | blob | history
whisper.h		patch | blob | history