whisper : add support for large v3 (#1444)

author Georgi Gerganov <redacted>
Tue, 7 Nov 2023 13:30:18 +0000 (15:30 +0200)
committer GitHub <redacted>
Tue, 7 Nov 2023 13:30:18 +0000 (15:30 +0200)
diff --git a/Makefile b/Makefile

index e9a97af490db59347144e5e0325302a8f9c66339..d134b768bc6a659ed842dde4a6d6d333972a4bf5 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -417,9 +417,10 @@ samples:
  .PHONY: medium.en
  .PHONY: medium
  .PHONY: large-v1
+.PHONY: large-v2
  .PHONY: large
  
-tiny.en tiny base.en base small.en small medium.en medium large-v1 large: main
+tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large: main
         bash ./models/download-ggml-model.sh $@
         @echo ""
         @echo "==============================================="
diff --git a/README.md b/README.md

index 4267f418d16477bb625a33c8f44c77846374ef8c..988785cf92c8f4e4d52e351716c4b91d3de7fe38 100644 (file)
--- a/README.md
+++ b/README.md
@@ -234,6 +234,7 @@ make small
  make medium.en
  make medium
  make large-v1
+make large-v2
  make large
  ```
  
@@ -245,7 +246,7 @@ make large
  | base   | 142 MB | ~210 MB | `465707469ff3a37a2b9b8d8f89f2f99de7299dac` |
  | small  | 466 MB | ~600 MB | `55356645c2b361a969dfd0ef2c5a50d530afd8d5` |
  | medium | 1.5 GB | ~1.7 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
-| large  | 2.9 GB | ~3.3 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
+| large  | 2.9 GB | ~3.3 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
  
  ## Quantization
  
diff --git a/bindings/go/examples/go-model-download/main.go b/bindings/go/examples/go-model-download/main.go

index 67462a581d3cc09709df0613683aeebcad2d00de..d3e45c28ea3ecc5ef438f7d6fd76695a000b6e67 100644 (file)
--- a/bindings/go/examples/go-model-download/main.go
+++ b/bindings/go/examples/go-model-download/main.go
@@ -24,7 +24,7 @@ const (
  
  var (
         // The models which will be downloaded, if no model is specified as an argument
-       modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large"}
+       modelNames = []string{"ggml-tiny.en", "ggml-tiny", "ggml-base.en", "ggml-base", "ggml-small.en", "ggml-small", "ggml-medium.en", "ggml-medium", "ggml-large-v1", "ggml-large-v2", "ggml-large"}
  )
  
  var (
diff --git a/bindings/go/whisper.go b/bindings/go/whisper.go

index b77e103c4e3cd8ef9efd0741d30250035da8beee..9660662084f53d99aa1cfd515ca5cc21da65f61e 100644 (file)
--- a/bindings/go/whisper.go
+++ b/bindings/go/whisper.go
@@ -83,7 +83,6 @@ const (
         SampleRate = C.WHISPER_SAMPLE_RATE                 // Expected sample rate, samples per second
         SampleBits = uint16(unsafe.Sizeof(C.float(0))) * 8 // Sample size in bits
         NumFFT     = C.WHISPER_N_FFT
-       NumMEL     = C.WHISPER_N_MEL
         HopLength  = C.WHISPER_HOP_LENGTH
         ChunkSize  = C.WHISPER_CHUNK_SIZE
  )
diff --git a/examples/bench.wasm/emscripten.cpp b/examples/bench.wasm/emscripten.cpp

index 3624bbc48b1183a198538bf43d7eb1c0870788af..083397db0573a4aa68a73998f54428f4ca1f22d6 100644 (file)
--- a/examples/bench.wasm/emscripten.cpp
+++ b/examples/bench.wasm/emscripten.cpp
@@ -23,7 +23,9 @@ void bench_main(size_t index) {
  
      fprintf(stderr, "%s: running benchmark with %d threads - please wait...\n", __func__, n_threads);
  
-    if (int ret = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL)) {
+    const int n_mels = whisper_model_n_mels(ctx);
+
+    if (int ret = whisper_set_mel(ctx, nullptr, 0, n_mels)) {
          fprintf(stderr, "error: failed to set mel: %d\n", ret);
          return;
      }
diff --git a/examples/bench/bench.cpp b/examples/bench/bench.cpp

index 9f50b3b622400f75ea34cc83dcbd82914a8bac13..db1c4e800cd74983035af5e5e343128ef31ddfb5 100644 (file)
--- a/examples/bench/bench.cpp
+++ b/examples/bench/bench.cpp
@@ -73,7 +73,9 @@ int whisper_bench_full(const whisper_params & params) {
          return 2;
      }
  
-    if (int ret = whisper_set_mel(ctx, nullptr, 0, WHISPER_N_MEL)) {
+    const int n_mels = whisper_model_n_mels(ctx);
+
+    if (int ret = whisper_set_mel(ctx, nullptr, 0, n_mels)) {
          fprintf(stderr, "error: failed to set mel: %d\n", ret);
          return 3;
      }
diff --git a/examples/livestream.sh b/examples/livestream.sh

index 42d0102fd58ee0f970676a0ce14e1a3fbff5aa12..d86a7c601d5ce39dd882d575701cfa16c5cc7d04 100755 (executable)
--- a/examples/livestream.sh
+++ b/examples/livestream.sh
@@ -48,7 +48,7 @@ if [ -n "$3" ]; then
  fi
  
  # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
  
  # list available models
  function list_models {
diff --git a/examples/twitch.sh b/examples/twitch.sh

index c185fb24f1632568930ba8f6a872ac9edc48793e..77b618dde9b9e1566069163f382baa4c79771db8 100755 (executable)
--- a/examples/twitch.sh
+++ b/examples/twitch.sh
@@ -21,7 +21,7 @@ help()
      echo "Usage: ./twitch.sh -s [step] -m [model] -t [threads] [url]"
      echo "options:"
      echo "-s       Step in seconds (default is $step)."
-    echo "-m       Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large' (default is '$model')."
+    echo "-m       Choose model, options are: 'tiny.en' 'tiny' 'base.en' 'base' 'small.en' 'small' 'medium.en' 'medium' 'large-v1' 'large-v2' 'large' (default is '$model')."
      echo "-t       Number of threads to use."
      echo "-h       Print this help page."
      echo
diff --git a/extra/convert-all.sh b/extra/convert-all.sh

index c5ba9094d7b805ee019c3907de7337f2a1d05f7b..c9638079c9296300b6549d16aa7ee04942b7e8b4 100755 (executable)
--- a/extra/convert-all.sh
+++ b/extra/convert-all.sh
@@ -1,6 +1,6 @@
  #!/bin/bash
  
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
  
  for model in "${models[@]}"; do
      python3 models/convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/
diff --git a/models/README.md b/models/README.md

index 10446a63d5fc554c14e78cc9bd15eb40c80c46a3..b12f2d22d63fbc422b43d6a0ca2fb15a8421584d 100644 (file)
--- a/models/README.md
+++ b/models/README.md
@@ -50,7 +50,8 @@ https://huggingface.co/ggerganov/whisper.cpp/tree/main
  | medium    | 1.5 GB | ~2.6 GB | `fd9727b6e1217c2f614f9b698455c4ffd82463b4` |
  | medium.en | 1.5 GB | ~2.6 GB | `8c30f0e44ce9560643ebd10bbe50cd20eafd3723` |
  | large-v1  | 2.9 GB | ~4.7 GB | `b1caaf735c4cc1429223d5a74f0f4d0b9b59a299` |
-| large     | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
+| large-v2  | 2.9 GB | ~4.7 GB | `0f4c8e34f21cf1a914c59d8b3ce882345ad349d6` |
+| large     | 2.9 GB | ~4.7 GB | `ad82bf6a9043ceed055076d0fd39f5f186ff8062` |
  
  ## Model files for testing purposes
  
diff --git a/models/convert-h5-to-coreml.py b/models/convert-h5-to-coreml.py

index 93c797ba5b2a2ae82176fb869ffcbd1b6b626ceb..3887c22a7e40c8dcddc14a7ee2fcc34f3d81772e 100644 (file)
--- a/models/convert-h5-to-coreml.py
+++ b/models/convert-h5-to-coreml.py
@@ -78,14 +78,14 @@ def convert_hf_whisper(hf_model_name_or_path: str, whisper_state_path: str):
  # Ported from models/convert-whisper-to-coreml.py
  if __name__ == "__main__":
      parser = argparse.ArgumentParser()
-    parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
+    parser.add_argument("--model-name", type=str, help="name of model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
      parser.add_argument("--model-path", type=str, help="path to the model (e.g. if published on HuggingFace: Oblivion208/whisper-tiny-cantonese)", required=True)
      parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
      parser.add_argument("--quantize",     type=bool, help="quantize weights to F16", default=False)
      parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
      args = parser.parse_args()
  
-    if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
+    if args.model_name not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
          raise ValueError("Invalid model name")
  
      pt_target_path = f"models/hf-{args.model_name}.pt"
diff --git a/models/convert-pt-to-ggml.py b/models/convert-pt-to-ggml.py

index 9aa134b53f7d05c1f9d2be60759f28b14e87fdc6..7a3daf238a8e92cd4067a8974b0b5abdf3a0f3d9 100644 (file)
--- a/models/convert-pt-to-ggml.py
+++ b/models/convert-pt-to-ggml.py
@@ -228,7 +228,7 @@ with np.load(dir_whisper / "whisper" / "assets" / "mel_filters.npz") as f:
  # for backwards compatibility, also check for older hf_transformers format tokenizer files
  # old format: dir_whisper/whisper/assets/[multilingual/gpt2]/vocab.json
  # new format: dir_whisper/whisper/assets/[multilingual/gpt2].tiktoken
-multilingual = hparams["n_vocab"] == 51865
+multilingual = hparams["n_vocab"] >= 51865
  tokenizer = dir_whisper / "whisper" / "assets" / (multilingual and "multilingual.tiktoken" or "gpt2.tiktoken")
  tokenizer_type = "tiktoken"
  if not tokenizer.is_file():
diff --git a/models/convert-whisper-to-coreml.py b/models/convert-whisper-to-coreml.py

index d4a7805209af3dbc91aaf30fba37a2d4b60d873d..adbbd1099cb44acaaa739c814e34987139f5d1c7 100644 (file)
--- a/models/convert-whisper-to-coreml.py
+++ b/models/convert-whisper-to-coreml.py
@@ -194,7 +194,7 @@ class TextDecoderANE(TextDecoder):
          x = x.permute(0,2,3,1).squeeze(0)
  
          # ANE can only load tensors with dim size of at most 16,384 - whisper uses 51,864 (en) or 51,865 (multi-lang) tokens so we need to compute in chunks
-        if self.token_embedding.weight.shape[0] == 51865:
+        if self.token_embedding.weight.shape[0] >= 51865:
              # split in 11 chunks - 4715 each
              splits = self.token_embedding.weight.split(self.token_embedding.weight.shape[0]//11, dim=0)
              logits = torch.cat([torch.einsum('bid,jd->bij', x, split) for split in splits]).view(*x.shape[:2], -1)
@@ -296,13 +296,13 @@ def convert_decoder(hparams, model, quantize=False):
  
  if __name__ == "__main__":
      parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
+    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
      parser.add_argument("--encoder-only", type=bool, help="only convert encoder", default=False)
      parser.add_argument("--quantize",     type=bool, help="quantize weights to F16", default=False)
      parser.add_argument("--optimize-ane", type=bool, help="optimize for ANE execution (currently broken)", default=False)
      args = parser.parse_args()
  
-    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
+    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
          raise ValueError("Invalid model name")
  
      whisper = load_model(args.model).cpu()
diff --git a/models/convert-whisper-to-openvino.py b/models/convert-whisper-to-openvino.py

index cdee571b11feaa7941f7b7218305c46b0202eb3f..6b3d396643bad8affd56dcd98c1f8cb885d5a0d5 100644 (file)
--- a/models/convert-whisper-to-openvino.py
+++ b/models/convert-whisper-to-openvino.py
@@ -38,10 +38,10 @@ def convert_encoder(hparams, encoder, mname):
  
  if __name__ == "__main__":
      parser = argparse.ArgumentParser()
-    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1)", required=True)
+    parser.add_argument("--model", type=str, help="model to convert (e.g. tiny, tiny.en, base, base.en, small, small.en, medium, medium.en, large, large-v1, large-v2)", required=True)
      args = parser.parse_args()
  
-    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1"]:
+    if args.model not in ["tiny", "tiny.en", "base", "base.en", "small", "small.en", "medium", "medium.en", "large", "large-v1", "large-v2"]:
          raise ValueError("Invalid model name")
  
      whisper = load_model(args.model).cpu()
diff --git a/models/download-coreml-model.sh b/models/download-coreml-model.sh

index d46789d7c067908fbfedaf19c8e59af637696eec..95739dbf9957bdf9c64b86d30f448ecbb10a72cf 100755 (executable)
--- a/models/download-coreml-model.sh
+++ b/models/download-coreml-model.sh
@@ -19,7 +19,7 @@ function get_script_path() {
  models_path="$(get_script_path)"
  
  # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
  
  # list available models
  function list_models {
diff --git a/models/download-ggml-model.cmd b/models/download-ggml-model.cmd

index 9042e99b27558434f33a6fe946b57d137c9a7764..fc279967dac418a636f044b509a69e94877a32ee 100644 (file)
--- a/models/download-ggml-model.cmd
+++ b/models/download-ggml-model.cmd
@@ -8,7 +8,7 @@ popd
  set argc=0
  for %%x in (%*) do set /A argc+=1
  
-set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large
+set models=tiny.en tiny base.en base small.en small medium.en medium large-v1 large-v2 large
  
  if %argc% neq 1 (
    echo.
@@ -57,8 +57,8 @@ goto :eof
  :list_models
    echo.
    echo Available models:
-  (for %%a in (%models%) do ( 
-    echo %%a 
+  (for %%a in (%models%) do (
+    echo %%a
    ))
    echo.
    exit /b
diff --git a/models/download-ggml-model.sh b/models/download-ggml-model.sh

index 288e08d21e919ee17adcb36962b6dcfba1804a49..ea68da8936df288b629342e01f211b30166756db 100755 (executable)
--- a/models/download-ggml-model.sh
+++ b/models/download-ggml-model.sh
@@ -41,6 +41,7 @@ models=(
      "medium-q5_0"
      "medium.en-q5_0"
      "large-v1"
+    "large-v2"
      "large"
      "large-q5_0"
  )
diff --git a/tests/run-tests.sh b/tests/run-tests.sh

index 38fa5cea52bfe8ecc2f49e97c7743191c9f0b7c2..bf062dd6b8d29ba3e66dc76fcd13815975af8baa 100755 (executable)
--- a/tests/run-tests.sh
+++ b/tests/run-tests.sh
@@ -19,7 +19,7 @@
  cd `dirname $0`
  
  # Whisper models
-models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large" )
+models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large-v1" "large-v2" "large" )
  
  # list available models
  function list_models {
diff --git a/whisper.cpp b/whisper.cpp

index 7f4f69a91d158e17bbf183db0a2ffdec29184a26..b6300d5f03c04d5b9de2d87386a1d2028ba55986 100644 (file)
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -193,6 +193,15 @@ enum e_model {
      MODEL_LARGE,
  };
  
+static const std::map<e_model, std::string> g_model_name = {
+    { MODEL_UNKNOWN,  "unknown"  },
+    { MODEL_TINY,     "tiny"     },
+    { MODEL_BASE,     "base"     },
+    { MODEL_SMALL,    "small"    },
+    { MODEL_MEDIUM,   "medium"   },
+    { MODEL_LARGE,    "large"    },
+};
+
  static const std::map<std::string, std::pair<int, std::string>> g_lang = {
      { "en",  { 0,  "english",         } },
      { "zh",  { 1,  "chinese",         } },
@@ -293,6 +302,7 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
      { "ba",  { 96,  "bashkir",        } },
      { "jw",  { 97,  "javanese",       } },
      { "su",  { 98,  "sundanese",      } },
+    { "yue", { 99,  "cantonese",      } },
  };
  
  static const size_t MB = 1ull*1024*1024;
@@ -402,7 +412,11 @@ struct whisper_vocab {
      id token_beg        = 50363; // begin timestamps
  
      bool is_multilingual() const {
-        return n_vocab == 51865;
+        return n_vocab >= 51865;
+    }
+
+    int num_languages() const {
+        return n_vocab - 51765 - (is_multilingual() ? 1 : 0);
      }
  };
  
@@ -922,6 +936,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
  
          assert(hparams.n_text_state == hparams.n_audio_state);
  
+        std::string mver = "";
+
          if (hparams.n_audio_layer == 4) {
              model.type = e_model::MODEL_TINY;
          }
@@ -940,6 +956,10 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
  
          if (hparams.n_audio_layer == 32) {
              model.type = e_model::MODEL_LARGE;
+
+            if (hparams.n_vocab == 51866) {
+                mver = " v3";
+            }
          }
  
          const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
@@ -968,7 +988,7 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
          log("%s: n_mels        = %d\n", __func__, hparams.n_mels);
          log("%s: ftype         = %d\n", __func__, model.hparams.ftype);
          log("%s: qntvr         = %d\n", __func__, qntvr);
-        log("%s: type          = %d\n", __func__, model.type);
+        log("%s: type          = %d (%s%s)\n", __func__, model.type, g_model_name.at(model.type).c_str(), mver.c_str());
  
          // print memory requirements
          {
@@ -1039,13 +1059,17 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
          if (vocab.is_multilingual()) {
              vocab.token_eot++;
              vocab.token_sot++;
-            vocab.token_translate++;
-            vocab.token_transcribe++;
-            vocab.token_solm++;
-            vocab.token_prev++;
-            vocab.token_nosp++;
-            vocab.token_not++;
-            vocab.token_beg++;
+
+            // account for variable number of language tokens
+            const int dt = vocab.num_languages() - 98;
+
+            vocab.token_translate  += dt;
+            vocab.token_transcribe += dt;
+            vocab.token_solm       += dt;
+            vocab.token_prev       += dt;
+            vocab.token_nosp       += dt;
+            vocab.token_not        += dt;
+            vocab.token_beg        += dt;
          }
  
          if (n_vocab < model.hparams.n_vocab) {
@@ -1074,6 +1098,8 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con
                  vocab.id_to_token[i] = word;
              }
          }
+
+        log("%s: n_langs       = %d\n", __func__, vocab.num_languages());
      }
  
      size_t ctx_size = 0;
@@ -3281,7 +3307,7 @@ void whisper_free_params(struct whisper_full_params * params) {
  }
  
  int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
+    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
          log("%s: failed to compute mel spectrogram\n", __func__);
          return -1;
      }
@@ -3295,7 +3321,7 @@ int whisper_pcm_to_mel(struct whisper_context * ctx, const float * samples, int
  
  // same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (PV without phase lock is not good)
  int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
-    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
+    if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, ctx->model.filters.n_mel, n_threads, ctx->model.filters, false, state->mel)) {
          log("%s: failed to compute mel spectrogram\n", __func__);
          return -1;
      }
@@ -3318,13 +3344,13 @@ int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float *
  // TODO
  
  int whisper_set_mel_with_state(
-        struct whisper_context * /*ctx*/,
+        struct whisper_context * ctx,
            struct whisper_state * state,
                     const float * data,
                             int   n_len,
                             int   n_mel) {
-    if (n_mel != WHISPER_N_MEL) {
-        log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
+    if (n_mel != ctx->model.filters.n_mel) {
+        log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, ctx->model.filters.n_mel);
          return -1;
      }
  
diff --git a/whisper.h b/whisper.h

index 300fc4bac375e3eea94753d989eeed7c0e293a7e..ed1612b4bc8082ea1a47c509c85515113e0473e4 100644 (file)
--- a/whisper.h
+++ b/whisper.h
@@ -29,7 +29,6 @@
  
  #define WHISPER_SAMPLE_RATE 16000
  #define WHISPER_N_FFT       400
-#define WHISPER_N_MEL       80
  #define WHISPER_HOP_LENGTH  160
  #define WHISPER_CHUNK_SIZE  30
author	Georgi Gerganov <redacted>
	Tue, 7 Nov 2023 13:30:18 +0000 (15:30 +0200)
committer	GitHub <redacted>
	Tue, 7 Nov 2023 13:30:18 +0000 (15:30 +0200)
Makefile		patch | blob | history
README.md		patch | blob | history
bindings/go/examples/go-model-download/main.go		patch | blob | history
bindings/go/whisper.go		patch | blob | history
examples/bench.wasm/emscripten.cpp		patch | blob | history
examples/bench/bench.cpp		patch | blob | history
examples/livestream.sh		patch | blob | history
examples/twitch.sh		patch | blob | history
extra/convert-all.sh		patch | blob | history
models/README.md		patch | blob | history
models/convert-h5-to-coreml.py		patch | blob | history
models/convert-pt-to-ggml.py		patch | blob | history
models/convert-whisper-to-coreml.py		patch | blob | history
models/convert-whisper-to-openvino.py		patch | blob | history
models/download-coreml-model.sh		patch | blob | history
models/download-ggml-model.cmd		patch | blob | history
models/download-ggml-model.sh		patch | blob | history
tests/run-tests.sh		patch | blob | history
whisper.cpp		patch | blob | history
whisper.h		patch | blob | history