-sow, --split-on-word [false ] split on word rather than on token
-bo N, --best-of N [2 ] number of best candidates to keep
-bs N, --beam-size N [-1 ] beam size for beam search
+ -ac N, --audio-ctx N [0 ] audio context size (0 - all)
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
-et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
-lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
--prompt PROMPT [ ] initial prompt
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
-oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
+ -dtw MODEL --dtw MODEL [ ] compute token-level timestamps
--host HOST, [127.0.0.1] Hostname/IP address for the server
--port PORT, [8080 ] Port number for the server
+ --public PATH, [examples/server/public] Path to the public folder
+ --request-path PATH, [ ] Request path for all requests
+ --inference-path PATH, [/inference] Inference path for all requests
--convert, [false ] Convert audio to WAV, requires ffmpeg on the server
+ -sns, --suppress-nst [false ] suppress non-speech tokens
+ -nth N, --no-speech-thold N [0.60 ] no speech threshold
+ -nc, --no-context [false ] do not use previous audio context
+ -ng, --no-gpu [false ] do not use gpu
+ -fa, --flash-attn [false ] flash attention
+
+Voice Activity Detection (VAD) options:
+ --vad [false ] enable Voice Activity Detection (VAD)
+ -vm FNAME, --vad-model FNAME [ ] VAD model path
+ -vt N, --vad-threshold N [0.50 ] VAD threshold for speech recognition
+  -vspd N,   --vad-min-speech-duration-ms  N  [250    ] VAD min speech duration in ms (drop shorter segments)
+ -vsd N, --vad-min-silence-duration-ms N [100 ] VAD min silence duration (to split segments)
+ -vmsd N, --vad-max-speech-duration-s N [FLT_MAX] VAD max speech duration (auto-split longer)
+ -vp N, --vad-speech-pad-ms N [30 ] VAD speech padding (extend segments)
+ -vo N, --vad-samples-overlap N [0.10 ] VAD samples overlap (seconds between segments)
```
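For a quick sanity check of the new VAD options, the server can be started with VAD enabled. This is a minimal sketch, not a canonical invocation: the binary location and the VAD model path are assumptions (adjust them to your build directory and to wherever the Silero VAD model was downloaded); the remaining values are simply the defaults listed above.

```
# Sketch: start the server with VAD enabled.
# Assumptions: the server binary is at ./build/bin/whisper-server and a
# Silero VAD model has been downloaded to models/ggml-silero-v5.1.2.bin.
./build/bin/whisper-server \
    -m models/ggml-base.en.bin \
    --host 127.0.0.1 --port 8080 \
    --vad \
    --vad-model models/ggml-silero-v5.1.2.bin \
    --vad-threshold 0.50 \
    --vad-min-speech-duration-ms 250 \
    --vad-min-silence-duration-ms 100
```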
#include "httplib.h"
#include "json.hpp"
+#include <cfloat>
#include <chrono>
#include <cmath>
#include <cstdio>
std::string openvino_encode_device = "CPU";
std::string dtw = "";
+
+ // Voice Activity Detection (VAD) parameters
+ bool vad = false;
+ std::string vad_model = "";
+ float vad_threshold = 0.5f;
+ int vad_min_speech_duration_ms = 250;
+ int vad_min_silence_duration_ms = 100;
+ float vad_max_speech_duration_s = FLT_MAX;
+ int vad_speech_pad_ms = 30;
+ float vad_samples_overlap = 0.1f;
};
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) {
fprintf(stderr, " -nc, --no-context [%-7s] do not use previous audio context\n", params.no_context ? "true" : "false");
fprintf(stderr, " -ng, --no-gpu [%-7s] do not use gpu\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
+ // Voice Activity Detection (VAD) parameters
+ fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
+ fprintf(stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
+ fprintf(stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n", params.vad_model.c_str());
+ fprintf(stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold);
+    fprintf(stderr, "  -vspd N,   --vad-min-speech-duration-ms  N  [%-7d] VAD min speech duration in ms (drop shorter segments)\n", params.vad_min_speech_duration_ms);
+ fprintf(stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms);
+ fprintf(stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ?
+ std::string("FLT_MAX").c_str() :
+ std::to_string(params.vad_max_speech_duration_s).c_str());
+ fprintf(stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms);
+ fprintf(stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
fprintf(stderr, "\n");
}
else if ( arg == "--request-path") { sparams.request_path = argv[++i]; }
else if ( arg == "--inference-path") { sparams.inference_path = argv[++i]; }
else if ( arg == "--convert") { sparams.ffmpeg_converter = true; }
+
+ // Voice Activity Detection (VAD)
+ else if ( arg == "--vad") { params.vad = true; }
+ else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = argv[++i]; }
+ else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(argv[++i]); }
+ else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
+ else if (arg == "-vsd" || arg == "--vad-min-silence-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
+ else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(argv[++i]); }
+ else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(argv[++i]); }
+ else if (arg == "-vo" || arg == "--vad-samples-overlap") { params.vad_samples_overlap = std::stof(argv[++i]); }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params, sparams);
{
params.no_context = parse_str_to_bool(req.get_file_value("no_context").content);
}
+ if (req.has_file("vad"))
+ {
+ params.vad = parse_str_to_bool(req.get_file_value("vad").content);
+ }
+ if (req.has_file("vad_threshold"))
+ {
+ params.vad_threshold = std::stof(req.get_file_value("vad_threshold").content);
+ }
+ if (req.has_file("vad_min_speech_duration_ms"))
+ {
+        params.vad_min_speech_duration_ms = std::stoi(req.get_file_value("vad_min_speech_duration_ms").content);
+ }
+ if (req.has_file("vad_min_silence_duration_ms"))
+ {
+        params.vad_min_silence_duration_ms = std::stoi(req.get_file_value("vad_min_silence_duration_ms").content);
+ }
+ if (req.has_file("vad_max_speech_duration_s"))
+ {
+ params.vad_max_speech_duration_s = std::stof(req.get_file_value("vad_max_speech_duration_s").content);
+ }
+ if (req.has_file("vad_speech_pad_ms"))
+ {
+ params.vad_speech_pad_ms = std::stoi(req.get_file_value("vad_speech_pad_ms").content);
+ }
+ if (req.has_file("vad_samples_overlap"))
+ {
+ params.vad_samples_overlap = std::stof(req.get_file_value("vad_samples_overlap").content);
+ }
}
} // namespace
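The same VAD settings can also be supplied per request as multipart form fields, matching the field names parsed above. A minimal sketch, assuming the server is running with the default host, port, and inference path, and that the audio is uploaded under the usual `file` field (not shown in this excerpt):

```
# Sketch: per-request VAD parameters via multipart form fields.
# Assumptions: default host/port (127.0.0.1:8080), default inference path
# (/inference), an audio form field named "file"; samples/jfk.wav is a
# placeholder path.
curl 127.0.0.1:8080/inference \
    -H "Content-Type: multipart/form-data" \
    -F file="@samples/jfk.wav" \
    -F vad="true" \
    -F vad_threshold="0.5" \
    -F vad_min_silence_duration_ms="100" \
    -F vad_speech_pad_ms="30"
```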
wparams.suppress_nst = params.suppress_nst;
+ wparams.vad = params.vad;
+ wparams.vad_model_path = params.vad_model.c_str();
+
+ wparams.vad_params.threshold = params.vad_threshold;
+ wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms;
+ wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
+ wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s;
+ wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms;
+ wparams.vad_params.samples_overlap = params.vad_samples_overlap;
+
    whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
// this callback is called on each new segment