-sow, --split-on-word [false ] split on word rather than on token
-bo N, --best-of N [2 ] number of best candidates to keep
-bs N, --beam-size N [-1 ] beam size for beam search
+ -ac N, --audio-ctx N [0 ] audio context size (0 - all)
-wt N, --word-thold N [0.01 ] word timestamp probability threshold
-et N, --entropy-thold N [2.40 ] entropy threshold for decoder fail
-lpt N, --logprob-thold N [-1.00 ] log probability threshold for decoder fail
--prompt PROMPT [ ] initial prompt
-m FNAME, --model FNAME [models/ggml-base.en.bin] model path
-oved D, --ov-e-device DNAME [CPU ] the OpenVINO device used for encode inference
+ -dtw MODEL --dtw MODEL [ ] compute token-level timestamps
--host HOST, [127.0.0.1] Hostname/IP address for the server
--port PORT, [8080 ] Port number for the server
+ --public PATH, [examples/server/public] Path to the public folder
+ --request-path PATH, [ ] Request path for all requests
+ --inference-path PATH, [/inference] Inference path for all requests
--convert, [false ] Convert audio to WAV, requires ffmpeg on the server
+ -sns, --suppress-nst [false ] suppress non-speech tokens
+ -nth N, --no-speech-thold N [0.60 ] no speech threshold
+ -nc, --no-context [false ] do not use previous audio context
+ -ng, --no-gpu [false ] do not use gpu
+ -fa, --flash-attn [false ] flash attention
+
+Voice Activity Detection (VAD) options:
+ --vad [false ] enable Voice Activity Detection (VAD)
+ -vm FNAME, --vad-model FNAME [ ] VAD model path
+ -vt N, --vad-threshold N [0.50 ] VAD threshold for speech recognition
+  -vspd N,   --vad-min-speech-duration-ms  N  [250    ] VAD min speech duration in ms (drop shorter segments)
+ -vsd N, --vad-min-silence-duration-ms N [100 ] VAD min silence duration (to split segments)
+ -vmsd N, --vad-max-speech-duration-s N [FLT_MAX] VAD max speech duration (auto-split longer)
+ -vp N, --vad-speech-pad-ms N [30 ] VAD speech padding (extend segments)
+ -vo N, --vad-samples-overlap N [0.10 ] VAD samples overlap (seconds between segments)
```
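For a quick sanity check of the new VAD options, the server can be started with VAD enabled. This is a minimal sketch, not a canonical invocation: the binary location and the VAD model path are assumptions (adjust them to your build directory and to wherever the Silero VAD model was downloaded); the remaining values are simply the defaults listed above.

```
# Sketch: start the server with VAD enabled.
# Assumptions: the server binary is at ./build/bin/whisper-server and a
# Silero VAD model has been downloaded to models/ggml-silero-v5.1.2.bin.
./build/bin/whisper-server \
    -m models/ggml-base.en.bin \
    --host 127.0.0.1 --port 8080 \
    --vad \
    --vad-model models/ggml-silero-v5.1.2.bin \
    --vad-threshold 0.50 \
    --vad-min-speech-duration-ms 250 \
    --vad-min-silence-duration-ms 100
```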
#include "httplib.h"
#include "json.hpp"
+#include <cfloat>
#include <chrono>
#include <cmath>
#include <cstdio>
std::string openvino_encode_device = "CPU";
std::string dtw = "";
+
+ // Voice Activity Detection (VAD) parameters
+ bool vad = false;
+ std::string vad_model = "";
+ float vad_threshold = 0.5f;
+ int vad_min_speech_duration_ms = 250;
+ int vad_min_silence_duration_ms = 100;
+ float vad_max_speech_duration_s = FLT_MAX;
+ int vad_speech_pad_ms = 30;
+ float vad_samples_overlap = 0.1f;
};
void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & params, const server_params& sparams) {
fprintf(stderr, " -nc, --no-context [%-7s] do not use previous audio context\n", params.no_context ? "true" : "false");
fprintf(stderr, " -ng, --no-gpu [%-7s] do not use gpu\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
+ // Voice Activity Detection (VAD) parameters
+ fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
+ fprintf(stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
+ fprintf(stderr, " -vm FNAME, --vad-model FNAME [%-7s] VAD model path\n", params.vad_model.c_str());
+ fprintf(stderr, " -vt N, --vad-threshold N [%-7.2f] VAD threshold for speech recognition\n", params.vad_threshold);
+    fprintf(stderr, "  -vspd N,   --vad-min-speech-duration-ms  N  [%-7d] VAD min speech duration in ms (drop shorter segments)\n", params.vad_min_speech_duration_ms);
+ fprintf(stderr, " -vsd N, --vad-min-silence-duration-ms N [%-7d] VAD min silence duration (to split segments)\n", params.vad_min_silence_duration_ms);
+ fprintf(stderr, " -vmsd N, --vad-max-speech-duration-s N [%-7s] VAD max speech duration (auto-split longer)\n", params.vad_max_speech_duration_s == FLT_MAX ?
+ std::string("FLT_MAX").c_str() :
+ std::to_string(params.vad_max_speech_duration_s).c_str());
+ fprintf(stderr, " -vp N, --vad-speech-pad-ms N [%-7d] VAD speech padding (extend segments)\n", params.vad_speech_pad_ms);
+ fprintf(stderr, " -vo N, --vad-samples-overlap N [%-7.2f] VAD samples overlap (seconds between segments)\n", params.vad_samples_overlap);
fprintf(stderr, "\n");
}
else if ( arg == "--request-path") { sparams.request_path = argv[++i]; }
else if ( arg == "--inference-path") { sparams.inference_path = argv[++i]; }
else if ( arg == "--convert") { sparams.ffmpeg_converter = true; }
+
+ // Voice Activity Detection (VAD)
+ else if ( arg == "--vad") { params.vad = true; }
+ else if (arg == "-vm" || arg == "--vad-model") { params.vad_model = argv[++i]; }
+ else if (arg == "-vt" || arg == "--vad-threshold") { params.vad_threshold = std::stof(argv[++i]); }
+ else if (arg == "-vspd" || arg == "--vad-min-speech-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
+ else if (arg == "-vsd" || arg == "--vad-min-silence-duration-ms") { params.vad_min_speech_duration_ms = std::stoi(argv[++i]); }
+ else if (arg == "-vmsd" || arg == "--vad-max-speech-duration-s") { params.vad_max_speech_duration_s = std::stof(argv[++i]); }
+ else if (arg == "-vp" || arg == "--vad-speech-pad-ms") { params.vad_speech_pad_ms = std::stoi(argv[++i]); }
+ else if (arg == "-vo" || arg == "--vad-samples-overlap") { params.vad_samples_overlap = std::stof(argv[++i]); }
else {
fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
whisper_print_usage(argc, argv, params, sparams);
{
params.no_context = parse_str_to_bool(req.get_file_value("no_context").content);
}
+ if (req.has_file("vad"))
+ {
+ params.vad = parse_str_to_bool(req.get_file_value("vad").content);
+ }
+ if (req.has_file("vad_threshold"))
+ {
+ params.vad_threshold = std::stof(req.get_file_value("vad_threshold").content);
+ }
+ if (req.has_file("vad_min_speech_duration_ms"))
+ {
+        params.vad_min_speech_duration_ms = std::stoi(req.get_file_value("vad_min_speech_duration_ms").content);
+ }
+ if (req.has_file("vad_min_silence_duration_ms"))
+ {
+        params.vad_min_silence_duration_ms = std::stoi(req.get_file_value("vad_min_silence_duration_ms").content);
+ }
+ if (req.has_file("vad_max_speech_duration_s"))
+ {
+ params.vad_max_speech_duration_s = std::stof(req.get_file_value("vad_max_speech_duration_s").content);
+ }
+ if (req.has_file("vad_speech_pad_ms"))
+ {
+ params.vad_speech_pad_ms = std::stoi(req.get_file_value("vad_speech_pad_ms").content);
+ }
+ if (req.has_file("vad_samples_overlap"))
+ {
+ params.vad_samples_overlap = std::stof(req.get_file_value("vad_samples_overlap").content);
+ }
}
} // namespace
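The same VAD settings can also be supplied per request as multipart form fields, matching the field names parsed above. A minimal sketch, assuming the server is running with the default host, port, and inference path, and that the audio is uploaded under the usual `file` field (not shown in this excerpt):

```
# Sketch: per-request VAD parameters via multipart form fields.
# Assumptions: default host/port (127.0.0.1:8080), default inference path
# (/inference), an audio form field named "file"; samples/jfk.wav is a
# placeholder path.
curl 127.0.0.1:8080/inference \
    -H "Content-Type: multipart/form-data" \
    -F file="@samples/jfk.wav" \
    -F vad="true" \
    -F vad_threshold="0.5" \
    -F vad_min_silence_duration_ms="100" \
    -F vad_speech_pad_ms="30"
```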
wparams.suppress_nst = params.suppress_nst;
+ wparams.vad = params.vad;
+ wparams.vad_model_path = params.vad_model.c_str();
+
+ wparams.vad_params.threshold = params.vad_threshold;
+ wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms;
+ wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
+ wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s;
+ wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms;
+ wparams.vad_params.samples_overlap = params.vad_samples_overlap;
+
    whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
// this callback is called on each new segment