-# addon
+# whisper.cpp Node.js addon
This is an addon demo that can **run whisper model inference in `node` and `electron` environments**, based on [cmake-js](https://github.com/cmake-js/cmake-js).
It can be used as a reference for integrating whisper.cpp into other Node.js projects.
+This addon now supports **Voice Activity Detection (VAD)** for improved transcription performance.
+
## Install
```shell
npm install
```

## Run
+### Basic Usage
+
```shell
cd examples/addon.node
node index.js --language='language' --model='model-path' --fname_inp='file-path'
```
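+
+For example, to transcribe the bundled JFK sample with the base English model (assuming `ggml-base.en.bin` has been downloaded to `models/`; paths are relative to `examples/addon.node`):
+
+```shell
+node index.js --language='en' --model='../../models/ggml-base.en.bin' --fname_inp='../../samples/jfk.wav'
+```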
-Because this is a simple Demo, only the above parameters are set in the node environment.
+### VAD (Voice Activity Detection) Usage
+
+Run the VAD example with performance comparison:
+
+```shell
+node vad-example.js
+```
+
+## Voice Activity Detection (VAD) Support
+
+VAD can significantly improve transcription performance by processing only the segments that contain speech, which is especially beneficial for audio files with long periods of silence.
+
+### VAD Model Setup
+
+Before using VAD, download a VAD model:
+
+```shell
+# From the whisper.cpp root directory
+./models/download-vad-model.sh silero-v5.1.2
+```
+
+This downloads `ggml-silero-v5.1.2.bin` into the `models/` directory, which is the path used in the examples below.
+
+### VAD Parameters
+
+All VAD parameters are optional and have sensible defaults:
+
+- `vad`: Enable VAD (default: false)
+- `vad_model`: Path to VAD model file (required when VAD enabled)
+- `vad_threshold`: Speech detection threshold 0.0-1.0 (default: 0.5)
+- `vad_min_speech_duration_ms`: Min speech duration in ms (default: 250)
+- `vad_min_silence_duration_ms`: Min silence duration in ms (default: 100)
+- `vad_max_speech_duration_s`: Max speech duration in seconds (default: FLT_MAX)
+- `vad_speech_pad_ms`: Speech padding in ms (default: 30)
+- `vad_samples_overlap`: Sample overlap 0.0-1.0 (default: 0.1)
+
+### JavaScript API Example
+
+```javascript
+const path = require("path");
+const { whisper } = require(path.join(__dirname, "../../build/Release/addon.node"));
+const { promisify } = require("util");
+
+const whisperAsync = promisify(whisper);
+
+// With VAD enabled
+const vadParams = {
+ language: "en",
+ model: path.join(__dirname, "../../models/ggml-base.en.bin"),
+ fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
+ vad: true,
+ vad_model: path.join(__dirname, "../../models/ggml-silero-v5.1.2.bin"),
+ vad_threshold: 0.5,
+ progress_callback: (progress) => console.log(`Progress: ${progress}%`)
+};
+
+whisperAsync(vadParams).then(result => console.log(result));
+```
+
+## Supported Parameters
+
+Both traditional whisper.cpp parameters and new VAD parameters are supported:
-Other parameters can also be specified in the node environment.
+- `language`: Language code (e.g., "en", "es", "fr")
+- `model`: Path to whisper model file
+- `fname_inp`: Path to input audio file
+- `use_gpu`: Enable GPU acceleration (default: true)
+- `flash_attn`: Enable flash attention (default: false)
+- `no_prints`: Disable console output (default: false)
+- `no_timestamps`: Disable timestamps (default: false)
+- `detect_language`: Auto-detect language (default: false)
+- `audio_ctx`: Audio context size (default: 0)
+- `max_len`: Maximum segment length (default: 0)
+- `max_context`: Maximum context size (default: -1)
+- `prompt`: Initial prompt for decoder
+- `comma_in_time`: Use comma in timestamps (default: true)
+- `print_progress`: Print progress info (default: false)
+- `progress_callback`: Progress callback function
+- VAD parameters (see the VAD section above)
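+
+### Result Format
+
+Based on the tests in this change, the promisified call resolves to an object with a `transcription` array of `[start, end, text]` segments (plus a `language` field when language detection is enabled). A minimal sketch:
+
+```javascript
+const result = await whisperAsync(vadParams);
+
+// each segment is [startTimestamp, endTimestamp, text]
+for (const [start, end, text] of result.transcription) {
+  console.log(`[${start} --> ${end}] ${text}`);
+}
+```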
-const path = require("path");
-const { whisper } = require(path.join(
- __dirname,
- "../../../build/Release/addon.node"
-));
-const { promisify } = require("util");
+const { join } = require('path');
+const { whisper } = require('../../../build/Release/addon.node');
+const { promisify } = require('util');
const whisperAsync = promisify(whisper);
-const whisperParamsMock = {
- language: "en",
- model: path.join(__dirname, "../../../models/ggml-base.en.bin"),
- fname_inp: path.join(__dirname, "../../../samples/jfk.wav"),
+const commonParams = {
+ language: 'en',
+ model: join(__dirname, '../../../models/ggml-base.en.bin'),
+ fname_inp: join(__dirname, '../../../samples/jfk.wav'),
use_gpu: true,
flash_attn: false,
no_prints: true,
- comma_in_time: false,
- translate: true,
no_timestamps: false,
detect_language: false,
audio_ctx: 0,
- max_len: 0,
- prompt: "",
- print_progress: false,
- progress_callback: (progress) => {
- console.log(`Progress: ${progress}`);
- },
- max_context: -1
+ max_len: 0
};
-describe("Run whisper.node", () => {
- test("it should receive a non-empty value", async () => {
- let result = await whisperAsync(whisperParamsMock);
- console.log(result);
+describe('Whisper.cpp Node.js addon with VAD support', () => {
+ test('Basic whisper transcription without VAD', async () => {
+ const params = {
+ ...commonParams,
+ vad: false
+ };
- expect(result['transcription'].length).toBeGreaterThan(0);
- }, 10000);
+ const result = await whisperAsync(params);
+
+ expect(typeof result).toBe('object');
+ expect(Array.isArray(result.transcription)).toBe(true);
+ expect(result.transcription.length).toBeGreaterThan(0);
+
+ // Check that we got some transcription text
+ const text = result.transcription.map(segment => segment[2]).join(' ');
+ expect(text.length).toBeGreaterThan(0);
+ expect(text.toLowerCase()).toContain('ask not');
+ }, 30000);
+
+ test('VAD parameters validation', async () => {
+ // Test with invalid VAD model - should return empty transcription
+ const invalidParams = {
+ ...commonParams,
+ vad: true,
+ vad_model: 'non-existent-model.bin',
+ vad_threshold: 0.5
+ };
+
+ // This should handle the error gracefully and return empty transcription
+ const result = await whisperAsync(invalidParams);
+ expect(typeof result).toBe('object');
+ expect(Array.isArray(result.transcription)).toBe(true);
+ // When VAD model doesn't exist, it should return empty transcription
+ expect(result.transcription.length).toBe(0);
+ }, 10000);
+
+ test('VAD parameter parsing', async () => {
+ // Test that VAD parameters are properly parsed (even if VAD model doesn't exist)
+ const vadParams = {
+ ...commonParams,
+ vad: false, // Disabled so no model required
+ vad_threshold: 0.7,
+ vad_min_speech_duration_ms: 300,
+ vad_min_silence_duration_ms: 150,
+ vad_max_speech_duration_s: 45.0,
+ vad_speech_pad_ms: 50,
+ vad_samples_overlap: 0.15
+ };
+
+ const result = await whisperAsync(vadParams);
+
+ expect(typeof result).toBe('object');
+ expect(Array.isArray(result.transcription)).toBe(true);
+ }, 30000);
+
+ test('Progress callback with VAD disabled', async () => {
+ let progressCalled = false;
+ let lastProgress = 0;
+
+ const params = {
+ ...commonParams,
+ vad: false,
+ progress_callback: (progress) => {
+ progressCalled = true;
+ lastProgress = progress;
+ expect(progress).toBeGreaterThanOrEqual(0);
+ expect(progress).toBeLessThanOrEqual(100);
+ }
+ };
+
+ const result = await whisperAsync(params);
+
+ expect(progressCalled).toBe(true);
+ expect(lastProgress).toBe(100);
+ expect(typeof result).toBe('object');
+ }, 30000);
+
+ test('Language detection without VAD', async () => {
+ const params = {
+ ...commonParams,
+ vad: false,
+ detect_language: true,
+ language: 'auto'
+ };
+
+ const result = await whisperAsync(params);
+
+ expect(typeof result).toBe('object');
+ expect(typeof result.language).toBe('string');
+ expect(result.language.length).toBeGreaterThan(0);
+ }, 30000);
+
+ test('Basic transcription with all VAD parameters set', async () => {
+ // Test with VAD disabled but all parameters set to ensure no crashes
+ const params = {
+ ...commonParams,
+ vad: false, // Disabled so it works without VAD model
+ vad_model: '', // Empty model path
+ vad_threshold: 0.6,
+ vad_min_speech_duration_ms: 200,
+ vad_min_silence_duration_ms: 80,
+ vad_max_speech_duration_s: 25.0,
+ vad_speech_pad_ms: 40,
+ vad_samples_overlap: 0.08
+ };
+
+ const result = await whisperAsync(params);
+
+ expect(typeof result).toBe('object');
+ expect(Array.isArray(result.transcription)).toBe(true);
+ expect(result.transcription.length).toBeGreaterThan(0);
+ }, 30000);
});
#include <vector>
#include <cmath>
#include <cstdint>
+#include <cfloat>
struct whisper_params {
int32_t n_threads = std::min(4, (int32_t) std::thread::hardware_concurrency());
std::vector<std::string> fname_out = {};
std::vector<float> pcmf32 = {}; // mono-channel F32 PCM
+
+ // Voice Activity Detection (VAD) parameters
+ bool vad = false;
+ std::string vad_model = "";
+ float vad_threshold = 0.5f;
+ int vad_min_speech_duration_ms = 250;
+ int vad_min_silence_duration_ms = 100;
+ float vad_max_speech_duration_s = FLT_MAX;
+ int vad_speech_pad_ms = 30;
+ float vad_samples_overlap = 0.1f;
};
struct whisper_print_user_data {
};
wparams.progress_callback_user_data = this;
- // Abort mechanism example
- {
- static bool is_aborted = false; // Note: this should be atomic to avoid data races
+ // Set VAD parameters
+ wparams.vad = params.vad;
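+ // note: whisper reads vad_model_path during the whisper_full_parallel() call below, so params.vad_model must stay alive until it returns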
+ wparams.vad_model_path = params.vad_model.c_str();
- wparams.encoder_begin_callback = [](struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, void * user_data) {
- bool is_aborted = *(bool*)user_data;
- return !is_aborted;
- };
- wparams.encoder_begin_callback_user_data = &is_aborted;
- }
+ wparams.vad_params.threshold = params.vad_threshold;
+ wparams.vad_params.min_speech_duration_ms = params.vad_min_speech_duration_ms;
+ wparams.vad_params.min_silence_duration_ms = params.vad_min_silence_duration_ms;
+ wparams.vad_params.max_speech_duration_s = params.vad_max_speech_duration_s;
+ wparams.vad_params.speech_pad_ms = params.vad_speech_pad_ms;
+ wparams.vad_params.samples_overlap = params.vad_samples_overlap;
if (whisper_full_parallel(ctx, wparams, pcmf32.data(), pcmf32.size(), params.n_processors) != 0) {
fprintf(stderr, "failed to process audio\n");
std::string language = whisper_params.Get("language").As<Napi::String>();
std::string model = whisper_params.Get("model").As<Napi::String>();
std::string input = whisper_params.Get("fname_inp").As<Napi::String>();
- bool use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
- bool flash_attn = whisper_params.Get("flash_attn").As<Napi::Boolean>();
- bool no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
- bool no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
- bool detect_language = whisper_params.Get("detect_language").As<Napi::Boolean>();
- int32_t audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
- bool comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
- int32_t max_len = whisper_params.Get("max_len").As<Napi::Number>();
+
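+ // Optional parameters: each falls back to its default when the key is absent or has the wrong type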
+ bool use_gpu = true;
+ if (whisper_params.Has("use_gpu") && whisper_params.Get("use_gpu").IsBoolean()) {
+ use_gpu = whisper_params.Get("use_gpu").As<Napi::Boolean>();
+ }
+
+ bool flash_attn = false;
+ if (whisper_params.Has("flash_attn") && whisper_params.Get("flash_attn").IsBoolean()) {
+ flash_attn = whisper_params.Get("flash_attn").As<Napi::Boolean>();
+ }
+
+ bool no_prints = false;
+ if (whisper_params.Has("no_prints") && whisper_params.Get("no_prints").IsBoolean()) {
+ no_prints = whisper_params.Get("no_prints").As<Napi::Boolean>();
+ }
+
+ bool no_timestamps = false;
+ if (whisper_params.Has("no_timestamps") && whisper_params.Get("no_timestamps").IsBoolean()) {
+ no_timestamps = whisper_params.Get("no_timestamps").As<Napi::Boolean>();
+ }
+
+ bool detect_language = false;
+ if (whisper_params.Has("detect_language") && whisper_params.Get("detect_language").IsBoolean()) {
+ detect_language = whisper_params.Get("detect_language").As<Napi::Boolean>();
+ }
+
+ int32_t audio_ctx = 0;
+ if (whisper_params.Has("audio_ctx") && whisper_params.Get("audio_ctx").IsNumber()) {
+ audio_ctx = whisper_params.Get("audio_ctx").As<Napi::Number>();
+ }
+
+ bool comma_in_time = true;
+ if (whisper_params.Has("comma_in_time") && whisper_params.Get("comma_in_time").IsBoolean()) {
+ comma_in_time = whisper_params.Get("comma_in_time").As<Napi::Boolean>();
+ }
+
+ int32_t max_len = 0;
+ if (whisper_params.Has("max_len") && whisper_params.Get("max_len").IsNumber()) {
+ max_len = whisper_params.Get("max_len").As<Napi::Number>();
+ }
// Add support for max_context
int32_t max_context = -1;
// Add support for print_progress
bool print_progress = false;
- if (whisper_params.Has("print_progress")) {
+ if (whisper_params.Has("print_progress") && whisper_params.Get("print_progress").IsBoolean()) {
print_progress = whisper_params.Get("print_progress").As<Napi::Boolean>();
}
// Add support for progress_callback
progress_callback = whisper_params.Get("progress_callback").As<Napi::Function>();
}
+ // Add support for VAD parameters
+ bool vad = false;
+ if (whisper_params.Has("vad") && whisper_params.Get("vad").IsBoolean()) {
+ vad = whisper_params.Get("vad").As<Napi::Boolean>();
+ }
+
+ std::string vad_model = "";
+ if (whisper_params.Has("vad_model") && whisper_params.Get("vad_model").IsString()) {
+ vad_model = whisper_params.Get("vad_model").As<Napi::String>();
+ }
+
+ float vad_threshold = 0.5f;
+ if (whisper_params.Has("vad_threshold") && whisper_params.Get("vad_threshold").IsNumber()) {
+ vad_threshold = whisper_params.Get("vad_threshold").As<Napi::Number>();
+ }
+
+ int vad_min_speech_duration_ms = 250;
+ if (whisper_params.Has("vad_min_speech_duration_ms") && whisper_params.Get("vad_min_speech_duration_ms").IsNumber()) {
+ vad_min_speech_duration_ms = whisper_params.Get("vad_min_speech_duration_ms").As<Napi::Number>();
+ }
+
+ int vad_min_silence_duration_ms = 100;
+ if (whisper_params.Has("vad_min_silence_duration_ms") && whisper_params.Get("vad_min_silence_duration_ms").IsNumber()) {
+ vad_min_silence_duration_ms = whisper_params.Get("vad_min_silence_duration_ms").As<Napi::Number>();
+ }
+
+ float vad_max_speech_duration_s = FLT_MAX;
+ if (whisper_params.Has("vad_max_speech_duration_s") && whisper_params.Get("vad_max_speech_duration_s").IsNumber()) {
+ vad_max_speech_duration_s = whisper_params.Get("vad_max_speech_duration_s").As<Napi::Number>();
+ }
+
+ int vad_speech_pad_ms = 30;
+ if (whisper_params.Has("vad_speech_pad_ms") && whisper_params.Get("vad_speech_pad_ms").IsNumber()) {
+ vad_speech_pad_ms = whisper_params.Get("vad_speech_pad_ms").As<Napi::Number>();
+ }
+
+ float vad_samples_overlap = 0.1f;
+ if (whisper_params.Has("vad_samples_overlap") && whisper_params.Get("vad_samples_overlap").IsNumber()) {
+ vad_samples_overlap = whisper_params.Get("vad_samples_overlap").As<Napi::Number>();
+ }
+
Napi::Value pcmf32Value = whisper_params.Get("pcmf32");
std::vector<float> pcmf32_vec;
if (pcmf32Value.IsTypedArray()) {
params.prompt = prompt;
params.detect_language = detect_language;
+ // Set VAD parameters
+ params.vad = vad;
+ params.vad_model = vad_model;
+ params.vad_threshold = vad_threshold;
+ params.vad_min_speech_duration_ms = vad_min_speech_duration_ms;
+ params.vad_min_silence_duration_ms = vad_min_silence_duration_ms;
+ params.vad_max_speech_duration_s = vad_max_speech_duration_s;
+ params.vad_speech_pad_ms = vad_speech_pad_ms;
+ params.vad_samples_overlap = vad_samples_overlap;
+
Napi::Function callback = info[1].As<Napi::Function>();
// Create a new Worker class with progress callback support
ProgressWorker* worker = new ProgressWorker(callback, params, progress_callback, env);
--- /dev/null
+++ b/examples/addon.node/vad-example.js
+const path = require("path");
+const { whisper } = require(path.join(
+ __dirname,
+ "../../build/Release/addon.node"
+));
+const { promisify } = require("util");
+
+const whisperAsync = promisify(whisper);
+
+// Example with VAD enabled
+const vadParams = {
+ language: "en",
+ model: path.join(__dirname, "../../models/ggml-base.en.bin"),
+ fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
+ use_gpu: true,
+ flash_attn: false,
+ no_prints: false,
+ comma_in_time: true,
+ translate: false,
+ no_timestamps: false,
+ detect_language: false,
+ audio_ctx: 0,
+ max_len: 0,
+ // VAD parameters
+ vad: true,
+ vad_model: path.join(__dirname, "../../models/ggml-silero-v5.1.2.bin"), // You need to download this model
+ vad_threshold: 0.5,
+ vad_min_speech_duration_ms: 250,
+ vad_min_silence_duration_ms: 100,
+ vad_max_speech_duration_s: 30.0,
+ vad_speech_pad_ms: 30,
+ vad_samples_overlap: 0.1,
+ progress_callback: (progress) => {
+ console.log(`VAD Transcription progress: ${progress}%`);
+ }
+};
+
+// Example without VAD (traditional approach)
+const traditionalParams = {
+ language: "en",
+ model: path.join(__dirname, "../../models/ggml-base.en.bin"),
+ fname_inp: path.join(__dirname, "../../samples/jfk.wav"),
+ use_gpu: true,
+ flash_attn: false,
+ no_prints: false,
+ comma_in_time: true,
+ translate: false,
+ no_timestamps: false,
+ detect_language: false,
+ audio_ctx: 0,
+ max_len: 0,
+ vad: false, // Explicitly disable VAD
+ progress_callback: (progress) => {
+ console.log(`Traditional transcription progress: ${progress}%`);
+ }
+};
+
+async function runVADExample() {
+ try {
+ console.log("=== Whisper.cpp Node.js VAD Example ===\n");
+
+ // Check if VAD model exists
+ const fs = require('fs');
+ if (!fs.existsSync(vadParams.vad_model)) {
+ console.log("ā ļø VAD model not found. Please download the VAD model first:");
+ console.log(" ./models/download-vad-model.sh silero-v5.1.2");
+ console.log(" Or run: python models/convert-silero-vad-to-ggml.py");
+ console.log("\n Falling back to traditional transcription without VAD...\n");
+
+ // Run without VAD
+ console.log("šµ Running traditional transcription...");
+ const traditionalResult = await whisperAsync(traditionalParams);
+ console.log("\nš Traditional transcription result:");
+ console.log(traditionalResult);
+ return;
+ }
+
+ console.log("šµ Running transcription with VAD enabled...");
+ console.log("VAD Parameters:");
+ console.log(` - Threshold: ${vadParams.vad_threshold}`);
+ console.log(` - Min speech duration: ${vadParams.vad_min_speech_duration_ms}ms`);
+ console.log(` - Min silence duration: ${vadParams.vad_min_silence_duration_ms}ms`);
+ console.log(` - Max speech duration: ${vadParams.vad_max_speech_duration_s}s`);
+ console.log(` - Speech padding: ${vadParams.vad_speech_pad_ms}ms`);
+ console.log(` - Samples overlap: ${vadParams.vad_samples_overlap}\n`);
+
+ const startTime = Date.now();
+ const vadResult = await whisperAsync(vadParams);
+ const vadDuration = Date.now() - startTime;
+
+ console.log("\nā
VAD transcription completed!");
+ console.log(`ā±ļø Processing time: ${vadDuration}ms`);
+ console.log("\nš VAD transcription result:");
+ console.log(vadResult);
+
+ // Compare with traditional approach
+ console.log("\nš Running traditional transcription for comparison...");
+ const traditionalStartTime = Date.now();
+ const traditionalResult = await whisperAsync(traditionalParams);
+ const traditionalDuration = Date.now() - traditionalStartTime;
+
+ console.log("\nā
Traditional transcription completed!");
+ console.log(`ā±ļø Processing time: ${traditionalDuration}ms`);
+ console.log("\nš Traditional transcription result:");
+ console.log(traditionalResult);
+
+ // Performance comparison
+ console.log("\nš Performance Comparison:");
+ console.log(`VAD: ${vadDuration}ms`);
+ console.log(`Traditional: ${traditionalDuration}ms`);
+ const speedup = traditionalDuration / vadDuration;
+ if (speedup > 1) {
+ console.log(`š VAD is ${speedup.toFixed(2)}x faster!`);
+ } else {
+ console.log(`ā¹ļø Traditional approach was ${(1/speedup).toFixed(2)}x faster in this case.`);
+ }
+
+ } catch (error) {
+ console.error("ā Error during transcription:", error);
+ }
+}
+
+// Run the example
+if (require.main === module) {
+ runVADExample();
+}
+
+module.exports = {
+ runVADExample,
+ vadParams,
+ traditionalParams
+};
\ No newline at end of file