The second argument `samples` may be an array, an object with `length` and `each` methods, or a MemoryView. If you can prepare audio data as a C array and export it as a MemoryView, whispercpp accepts it and works with it without copying.
+Using VAD separately from ASR
+-----------------------------
+
+The VAD feature is useful on its own, so you can use it separately from ASR:
+
+```ruby
+vad = Whisper::VAD::Context.new("silero-v5.1.2")
+vad
+ .detect("path/to/audio.wav", Whisper::VAD::Params.new)
+ .each_with_index do |segment, index|
+ segment => {start_time: st, end_time: ed} # `Segment` responds to `#deconstruct_keys`
+
+ puts "[%{nth}: %{st} --> %{ed}]" % {nth: index + 1, st:, ed:}
+ end
+```
+
Development
-----------
VALUE mVAD;
VALUE cContext;
VALUE cParams;
+VALUE cVADContext;
VALUE cVADParams;
+VALUE cVADSegments;
+VALUE cVADSegment;
VALUE eError;
VALUE cSegment;
extern void init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cSegment);
extern void init_ruby_whisper_model(VALUE *mWhisper);
extern void init_ruby_whisper_vad_params(VALUE *mVAD);
+extern void init_ruby_whisper_vad_context(VALUE *mVAD);
+extern void init_ruby_whisper_vad_segment(VALUE *mVAD);
+extern void init_ruby_whisper_vad_segments(VALUE *mVAD);
extern void register_callbacks(ruby_whisper_params *rwp, VALUE *context);
/*
init_ruby_whisper_segment(&mWhisper, &cContext);
init_ruby_whisper_model(&mWhisper);
init_ruby_whisper_vad_params(&mVAD);
+ init_ruby_whisper_vad_segment(&mVAD);
+ init_ruby_whisper_vad_segments(&mVAD);
+ init_ruby_whisper_vad_context(&mVAD);
rb_require("whisper/context");
rb_require("whisper/segment");
VALUE context;
} ruby_whisper_model;
+typedef struct { /* Native payload wrapped by a Whisper::VAD::Segments object. */
+ struct whisper_vad_segments *segments; /* NULL until assigned; released with whisper_vad_free_segments() by the type's free callback. */
+} ruby_whisper_vad_segments;
+
+typedef struct { /* Native payload wrapped by a Whisper::VAD::Segment object. */
+ VALUE segments; /* Ruby Segments object this segment belongs to (Qnil when freshly allocated); marked by the type's mark callback. */
+ int index; /* Position of this segment inside `segments`; -1 when freshly allocated. */
+} ruby_whisper_vad_segment;
+
+typedef struct { /* Native payload wrapped by a Whisper::VAD::Context object. */
+ struct whisper_vad_context *context; /* NULL until #initialize succeeds; released with whisper_vad_free() by the type's free callback. */
+} ruby_whisper_vad_context;
+
#endif
if (!rws) {
return 0;
}
+ if (rws->index) {
+ size += sizeof(rws->index);
+ }
return size;
}
--- /dev/null
+#include <ruby.h>
+#include "ruby_whisper.h"
+
+extern ID id_to_s;
+
+extern VALUE cVADContext;
+
+extern VALUE ruby_whisper_vad_detect(VALUE self, VALUE file_path, VALUE params);
+extern VALUE ruby_whisper_normalize_model_path(VALUE model_path);
+
+/*
+ * dsize callback of ruby_whisper_vad_context_type.
+ *
+ * Returns the number of bytes held by the wrapper struct, or 0 when no
+ * struct is attached. The native whisper_vad_context is an opaque type, so
+ * its own footprint cannot be measured here and is not included.
+ */
+static size_t
+ruby_whisper_vad_context_memsize(const void *p)
+{
+  const ruby_whisper_vad_context *rwvc = p;
+
+  if (!rwvc) {
+    return 0;
+  }
+  /* sizeof(*rwvc) measures the struct; sizeof(rwvc) would only measure a
+   * pointer, and the `context` member is already part of the struct. */
+  return sizeof(*rwvc);
+}
+
+/*
+ * dfree callback of ruby_whisper_vad_context_type: releases the native VAD
+ * context, if one was attached, and then the wrapper struct itself.
+ */
+static void
+ruby_whisper_vad_context_free(void *p)
+{
+  ruby_whisper_vad_context *payload = p;
+
+  if (payload->context != NULL) {
+    whisper_vad_free(payload->context);
+    payload->context = NULL;
+  }
+
+  xfree(payload);
+}
+
+/*
+ * TypedData descriptor for Whisper::VAD::Context. There is no mark
+ * callback because the payload holds no Ruby objects.
+ */
+const rb_data_type_t ruby_whisper_vad_context_type = {
+  .wrap_struct_name = "ruby_whisper_vad_context",
+  .function = {
+    .dmark = 0,
+    .dfree = ruby_whisper_vad_context_free,
+    .dsize = ruby_whisper_vad_context_memsize,
+  },
+  .parent = 0,
+  .data = 0,
+  .flags = 0
+};
+
+/*
+ * Allocator for Whisper::VAD::Context: creates the wrapper object with no
+ * native context attached yet.
+ */
+static VALUE
+ruby_whisper_vad_context_s_allocate(VALUE klass)
+{
+  ruby_whisper_vad_context *payload;
+  const VALUE instance =
+    TypedData_Make_Struct(klass, ruby_whisper_vad_context, &ruby_whisper_vad_context_type, payload);
+
+  payload->context = NULL;
+
+  return instance;
+}
+
+/*
+ * Whisper::VAD::Context#initialize(model_path)
+ *
+ * Normalizes model_path with ruby_whisper_normalize_model_path() and loads
+ * the VAD model with the default context parameters.
+ *
+ * Raises RuntimeError when the native context cannot be created; an
+ * already attached context is left untouched in that case.
+ */
+static VALUE
+ruby_whisper_vad_context_initialize(VALUE self, VALUE model_path)
+{
+  ruby_whisper_vad_context *rwvc;
+  struct whisper_vad_context *context;
+
+  model_path = ruby_whisper_normalize_model_path(model_path);
+  context = whisper_vad_init_from_file_with_params(StringValueCStr(model_path), whisper_vad_default_context_params());
+  if (context == NULL) {
+    rb_raise(rb_eRuntimeError, "Failed to initialize whisper VAD context");
+  }
+  TypedData_Get_Struct(self, ruby_whisper_vad_context, &ruby_whisper_vad_context_type, rwvc);
+  /* #initialize can be invoked again on a live object; release the
+   * previous native context instead of leaking it. */
+  if (rwvc->context != NULL) {
+    whisper_vad_free(rwvc->context);
+  }
+  rwvc->context = context;
+
+  return Qnil;
+}
+
+/*
+ * Defines the Context class under the given VAD module and registers its
+ * allocator and instance methods.
+ */
+void
+init_ruby_whisper_vad_context(VALUE *mVAD)
+{
+  const VALUE klass = rb_define_class_under(*mVAD, "Context", rb_cObject);
+
+  cVADContext = klass;
+  rb_define_alloc_func(klass, ruby_whisper_vad_context_s_allocate);
+  rb_define_method(klass, "detect", ruby_whisper_vad_detect, 2);
+  rb_define_method(klass, "initialize", ruby_whisper_vad_context_initialize, 1);
+}
--- /dev/null
+#include <ruby.h>
+#include "ruby_whisper.h"
+#include "common-whisper.h"
+#include <string>
+#include <vector>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+extern VALUE cVADSegments;
+
+extern const rb_data_type_t ruby_whisper_vad_context_type;
+extern const rb_data_type_t ruby_whisper_vad_params_type;
+extern const rb_data_type_t ruby_whisper_vad_segments_type;
+
+extern VALUE ruby_whisper_vad_segments_s_init(struct whisper_vad_segments *segments);
+
+/*
+ * Whisper::VAD::Context#detect(file_path, params) -> Whisper::VAD::Segments
+ *
+ * Reads the audio file at file_path with read_audio_data(), runs
+ * whisper_vad_segments_from_samples() on the samples using the receiver's
+ * native context and the given VAD params, and wraps the result.
+ *
+ * Raises RuntimeError when the receiver has no native context, when the
+ * file cannot be read, or when detection fails. Raises TypeError when
+ * params is not a VAD params object or file_path is not string-like.
+ */
+VALUE
+ruby_whisper_vad_detect(VALUE self, VALUE file_path, VALUE params) {
+  ruby_whisper_vad_context *rwvc;
+  ruby_whisper_vad_params *rwvp;
+  const char *c_file_path;
+  whisper_vad_segments *segments = nullptr;
+  bool audio_loaded;
+
+  TypedData_Get_Struct(self, ruby_whisper_vad_context, &ruby_whisper_vad_context_type, rwvc);
+  if (rwvc->context == NULL) {
+    rb_raise(rb_eRuntimeError, "Doesn't have reference to context internally");
+  }
+  TypedData_Get_Struct(params, ruby_whisper_vad_params, &ruby_whisper_vad_params_type, rwvp);
+
+  c_file_path = StringValueCStr(file_path);
+
+  /*
+   * rb_raise() leaves this function with longjmp and does not run C++
+   * destructors. Keep every C++ object inside this scope, which contains
+   * no Ruby API call, so the sample buffers are always released before an
+   * exception can be raised below.
+   */
+  {
+    std::string cpp_file_path(c_file_path);
+    std::vector<float> pcmf32;
+    std::vector<std::vector<float>> pcmf32s;
+
+    audio_loaded = read_audio_data(cpp_file_path, pcmf32, pcmf32s, false);
+    if (audio_loaded) {
+      segments = whisper_vad_segments_from_samples(rwvc->context, rwvp->params, pcmf32.data(), pcmf32.size());
+    }
+  }
+
+  if (!audio_loaded) {
+    rb_raise(rb_eRuntimeError, "Failed to open '%s' as WAV file\n", c_file_path);
+  }
+  if (segments == nullptr) {
+    rb_raise(rb_eRuntimeError, "Failed to process audio\n");
+  }
+
+  return ruby_whisper_vad_segments_s_init(segments);
+}
+
+#ifdef __cplusplus
+}
+#endif
--- /dev/null
+#include <ruby.h>
+#include "ruby_whisper.h"
+
+#define N_KEY_NAMES 2
+
+extern VALUE cVADSegment;
+
+extern const rb_data_type_t ruby_whisper_vad_segments_type;
+
+static VALUE sym_start_time;
+static VALUE sym_end_time;
+static VALUE key_names;
+
+/*
+ * dmark callback of ruby_whisper_vad_segment_type: marks the Ruby object
+ * stored in the `segments` member so the GC keeps it alive.
+ */
+static void
+rb_whisper_vad_segment_mark(void *p)
+{
+  const ruby_whisper_vad_segment *payload = p;
+
+  rb_gc_mark(payload->segments);
+}
+
+/*
+ * dsize callback of ruby_whisper_vad_segment_type.
+ *
+ * Returns the size of the wrapper struct, or 0 when no struct is attached.
+ * The struct only stores a VALUE reference and an index, both of which are
+ * members of the struct itself, so nothing else is added.
+ */
+static size_t
+ruby_whisper_vad_segment_memsize(const void *p)
+{
+  const ruby_whisper_vad_segment *rwvs = p;
+
+  if (!rwvs) {
+    return 0;
+  }
+  /* sizeof(*rwvs) measures the struct; sizeof(rwvs) would only measure a
+   * pointer, and the result must not depend on the value of `index`. */
+  return sizeof(*rwvs);
+}
+
+/*
+ * TypedData descriptor for Whisper::VAD::Segment. The payload references a
+ * Ruby object, hence the mark callback; the struct itself is released with
+ * the default free function.
+ */
+static const rb_data_type_t ruby_whisper_vad_segment_type = {
+  .wrap_struct_name = "ruby_whisper_vad_segment",
+  .function = {
+    .dmark = rb_whisper_vad_segment_mark,
+    .dfree = RUBY_DEFAULT_FREE,
+    .dsize = ruby_whisper_vad_segment_memsize,
+  },
+  .parent = 0,
+  .data = 0,
+  .flags = 0
+};
+
+/*
+ * Allocator for Whisper::VAD::Segment: creates a wrapper that does not yet
+ * refer to any Segments object (segments = nil, index = -1).
+ */
+static VALUE
+ruby_whisper_vad_segment_s_allocate(VALUE klass)
+{
+  ruby_whisper_vad_segment *payload;
+  const VALUE instance =
+    TypedData_Make_Struct(klass, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, payload);
+
+  payload->index = -1;
+  payload->segments = Qnil;
+
+  return instance;
+}
+
+/*
+ * Builds a Whisper::VAD::Segment that refers to entry `index` of the given
+ * Ruby Segments object.
+ */
+VALUE
+rb_whisper_vad_segment_s_new(VALUE segments, int index)
+{
+  ruby_whisper_vad_segment *payload;
+  const VALUE instance = ruby_whisper_vad_segment_s_allocate(cVADSegment);
+
+  TypedData_Get_Struct(instance, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, payload);
+  payload->index = index;
+  payload->segments = segments;
+
+  return instance;
+}
+
+/*
+ * Whisper::VAD::Segment#start_time -> Float
+ *
+ * Returns the segment's t0 as reported by whisper, multiplied by 10.
+ * NOTE(review): the factor presumably converts centiseconds to
+ * milliseconds - confirm against whisper.h.
+ * Raises TypeError when the segment is not backed by a Segments object.
+ */
+static VALUE
+ruby_whisper_vad_segment_get_start_time(VALUE self)
+{
+  ruby_whisper_vad_segment *segment;
+  ruby_whisper_vad_segments *owner;
+
+  TypedData_Get_Struct(self, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, segment);
+  TypedData_Get_Struct(segment->segments, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, owner);
+
+  const float start = whisper_vad_segments_get_segment_t0(owner->segments, segment->index);
+
+  return DBL2NUM(start * 10);
+}
+
+/*
+ * Whisper::VAD::Segment#end_time -> Float
+ *
+ * Returns the segment's t1 as reported by whisper, multiplied by 10.
+ * NOTE(review): the factor presumably converts centiseconds to
+ * milliseconds - confirm against whisper.h.
+ * Raises TypeError when the segment is not backed by a Segments object.
+ */
+static VALUE
+ruby_whisper_vad_segment_get_end_time(VALUE self)
+{
+  ruby_whisper_vad_segment *segment;
+  ruby_whisper_vad_segments *owner;
+
+  TypedData_Get_Struct(self, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, segment);
+  TypedData_Get_Struct(segment->segments, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, owner);
+
+  const float finish = whisper_vad_segments_get_segment_t1(owner->segments, segment->index);
+
+  return DBL2NUM(finish * 10);
+}
+
+/*
+ * Whisper::VAD::Segment#deconstruct_keys(keys) -> Hash
+ *
+ * Pattern-matching hook. `keys` is either nil (all keys are returned) or an
+ * Array of requested keys; only :start_time and :end_time are recognized.
+ * When more keys are requested than exist, an empty Hash is returned
+ * because such a pattern can never match.
+ *
+ * Raises TypeError when `keys` is neither nil nor an Array, or when the
+ * segment is not backed by a Segments object.
+ */
+static VALUE
+ruby_whisper_vad_segment_deconstruct_keys(VALUE self, VALUE keys)
+{
+  ruby_whisper_vad_segment *rwvs;
+  ruby_whisper_vad_segments *rwvss;
+  VALUE hash, key;
+  long n_keys, i;
+
+  /* Fetched for validation: raises for a segment without a backing
+   * Segments object even when no key ends up being looked up. */
+  TypedData_Get_Struct(self, ruby_whisper_vad_segment, &ruby_whisper_vad_segment_type, rwvs);
+  TypedData_Get_Struct(rwvs->segments, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, rwvss);
+
+  hash = rb_hash_new();
+  if (NIL_P(keys)) {
+    keys = key_names;
+    n_keys = N_KEY_NAMES;
+  } else {
+    /* RARRAY_LEN() on a non-Array is undefined behaviour; reject it first. */
+    Check_Type(keys, T_ARRAY);
+    n_keys = RARRAY_LEN(keys);
+    if (n_keys > N_KEY_NAMES) {
+      return hash;
+    }
+  }
+  for (i = 0; i < n_keys; i++) {
+    key = rb_ary_entry(keys, i);
+    if (key == sym_start_time) {
+      rb_hash_aset(hash, key, ruby_whisper_vad_segment_get_start_time(self));
+    }
+    if (key == sym_end_time) {
+      rb_hash_aset(hash, key, ruby_whisper_vad_segment_get_end_time(self));
+    }
+  }
+
+  return hash;
+}
+
+/*
+ * Defines the Segment class under the given VAD module, prepares the
+ * symbols used by #deconstruct_keys, and registers the allocator and
+ * instance methods.
+ */
+void
+init_ruby_whisper_vad_segment(VALUE *mVAD)
+{
+  cVADSegment = rb_define_class_under(*mVAD, "Segment", rb_cObject);
+
+  sym_start_time = ID2SYM(rb_intern("start_time"));
+  sym_end_time = ID2SYM(rb_intern("end_time"));
+
+  /* key_names lives in a C static that the GC cannot see. Register its
+   * address so the Array is not collected while #deconstruct_keys still
+   * reads it. */
+  rb_global_variable(&key_names);
+  key_names = rb_ary_new3(
+    N_KEY_NAMES,
+    sym_start_time,
+    sym_end_time
+  );
+
+  rb_define_alloc_func(cVADSegment, ruby_whisper_vad_segment_s_allocate);
+  rb_define_method(cVADSegment, "start_time", ruby_whisper_vad_segment_get_start_time, 0);
+  rb_define_method(cVADSegment, "end_time", ruby_whisper_vad_segment_get_end_time, 0);
+  rb_define_method(cVADSegment, "deconstruct_keys", ruby_whisper_vad_segment_deconstruct_keys, 1);
+}
--- /dev/null
+#include <ruby.h>
+#include "ruby_whisper.h"
+
+extern ID id___method__;
+extern ID id_to_enum;
+
+extern VALUE cVADSegments;
+
+extern VALUE rb_whisper_vad_segment_s_new(VALUE segments, int index);
+
+/*
+ * dsize callback of ruby_whisper_vad_segments_type.
+ *
+ * Returns the number of bytes held by the wrapper struct, or 0 when no
+ * struct is attached. The native whisper_vad_segments is an opaque type, so
+ * its own footprint cannot be measured here and is not included.
+ */
+static size_t
+ruby_whisper_vad_segments_memsize(const void *p)
+{
+  const ruby_whisper_vad_segments *rwvss = p;
+
+  if (!rwvss) {
+    return 0;
+  }
+  /* sizeof(*rwvss) measures the struct; sizeof(rwvss) would only measure a
+   * pointer, and the `segments` member is already part of the struct. */
+  return sizeof(*rwvss);
+}
+
+/*
+ * dfree callback of ruby_whisper_vad_segments_type: releases the native
+ * segment list, if one was attached, and then the wrapper struct itself.
+ */
+static void
+ruby_whisper_vad_segments_free(void *p)
+{
+  ruby_whisper_vad_segments *payload = p;
+
+  if (payload->segments != NULL) {
+    whisper_vad_free_segments(payload->segments);
+    payload->segments = NULL;
+  }
+
+  xfree(payload);
+}
+
+/*
+ * TypedData descriptor for Whisper::VAD::Segments. There is no mark
+ * callback because the payload holds no Ruby objects.
+ */
+const rb_data_type_t ruby_whisper_vad_segments_type = {
+  .wrap_struct_name = "ruby_whisper_vad_segments",
+  .function = {
+    .dmark = 0,
+    .dfree = ruby_whisper_vad_segments_free,
+    .dsize = ruby_whisper_vad_segments_memsize,
+  },
+  .parent = 0,
+  .data = 0,
+  .flags = 0
+};
+
+/*
+ * Allocator for Whisper::VAD::Segments: creates the wrapper object with no
+ * native segment list attached yet.
+ */
+static VALUE
+ruby_whisper_vad_segments_s_allocate(VALUE klass)
+{
+  ruby_whisper_vad_segments *payload;
+  const VALUE instance =
+    TypedData_Make_Struct(klass, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, payload);
+
+  payload->segments = NULL;
+
+  return instance;
+}
+
+/*
+ * Wraps a native segment list in a new Whisper::VAD::Segments object. The
+ * returned object's free callback releases `segments`.
+ */
+VALUE
+ruby_whisper_vad_segments_s_init(struct whisper_vad_segments *segments)
+{
+  ruby_whisper_vad_segments *payload;
+  const VALUE wrapper = ruby_whisper_vad_segments_s_allocate(cVADSegments);
+
+  TypedData_Get_Struct(wrapper, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, payload);
+  payload->segments = segments;
+
+  return wrapper;
+}
+
+/*
+ * Whisper::VAD::Segments#each { |segment| ... } -> self
+ * Whisper::VAD::Segments#each -> Enumerator
+ *
+ * Yields one Whisper::VAD::Segment per detected segment, in index order.
+ * Without a block, returns the result of to_enum for this method.
+ * Raises RuntimeError when no native segment list is attached.
+ */
+static VALUE
+ruby_whisper_vad_segments_each(VALUE self)
+{
+  ruby_whisper_vad_segments *payload;
+  int total;
+  int position = 0;
+
+  if (!rb_block_given_p()) {
+    return rb_funcall(self, id_to_enum, 1, rb_funcall(self, id___method__, 0));
+  }
+
+  TypedData_Get_Struct(self, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, payload);
+  if (!payload->segments) {
+    rb_raise(rb_eRuntimeError, "Doesn't have reference to segments internally");
+  }
+
+  total = whisper_vad_segments_n_segments(payload->segments);
+  while (position < total) {
+    rb_yield(rb_whisper_vad_segment_s_new(self, position));
+    position++;
+  }
+
+  return self;
+}
+
+/*
+ * Whisper::VAD::Segments#length -> Integer
+ *
+ * Returns the number of detected segments.
+ * Raises RuntimeError when no native segment list is attached.
+ */
+static VALUE
+ruby_whisper_vad_segments_get_length(VALUE self)
+{
+  ruby_whisper_vad_segments *payload;
+
+  TypedData_Get_Struct(self, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, payload);
+  if (!payload->segments) {
+    rb_raise(rb_eRuntimeError, "Doesn't have reference to segments internally");
+  }
+
+  return INT2NUM(whisper_vad_segments_n_segments(payload->segments));
+}
+
+/*
+ * Defines the Segments class under the given VAD module, registers its
+ * allocator and instance methods, and mixes in Enumerable.
+ */
+void
+init_ruby_whisper_vad_segments(VALUE *mVAD)
+{
+  const VALUE klass = rb_define_class_under(*mVAD, "Segments", rb_cObject);
+
+  cVADSegments = klass;
+  rb_define_alloc_func(klass, ruby_whisper_vad_segments_s_allocate);
+  rb_define_method(klass, "length", ruby_whisper_vad_segments_get_length, 0);
+  rb_define_method(klass, "each", ruby_whisper_vad_segments_each, 0);
+  rb_include_module(klass, rb_path2class("Enumerable"));
+}
def samples_overlap: () -> Float
def ==: (Params) -> (true | false)
end
+
+ class Context # Holds a loaded VAD model and runs voice activity detection with it.
+ def self.new: (String | path | ::URI::HTTP model_name_or_path) -> instance # Raises RuntimeError when the VAD context cannot be initialized.
+ def detect: (path wav_file_path, Params) -> Segments # Raises RuntimeError when the audio file cannot be read or processing fails.
+ end
+
+ class Segments
+   include Enumerable[Segment]
+
+   # With a block, yields each detected segment in order and returns the
+   # receiver; without a block, returns an Enumerator over the segments.
+   def each: { (Segment) -> void } -> self
+           | () -> Enumerator[Segment]
+
+   # Number of detected segments.
+   def length: -> Integer
+ end
+
+ class Segment
+   # Hash produced by #deconstruct_keys. Times are Floats, matching what
+   # #start_time and #end_time return.
+   type deconstructed_keys = {
+     start_time: (Float | nil),
+     end_time: (Float | nil),
+   }
+
+   # Start of the segment, as a Float.
+   def start_time: () -> Float
+
+   # End of the segment, as a Float.
+   def end_time: () -> Float
+
+   # Pattern-matching hook; nil requests every key.
+   def deconstruct_keys: (Array[:start_time | :end_time] | nil) -> deconstructed_keys
+ end
end
class Error < StandardError
--- /dev/null
+require_relative "helper"
+
+class TestVADContext < TestBase
+ def test_initialize
+ context = Whisper::VAD::Context.new("silero-v5.1.2")
+ assert_instance_of Whisper::VAD::Context, context
+ end
+
+ def test_detect
+ context = Whisper::VAD::Context.new("silero-v5.1.2")
+ segments = context.detect(AUDIO, Whisper::VAD::Params.new)
+ assert_instance_of Whisper::VAD::Segments, segments
+
+ i = 0
+ segments.each do |segment|
+ i += 1
+ assert_instance_of Whisper::VAD::Segment, segment
+ end
+ assert i > 0
+
+ segments.each_with_index do |segment, index|
+ assert_instance_of Integer, index
+ end
+
+ assert_instance_of Enumerator, segments.each
+
+ segment = segments.each.first
+ assert_instance_of Float, segment.start_time
+ assert_instance_of Float, segment.end_time
+
+ segment => {start_time:, end_time:}
+ assert_equal segment.start_time, start_time
+ assert_equal segment.end_time, end_time
+
+ assert_equal 5, segments.length
+ end
+
+ def test_invalid_model_type
+ assert_raise TypeError do
+ Whisper::VAD::Context.new(Object.new)
+ end
+ end
+
+ def test_allocate
+ vad = Whisper::VAD::Context.allocate
+ assert_raise do
+ vad.detect(AUDIO, Whisper::VAD::Params.new)
+ end
+ end
+end
--- /dev/null
+require_relative "helper"
+
+class TestVADSegment < TestBase
+ def test_initialize
+ segment = Whisper::VAD::Segment.new
+
+ assert_raise do
+ segment.start_time
+ end
+
+ assert_raise do
+ segments.end_time
+ end
+
+ assert_raise do
+ segment => {start_time:, end_time:}
+ end
+ end
+end
--- /dev/null
+require_relative "helper"
+
+class TestVADSegments < TestBase
+ def test_initialize
+ segments = Whisper::VAD::Segments.new
+
+ assert_raise do
+ segments.each do |segment|
+ end
+ end
+
+ assert_raise do
+ segments.length
+ end
+ end
+end