For details on VAD, see [whisper.cpp's README](https://github.com/ggml-org/whisper.cpp?tab=readme-ov-file#voice-activity-detection-vad).
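+
+As a rough sketch, VAD is configured through `Whisper::Params`; the keyword names below are taken from that README and should be treated as assumptions here:
+
+```ruby
+params = Whisper::Params.new(
+  vad: true,                       # enable voice activity detection
+  vad_model_path: "silero-v5.1.2", # VAD model name (assumed from the README)
+  vad_params: Whisper::VAD::Params.new(threshold: 0.5)
+)
+```
+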
+### Output ###
+
+whispercpp supports SRT and WebVTT output:
+
+```ruby
+puts whisper.transcribe("path/to/audio.wav", Whisper::Params.new).to_webvtt
+# =>
+WEBVTT
+
+1
+00:00:00.000 --> 00:00:03.860
+ My thought I have nobody by a beauty and will as you poured.
+
+2
+00:00:03.860 --> 00:00:09.840
+ Mr. Rochester is sub in that so-don't find simplest, and devoted about, to let might in
+
+3
+00:00:09.840 --> 00:00:09.940
+ a
+
+```
+
+You may call `#to_srt`, too.
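+
+For the same audio it produces the same cues in SRT form (comma-separated milliseconds, no `WEBVTT` header):
+
+```ruby
+puts whisper.transcribe("path/to/audio.wav", Whisper::Params.new).to_srt
+# =>
+1
+00:00:00,000 --> 00:00:03,860
+ My thought I have nobody by a beauty and will as you poured.
+
+2
+00:00:03,860 --> 00:00:09,840
+ Mr. Rochester is sub in that so-don't find simplest, and devoted about, to let might in
+
+3
+00:00:09,840 --> 00:00:09,940
+ a
+
+```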
+
+
API
---
ed: format_time(segment.end_time),
text: segment.text
}
- line << " (speaker turned)" if segment.speaker_next_turn?
+ line << " (speaker turned)" if segment.speaker_turn_next?
puts line
end
init_ruby_whisper_model(&mWhisper);
init_ruby_whisper_vad_params(&mVAD);
+ rb_require("whisper/context");
+ rb_require("whisper/segment");
rb_require("whisper/model/uri");
}
rb_define_method(cContext, "full", ruby_whisper_full, -1);
rb_define_method(cContext, "full_parallel", ruby_whisper_full_parallel, -1);
- // High leve
+ // High level
rb_define_method(cContext, "full_get_segment", ruby_whisper_full_get_segment, 1);
rb_define_method(cContext, "each_segment", ruby_whisper_each_segment, 0);
#include <ruby.h>
#include "ruby_whisper.h"
+#define N_KEY_NAMES 5
+
+static VALUE sym_start_time;
+static VALUE sym_end_time;
+static VALUE sym_text;
+static VALUE sym_no_speech_prob;
+static VALUE sym_speaker_turn_next;
+static VALUE key_names;
+
extern const rb_data_type_t ruby_whisper_type;
extern VALUE cSegment;
return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index));
}
+/*
+ * call-seq:
+ * deconstruct_keys(keys) -> hash
+ *
+ * Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
+ *
+ * whisper.each_segment do |segment|
+ * segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
+ *
+ * puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
+ * end
+ */
+static VALUE
+ruby_whisper_segment_deconstruct_keys(VALUE self, VALUE keys)
+{
+ ruby_whisper_segment *rws;
+ TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
+ ruby_whisper *rw;
+ TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+
+ VALUE hash = rb_hash_new();
+ long n_keys;
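+  /* keys is nil when the pattern needs every key (e.g. `segment => {**rest}`),
+     so fall back to the full key list. */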
+ if (NIL_P(keys)) {
+ keys = key_names;
+ n_keys = N_KEY_NAMES;
+ } else {
+ n_keys = RARRAY_LEN(keys);
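+    /* Requesting more keys than we provide can never match them all,
+       so return the empty hash and let the pattern fail fast. */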
+ if (n_keys > N_KEY_NAMES) {
+ return hash;
+ }
+ }
+ for (int i = 0; i < n_keys; i++) {
+ VALUE key = rb_ary_entry(keys, i);
+    if (key == sym_start_time) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_start_time(self));
+    } else if (key == sym_end_time) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_end_time(self));
+    } else if (key == sym_text) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_text(self));
+    } else if (key == sym_no_speech_prob) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_no_speech_prob(self));
+    } else if (key == sym_speaker_turn_next) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_speaker_turn_next(self));
+    }
+ }
+
+ return hash;
+}
+
void
init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cContext)
{
cSegment = rb_define_class_under(*mWhisper, "Segment", rb_cObject);
+ sym_start_time = ID2SYM(rb_intern("start_time"));
+ sym_end_time = ID2SYM(rb_intern("end_time"));
+ sym_text = ID2SYM(rb_intern("text"));
+ sym_no_speech_prob = ID2SYM(rb_intern("no_speech_prob"));
+ sym_speaker_turn_next = ID2SYM(rb_intern("speaker_turn_next"));
+ key_names = rb_ary_new3(
+ N_KEY_NAMES,
+ sym_start_time,
+ sym_end_time,
+ sym_text,
+ sym_no_speech_prob,
+ sym_speaker_turn_next
+  );
+  rb_global_variable(&key_names); /* protect the key array from GC */
+
rb_define_alloc_func(cSegment, ruby_whisper_segment_allocate);
rb_define_method(cSegment, "start_time", ruby_whisper_segment_get_start_time, 0);
rb_define_method(cSegment, "end_time", ruby_whisper_segment_get_end_time, 0);
- rb_define_method(cSegment, "speaker_next_turn?", ruby_whisper_segment_get_speaker_turn_next, 0);
+ rb_define_method(cSegment, "speaker_turn_next?", ruby_whisper_segment_get_speaker_turn_next, 0);
rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
rb_define_method(cSegment, "no_speech_prob", ruby_whisper_segment_get_no_speech_prob, 0);
+ rb_define_method(cSegment, "deconstruct_keys", ruby_whisper_segment_deconstruct_keys, 1);
}
fprintf(stderr, "failed to process audio\n");
return self;
}
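+  // The concatenated text below is only ever passed to the block,
+  // so return early when no block was given.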
+ if (NIL_P(blk)) {
+ return self;
+ }
const int n_segments = whisper_full_n_segments(rw->context);
VALUE output = rb_str_new2("");
for (int i = 0; i < n_segments; ++i) {
const char * text = whisper_full_get_segment_text(rw->context, i);
output = rb_str_concat(output, rb_str_new2(text));
}
- if (blk != Qnil) {
- rb_funcall(blk, id_call, 1, output);
- }
+ rb_funcall(blk, id_call, 1, output);
return self;
}
#ifdef __cplusplus
--- /dev/null
+module Whisper
+ class Context
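+    # Builds an SRT document: numbered cues, each followed by a blank line.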
+ def to_srt
+ each_segment.with_index.reduce("") {|srt, (segment, index)|
+ srt << "#{index + 1}\n#{segment.to_srt_cue}\n"
+ }
+ end
+
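+    # Builds a WebVTT document: the "WEBVTT" header, a blank line, then
+    # the same numbered cues with dot-separated milliseconds.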
+ def to_webvtt
+ each_segment.with_index.reduce("WEBVTT\n\n") {|webvtt, (segment, index)|
+ webvtt << "#{index + 1}\n#{segment.to_webvtt_cue}\n"
+ }
+ end
+ end
+end
--- /dev/null
+module Whisper
+ class Segment
+    SRT_ESCAPES = {
+      "&" => "&amp;",
+      "<" => "&lt;",
+      ">" => "&gt;",
+    }
+ SRT_ESCAPES_RE = Regexp.union(SRT_ESCAPES.keys)
+ private_constant :SRT_ESCAPES, :SRT_ESCAPES_RE
+
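+    # One SRT cue: "HH:MM:SS,mmm --> HH:MM:SS,mmm" on one line, then the escaped text.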
+ def to_srt_cue
+ "#{srt_start_time} --> #{srt_end_time}\n#{srt_text}\n"
+ end
+
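+    # One WebVTT cue; identical to SRT except milliseconds are separated by a dot.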
+ def to_webvtt_cue
+ "#{webvtt_start_time} --> #{webvtt_end_time}\n#{webvtt_text}\n"
+ end
+
+ private
+
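+    # Splits a time in milliseconds into [hours, minutes, seconds, milliseconds].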
+ def time_to_a(time)
+ sec, decimal_part = time.divmod(1000)
+ min, sec = sec.divmod(60)
+ hour, min = min.divmod(60)
+ [hour, min, sec, decimal_part]
+ end
+
+ def srt_time(time)
+ "%02d:%02d:%02d,%03d" % time_to_a(time)
+ end
+
+ def srt_start_time
+ srt_time(start_time)
+ end
+
+ def srt_end_time
+ srt_time(end_time)
+ end
+
+ def srt_text
+ text.gsub(SRT_ESCAPES_RE, SRT_ESCAPES)
+ end
+
+ def webvtt_time(time)
+ "%02d:%02d:%02d.%03d" % time_to_a(time)
+ end
+
+ def webvtt_start_time
+ webvtt_time(start_time)
+ end
+
+ def webvtt_end_time
+ webvtt_time(end_time)
+ end
+
+ alias webvtt_text srt_text
+ end
+end
def full_parallel: (Params, Array[Float], ?Integer n_samples) -> self
| (Params, _Samples, ?Integer n_samples) -> self
| (Params, _Samples, ?Integer? n_samples, Integer n_processors) -> self
+
+ def to_srt: () -> String
+ def to_webvtt: () -> String
end
class Params
end
class Segment
+ type deconstructed_keys = {
+ start_time: (Integer | nil),
+ end_time: (Integer | nil),
+ text: (String | nil),
+ no_speech_prob: (Float | nil),
+ speaker_turn_next: (true | false | nil)
+ }
+
# Start time in milliseconds.
#
def start_time: () -> Integer
def end_time: () -> Integer
# Whether the next segment is predicted as a speaker turn.
- def speaker_next_turn?: () -> (true | false)
+ def speaker_turn_next?: () -> (true | false)
def text: () -> String
def no_speech_prob: () -> Float
+ def to_srt_cue: () -> String
+ def to_webvtt_cue: () -> String
+
+ # Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
+ #
+ # whisper.each_segment do |segment|
+ # segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
+ #
+ # puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
+ # end
+ def deconstruct_keys: (Array[:start_time | :end_time | :text | :no_speech_prob | :speaker_turn_next] | nil) -> deconstructed_keys
end
module VAD
end
whisper.transcribe(AUDIO, params)
end
+
+ def test_pattern_matching
+ segment = whisper.each_segment.first
+ segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
+
+ assert_equal segment.start_time, start_time
+ assert_equal segment.end_time, end_time
+ assert_equal segment.text, text
+ assert_equal segment.no_speech_prob, no_speech_prob
+ assert_equal segment.speaker_turn_next?, speaker_turn_next
+ end
+
+ def test_pattern_matching_partial
+ segment = whisper.each_segment.first
+ segment => {start_time:, end_time:, text:}
+
+ assert_equal segment.start_time, start_time
+ assert_equal segment.end_time, end_time
+ assert_equal segment.text, text
+ end
+
+ def test_deconstruct_keys
+ segment = whisper.each_segment.first
+ expected = {
+ start_time: segment.start_time,
+ end_time: segment.end_time,
+ text: segment.text,
+ no_speech_prob: segment.no_speech_prob,
+ speaker_turn_next: segment.speaker_turn_next?
+ }
+ assert_equal expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next])
+ end
+
+ def test_deconstruct_keys_non_existent
+ omit "Undefined behavior"
+
+ segment = whisper.each_segment.first
+
+ assert_equal({}, segment.deconstruct_keys([:non_existent]))
+ end
+
+ def test_deconstruct_keys_too_many_keys
+ omit "Undefined behavior"
+
+ segment = whisper.each_segment.first
+
+ assert_equal({}, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next, :extra_key]))
+ end
+
+ def test_deconstruct_keys_includes_non_existent_keys_not_too_many
+ omit "Undefined behavior"
+
+ segment = whisper.each_segment.first
+
+ expected = {
+ start_time: segment.start_time,
+ end_time: segment.end_time,
+ text: segment.text,
+ no_speech_prob: segment.no_speech_prob
+ }
+ assert_equal(expected, segment.deconstruct_keys([:start_time, :end_time, :text, :no_speech_prob, :non_existent]))
+ end
end
end
def test_system_info_str
- assert_match /\AWHISPER : COREML = \d | OPENVINO = \d |/, Whisper.system_info_str
+ assert_match(/\AWHISPER : COREML = \d | OPENVINO = \d |/, Whisper.system_info_str)
end
def test_log_set
assert_match(/for your country/i, text)
end
end
+
+ def test_to_srt
+ whisper = Whisper::Context.new("base.en")
+ whisper.transcribe AUDIO, @params
+
+ lines = whisper.to_srt.lines
+ assert_match(/\A\d+\n/, lines[0])
+ assert_match(/\d{2}:\d{2}:\d{2},\d{3} --> \d{2}:\d{2}:\d{2},\d{3}\n/, lines[1])
+ assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[2])
+ end
+
+ def test_to_webvtt
+ whisper = Whisper::Context.new("base.en")
+ whisper.transcribe AUDIO, @params
+
+ lines = whisper.to_webvtt.lines
+ assert_equal "WEBVTT\n", lines[0]
+ assert_equal "\n", lines[1]
+ assert_match(/\A\d+\n/, lines[2])
+ assert_match(/\d{2}:\d{2}:\d{2}\.\d{3} --> \d{2}:\d{2}:\d{2}\.\d{3}\n/, lines[3])
+ assert_match(/ask not what your country can do for you, ask what you can do for your country/, lines[4])
+ end
+
+ sub_test_case "Format needs escape" do
+ def setup
+ @whisper = Whisper::Context.new("base.en")
+ @whisper.transcribe AUDIO, Whisper::Params.new
+ segment = @whisper.each_segment.first
+ segment.define_singleton_method :text do
+ "& so my fellow Americans --> ask not what your country can do for you <-- ask what you can do for your country."
+ end
+ @whisper.define_singleton_method :each_segment do
+ Enumerator.new(3) {|yielder| 3.times {yielder << segment}}
+ end
+ end
+
+ def test_to_srt_escape
+      assert_equal "&amp; so my fellow Americans --&gt; ask not what your country can do for you &lt;-- ask what you can do for your country.\n", @whisper.to_srt.lines[2]
+ end
+
+ def test_to_webvtt_escape
+      assert_equal "&amp; so my fellow Americans --&gt; ask not what your country can do for you &lt;-- ask what you can do for your country.\n", @whisper.to_webvtt.lines[4]
+ end
+ end
end
s.name = "whispercpp"
s.authors = ["Georgi Gerganov", "Todd A. Fisher"]
s.version = '1.3.3'
- s.date = '2025-06-03'
+ s.date = '2025-06-10'
s.description = %q{High-performance inference of OpenAI's Whisper automatic speech recognition (ASR) model via Ruby}
s.email = 'todd.fisher@gmail.com'
s.extra_rdoc_files = ['LICENSE', 'README.md']