Ruby bindings for [whisper.cpp][], an interface of automatic speech recognition model.
-Installation
-------------
-
-Install the gem and add to the application's Gemfile by executing:
-
- $ bundle add whispercpp
-
-If bundler is not being used to manage dependencies, install the gem by executing:
-
- $ gem install whispercpp
-
-You can pass build options for whisper.cpp, for instance:
-
- $ bundle config build.whispercpp --enable-ggml-cuda
-
-or,
-
- $ gem install whispercpp -- --enable-ggml-cuda
-
-See whisper.cpp's [README](https://github.com/ggml-org/whisper.cpp/blob/master/README.md) for available options. You need convert options present the README to Ruby-style options, for example:
-
-Boolean options:
-
-* `-DGGML_BLAS=1` -> `--enable-ggml-blas`
-* `-DWHISER_COREML=OFF` -> `--disable-whisper-coreml`
-
-Argument options:
-
-* `-DGGML_CUDA_COMPRESSION_MODE=size` -> `--ggml-cuda-compression-mode=size`
-
-Combination:
-
-* `-DGGML_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES="86"` -> `--enable-ggml-cuda --cmake_cuda-architectures="86"`
-
-For boolean options like `GGML_CUDA`, the README says `-DGGML_CUDA=1`. You need strip `-D`, prepend `--enable-` for `1` or `ON` (`--disable-` for `0` or `OFF`) and make it kebab-case: `--enable-ggml-cuda`.
-For options which require arguments like `CMAKE_CUDA_ARCHITECTURES`, the README says `-DCMAKE_CUDA_ARCHITECTURES="86"`. You need strip `-D`, prepend `--`, make it kebab-case, append `=` and append argument: `--cmake-cuda-architectures="86"`.
-
Usage
-----
max_text_tokens: 300,
translate: true,
print_timestamps: false,
- initial_prompt: "Initial prompt here."
+ initial_prompt: "Initial prompt here.",
+ carry_initial_prompt: true
)
whisper.transcribe("path/to/audio.wav", params) do |whole_text|
```ruby
whisper = Whisper::Context.new("https://example.net/uri/of/your/model.bin")
# Or
-whisper = Whisper::Context.new(URI("https://example.net/uri/of/your/model.bin"))
+uri = URI("https://example.net/uri/of/your/model.bin")
+whisper = Whisper::Context.new(uri)
```
See [models][] page for details.
You may call `#to_srt`, too
+Installation
+------------
+
+Install the gem and add to the application's Gemfile by executing:
+
+ $ bundle add whispercpp
+
+If bundler is not being used to manage dependencies, install the gem by executing:
+
+ $ gem install whispercpp
+
+You can pass build options for whisper.cpp, for instance:
+
+ $ bundle config build.whispercpp --enable-ggml-cuda
+
+or,
+
+ $ gem install whispercpp -- --enable-ggml-cuda
+
+See whisper.cpp's [README](https://github.com/ggml-org/whisper.cpp/blob/master/README.md) for available options. You need to convert the options listed in the README to Ruby-style options, for example:
+
+Boolean options:
+
+* `-DGGML_BLAS=1` -> `--enable-ggml-blas`
+* `-DWHISPER_COREML=OFF` -> `--disable-whisper-coreml`
+
+Argument options:
+
+* `-DGGML_CUDA_COMPRESSION_MODE=size` -> `--ggml-cuda-compression-mode=size`
+
+Combination:
+
+* `-DGGML_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES="86"` -> `--enable-ggml-cuda --cmake-cuda-architectures="86"`
+
+For boolean options like `GGML_CUDA`, the README says `-DGGML_CUDA=1`. You need to strip `-D`, prepend `--enable-` for `1` or `ON` (`--disable-` for `0` or `OFF`) and convert the name to kebab-case: `--enable-ggml-cuda`.
+For options which require arguments like `CMAKE_CUDA_ARCHITECTURES`, the README says `-DCMAKE_CUDA_ARCHITECTURES="86"`. You need to strip `-D`, prepend `--`, convert the name to kebab-case, then append `=` and the argument: `--cmake-cuda-architectures="86"`.
API
---
require_relative "dependencies"
cmake = find_executable("cmake") || abort
-options = Options.new(cmake)
+options = Options.new(cmake).to_s
have_library("gomp") rescue nil
-libs = Dependencies.new(cmake, options)
+libs = Dependencies.new(cmake, options).to_s
$INCFLAGS << " -Isources/include -Isources/ggml/include -Isources/examples"
$LOCAL_LIBS << " #{libs}"
VALUE eError;
VALUE cSegment;
+VALUE cToken;
VALUE cModel;
ID id_to_s;
extern void init_ruby_whisper_context(VALUE *mWhisper);
extern void init_ruby_whisper_params(VALUE *mWhisper);
extern void init_ruby_whisper_error(VALUE *mWhisper);
-extern void init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cSegment);
+extern void init_ruby_whisper_segment(VALUE *mWhisper);
+extern void init_ruby_whisper_token(VALUE *mWhisper);
extern void init_ruby_whisper_model(VALUE *mWhisper);
extern void init_ruby_whisper_vad_params(VALUE *mVAD);
extern void init_ruby_whisper_vad_context(VALUE *mVAD);
init_ruby_whisper_context(&mWhisper);
init_ruby_whisper_params(&mWhisper);
init_ruby_whisper_error(&mWhisper);
- init_ruby_whisper_segment(&mWhisper, &cContext);
+ init_ruby_whisper_segment(&mWhisper);
+ init_ruby_whisper_token(&mWhisper);
init_ruby_whisper_model(&mWhisper);
init_ruby_whisper_vad_params(&mVAD);
init_ruby_whisper_vad_segment(&mVAD);
int index;
} ruby_whisper_segment;
+typedef struct {
+ whisper_token_data *token_data;
+ const char *text;
+} ruby_whisper_token;
+
typedef struct {
VALUE context;
} ruby_whisper_model;
struct whisper_vad_context *context;
} ruby_whisper_vad_context;
+/*
+ * Fetches the ruby_whisper struct wrapped by obj into rw and raises
+ * RuntimeError ("Not initialized") when its context pointer is NULL.
+ *
+ * ruby_whisper_type must be declared at the expansion site.
+ * The body is wrapped in do { ... } while (0) so that the macro expands to a
+ * single statement. rw is expanded more than once, so pass a plain variable.
+ */
+#define GetContext(obj, rw) do { \
+ TypedData_Get_Struct((obj), ruby_whisper, &ruby_whisper_type, (rw)); \
+ if ((rw)->context == NULL) { \
+ rb_raise(rb_eRuntimeError, "Not initialized"); \
+ } \
+} while (0)
+
+/*
+ * Fetches the ruby_whisper_token struct wrapped by obj into rwt and raises
+ * RuntimeError ("Not initialized") when its token_data pointer is NULL.
+ *
+ * ruby_whisper_token_type must be declared at the expansion site.
+ * rwt is expanded more than once, so pass a plain variable.
+ */
+#define GetToken(obj, rwt) do { \
+ TypedData_Get_Struct((obj), ruby_whisper_token, &ruby_whisper_token_type, (rwt)); \
+ if ((rwt)->token_data == NULL) { \
+ rb_raise(rb_eRuntimeError, "Not initialized"); \
+ } \
+} while (0)
+
+/*
+ * Fetches the ruby_whisper_vad_segments struct wrapped by obj into rwvss and
+ * raises RuntimeError ("Not initialized") when its segments pointer is NULL.
+ *
+ * ruby_whisper_vad_segments_type must be declared at the expansion site.
+ * rwvss is expanded more than once, so pass a plain variable.
+ */
+#define GetVADSegments(obj, rwvss) do { \
+ TypedData_Get_Struct((obj), ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, (rwvss)); \
+ if ((rwvss)->segments == NULL) { \
+ rb_raise(rb_eRuntimeError, "Not initialized"); \
+ } \
+} while (0)
+
#endif
VALUE ruby_whisper_model_n_vocab(VALUE self)
{
ruby_whisper *rw;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
return INT2NUM(whisper_model_n_vocab(rw->context));
}
VALUE ruby_whisper_model_n_audio_ctx(VALUE self)
{
ruby_whisper *rw;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
return INT2NUM(whisper_model_n_audio_ctx(rw->context));
}
VALUE ruby_whisper_model_n_audio_state(VALUE self)
{
ruby_whisper *rw;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
return INT2NUM(whisper_model_n_audio_state(rw->context));
}
VALUE ruby_whisper_model_n_audio_head(VALUE self)
{
ruby_whisper *rw;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
return INT2NUM(whisper_model_n_audio_head(rw->context));
}
VALUE ruby_whisper_model_n_audio_layer(VALUE self)
{
ruby_whisper *rw;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
return INT2NUM(whisper_model_n_audio_layer(rw->context));
}
VALUE ruby_whisper_model_n_text_ctx(VALUE self)
{
ruby_whisper *rw;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
return INT2NUM(whisper_model_n_text_ctx(rw->context));
}
VALUE ruby_whisper_model_n_text_state(VALUE self)
{
ruby_whisper *rw;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
return INT2NUM(whisper_model_n_text_state(rw->context));
}
VALUE ruby_whisper_model_n_text_head(VALUE self)
{
ruby_whisper *rw;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
return INT2NUM(whisper_model_n_text_head(rw->context));
}
VALUE ruby_whisper_model_n_text_layer(VALUE self)
{
ruby_whisper *rw;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
return INT2NUM(whisper_model_n_text_layer(rw->context));
}
VALUE ruby_whisper_model_n_mels(VALUE self)
{
ruby_whisper *rw;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
return INT2NUM(whisper_model_n_mels(rw->context));
}
VALUE ruby_whisper_model_ftype(VALUE self)
{
ruby_whisper *rw;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
return INT2NUM(whisper_model_ftype(rw->context));
}
VALUE ruby_whisper_model_type(VALUE self)
{
ruby_whisper *rw;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
return rb_str_new2(whisper_model_type_readable(rw->context));
}
ruby_whisper *rw;
ruby_whisper_params *rwp;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
VALUE params = argv[0];
TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
VALUE samples = argv[1];
ruby_whisper *rw;
ruby_whisper_params *rwp;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
VALUE params = argv[0];
TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
VALUE samples = argv[1];
ruby_whisper_full_n_segments(VALUE self)
{
ruby_whisper *rw;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
return INT2NUM(whisper_full_n_segments(rw->context));
}
ruby_whisper_full_lang_id(VALUE self)
{
ruby_whisper *rw;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
return INT2NUM(whisper_full_lang_id(rw->context));
}
ruby_whisper_full_get_segment_t0(VALUE self, VALUE i_segment)
{
ruby_whisper *rw;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
const int64_t t0 = whisper_full_get_segment_t0(rw->context, c_i_segment);
return LONG2NUM(t0);
ruby_whisper_full_get_segment_t1(VALUE self, VALUE i_segment)
{
ruby_whisper *rw;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
const int64_t t1 = whisper_full_get_segment_t1(rw->context, c_i_segment);
return LONG2NUM(t1);
ruby_whisper_full_get_segment_speaker_turn_next(VALUE self, VALUE i_segment)
{
ruby_whisper *rw;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
const bool speaker_turn_next = whisper_full_get_segment_speaker_turn_next(rw->context, c_i_segment);
return speaker_turn_next ? Qtrue : Qfalse;
ruby_whisper_full_get_segment_text(VALUE self, VALUE i_segment)
{
ruby_whisper *rw;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
const char * text = whisper_full_get_segment_text(rw->context, c_i_segment);
return rb_str_new2(text);
ruby_whisper_full_get_segment_no_speech_prob(VALUE self, VALUE i_segment)
{
ruby_whisper *rw;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
const float no_speech_prob = whisper_full_get_segment_no_speech_prob(rw->context, c_i_segment);
return DBL2NUM(no_speech_prob);
}
ruby_whisper *rw;
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
const int n_segments = whisper_full_n_segments(rw->context);
for (int i = 0; i < n_segments; ++i) {
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
- TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(rwm->context, rw);
return INT2NUM(whisper_model_n_vocab(rw->context));
}
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
- TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(rwm->context, rw);
return INT2NUM(whisper_model_n_audio_ctx(rw->context));
}
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
- TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(rwm->context, rw);
return INT2NUM(whisper_model_n_audio_state(rw->context));
}
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
- TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(rwm->context, rw);
return INT2NUM(whisper_model_n_audio_head(rw->context));
}
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
- TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(rwm->context, rw);
return INT2NUM(whisper_model_n_audio_layer(rw->context));
}
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
- TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(rwm->context, rw);
return INT2NUM(whisper_model_n_text_ctx(rw->context));
}
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
- TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(rwm->context, rw);
return INT2NUM(whisper_model_n_text_state(rw->context));
}
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
- TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(rwm->context, rw);
return INT2NUM(whisper_model_n_text_head(rw->context));
}
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
- TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(rwm->context, rw);
return INT2NUM(whisper_model_n_text_layer(rw->context));
}
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
- TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(rwm->context, rw);
return INT2NUM(whisper_model_n_mels(rw->context));
}
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
- TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(rwm->context, rw);
return INT2NUM(whisper_model_ftype(rw->context));
}
ruby_whisper_model *rwm;
TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
ruby_whisper *rw;
- TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(rwm->context, rw);
return rb_str_new2(whisper_model_type_readable(rw->context));
}
}
/*
* If true, prints results from within whisper.cpp. (avoid it, use callback instead)
+ *
* call-seq:
* print_realtime -> bool
*/
#include <ruby.h>
#include "ruby_whisper.h"
-#define N_KEY_NAMES 5
+#define N_KEY_NAMES 6
+extern ID id___method__;
+extern ID id_to_enum;
static VALUE sym_start_time;
static VALUE sym_end_time;
static VALUE sym_text;
static VALUE sym_no_speech_prob;
static VALUE sym_speaker_turn_next;
+static VALUE sym_n_tokens;
static VALUE key_names;
extern const rb_data_type_t ruby_whisper_type;
extern VALUE cSegment;
+extern VALUE ruby_whisper_token_s_init(struct whisper_context *context, int i_segment, int index);
+
static void
rb_whisper_segment_mark(void *p)
{
ruby_whisper_segment *rws;
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
ruby_whisper *rw;
- TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(rws->context, rw);
const int64_t t0 = whisper_full_get_segment_t0(rw->context, rws->index);
// able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
return LONG2NUM(t0 * 10);
ruby_whisper_segment *rws;
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
ruby_whisper *rw;
- TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(rws->context, rw);
const int64_t t1 = whisper_full_get_segment_t1(rw->context, rws->index);
// able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
return LONG2NUM(t1 * 10);
ruby_whisper_segment *rws;
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
ruby_whisper *rw;
- TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(rws->context, rw);
return whisper_full_get_segment_speaker_turn_next(rw->context, rws->index) ? Qtrue : Qfalse;
}
ruby_whisper_segment *rws;
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
ruby_whisper *rw;
- TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(rws->context, rw);
const char * text = whisper_full_get_segment_text(rw->context, rws->index);
return rb_str_new2(text);
}
ruby_whisper_segment *rws;
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
ruby_whisper *rw;
- TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(rws->context, rw);
return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index));
}
+/*
+ * Get number of tokens in the segment
+ *
+ * call-seq:
+ * n_tokens -> Integer
+ */
+static VALUE
+ruby_whisper_segment_get_n_tokens(VALUE self)
+{
+ ruby_whisper_segment *rws;
+ TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
+ ruby_whisper *rw;
+ GetContext(rws->context, rw);
+ return INT2NUM(whisper_full_n_tokens(rw->context, rws->index));
+}
+
+/*
+ * Yields each Whisper::Token:
+ *
+ * whisper.each_segment.first.each_token do |token|
+ * p token
+ * end
+ *
+ * Returns an Enumerator if no block is given:
+ *
+ * whisper.each_segment.first.each_token.to_a # => [#<Whisper::Token>, ...]
+ *
+ * call-seq:
+ * each_token {|token| ... }
+ * each_token -> Enumerator
+ */
+static VALUE
+ruby_whisper_segment_each_token(VALUE self)
+{
+ if (!rb_block_given_p()) {
+ const VALUE method_name = rb_funcall(self, id___method__, 0);
+ return rb_funcall(self, id_to_enum, 1, method_name);
+ }
+
+ ruby_whisper_segment *rws;
+ TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
+ ruby_whisper *rw;
+ GetContext(rws->context, rw);
+
+ const int n_tokens = whisper_full_n_tokens(rw->context, rws->index);
+ for (int i = 0; i < n_tokens; ++i) {
+ rb_yield(ruby_whisper_token_s_init(rw->context, rws->index, i));
+ }
+
+ return self;
+}
+
/*
* call-seq:
* deconstruct_keys(keys) -> hash
*
- * Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
+ * Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next, :n_tokens
*
* whisper.each_segment do |segment|
* segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
ruby_whisper_segment *rws;
TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
ruby_whisper *rw;
- TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(rws->context, rw);
VALUE hash = rb_hash_new();
long n_keys;
if (key == sym_speaker_turn_next) {
rb_hash_aset(hash, key, ruby_whisper_segment_get_speaker_turn_next(self));
}
+ if (key == sym_n_tokens) {
+ rb_hash_aset(hash, key, ruby_whisper_segment_get_n_tokens(self));
+ }
}
return hash;
}
void
-init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cContext)
+init_ruby_whisper_segment(VALUE *mWhisper)
{
cSegment = rb_define_class_under(*mWhisper, "Segment", rb_cObject);
sym_text = ID2SYM(rb_intern("text"));
sym_no_speech_prob = ID2SYM(rb_intern("no_speech_prob"));
sym_speaker_turn_next = ID2SYM(rb_intern("speaker_turn_next"));
+ sym_n_tokens = ID2SYM(rb_intern("n_tokens"));
key_names = rb_ary_new3(
N_KEY_NAMES,
sym_start_time,
rb_define_method(cSegment, "speaker_turn_next?", ruby_whisper_segment_get_speaker_turn_next, 0);
rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
rb_define_method(cSegment, "no_speech_prob", ruby_whisper_segment_get_no_speech_prob, 0);
+ rb_define_method(cSegment, "n_tokens", ruby_whisper_segment_get_n_tokens, 0);
+ rb_define_method(cSegment, "each_token", ruby_whisper_segment_each_token, 0);
rb_define_method(cSegment, "deconstruct_keys", ruby_whisper_segment_deconstruct_keys, 1);
}
+#undef N_KEY_NAMES
--- /dev/null
+#include <ruby.h>
+#include "ruby_whisper.h"
+
+#define N_KEY_NAMES 11
+
+extern VALUE cToken;
+extern const rb_data_type_t ruby_whisper_type;
+
+static VALUE key_names;
+static VALUE sym_id;
+static VALUE sym_tid;
+static VALUE sym_probability;
+static VALUE sym_log_probability;
+static VALUE sym_pt;
+static VALUE sym_ptsum;
+static VALUE sym_t_dtw;
+static VALUE sym_voice_length;
+static VALUE sym_start_time;
+static VALUE sym_end_time;
+static VALUE sym_text;
+
+/*
+ * Reports the memory attributed to a token wrapper: the wrapper struct
+ * itself plus the heap copy of the token data, when one has been attached.
+ */
+static size_t
+ruby_whisper_token_memsize(const void *p)
+{
+  const ruby_whisper_token *rwt = (const ruby_whisper_token *)p;
+  if (!rwt) {
+    return 0;
+  }
+  /* sizeof(*rwt), not sizeof(rwt): the latter is only the size of a pointer. */
+  size_t size = sizeof(*rwt);
+  if (rwt->token_data) {
+    size += sizeof(*rwt->token_data);
+  }
+  return size;
+}
+
+/*
+ * Releases the wrapper together with the token data copy it owns.
+ *
+ * rwt->text is not freed: ruby_whisper_token_s_init only stores the pointer
+ * returned by whisper_full_get_token_text(), it does not allocate it.
+ */
+static void
+ruby_whisper_token_free(void *p)
+{
+  ruby_whisper_token *rwt = (ruby_whisper_token *)p;
+  if (!rwt) {
+    return;
+  }
+  if (rwt->token_data) {
+    xfree(rwt->token_data);
+  }
+  xfree(rwt);
+}
+
+static const rb_data_type_t ruby_whisper_token_type = {
+  "ruby_whisper_token",
+  {0, ruby_whisper_token_free, ruby_whisper_token_memsize,},
+  0, 0,
+  0
+};
+
+/*
+ * Allocator for Whisper::Token. The new object has no token data attached,
+ * so every accessor raises "Not initialized" (see GetToken) until
+ * ruby_whisper_token_s_init() fills it in.
+ */
+static VALUE
+ruby_whisper_token_allocate(VALUE klass)
+{
+  ruby_whisper_token *rwt;
+  VALUE token = TypedData_Make_Struct(klass, ruby_whisper_token, &ruby_whisper_token_type, rwt);
+  rwt->token_data = NULL;
+  rwt->text = NULL;
+  return token;
+}
+
+/*
+ * Builds a Whisper::Token for token i_token of segment i_segment in context.
+ *
+ * The token data returned by whisper_full_get_token_data() is copied to the
+ * heap and owned by the Ruby object. Storing the address of the local
+ * variable, as was done before, left every token with a dangling pointer as
+ * soon as this function returned.
+ *
+ * NOTE(review): text still points at the string returned by
+ * whisper_full_get_token_text(); presumably it is owned by the whisper
+ * context, so confirm that the context outlives the tokens created from it.
+ */
+VALUE
+ruby_whisper_token_s_init(struct whisper_context *context, int i_segment, int i_token)
+{
+  const VALUE token = ruby_whisper_token_allocate(cToken);
+  ruby_whisper_token *rwt;
+  TypedData_Get_Struct(token, ruby_whisper_token, &ruby_whisper_token_type, rwt);
+  /* Attach the buffer before filling it so the free function owns it even
+   * if anything below were to raise. */
+  rwt->token_data = ALLOC(whisper_token_data);
+  *rwt->token_data = whisper_full_get_token_data(context, i_segment, i_token);
+  rwt->text = whisper_full_get_token_text(context, i_segment, i_token);
+  return token;
+}
+
+/*
+ * Token ID.
+ *
+ * call-seq:
+ * id -> Integer
+ */
+static VALUE
+ruby_whisper_token_get_id(VALUE self)
+{
+ ruby_whisper_token *rwt;
+ GetToken(self, rwt);
+ return INT2NUM(rwt->token_data->id);
+}
+
+/*
+ * Forced timestamp token ID.
+ *
+ * call-seq:
+ * tid -> Integer
+ */
+static VALUE
+ruby_whisper_token_get_tid(VALUE self)
+{
+ ruby_whisper_token *rwt;
+ GetToken(self, rwt);
+ return INT2NUM(rwt->token_data->tid);
+}
+
+/*
+ * Probability of the token.
+ *
+ * call-seq:
+ * probability -> Float
+ */
+static VALUE
+ruby_whisper_token_get_p(VALUE self)
+{
+ ruby_whisper_token *rwt;
+ GetToken(self, rwt);
+ return DBL2NUM(rwt->token_data->p);
+}
+
+/*
+ * Log probability of the token.
+ *
+ * call-seq:
+ * log_probability -> Float
+ */
+static VALUE
+ruby_whisper_token_get_plog(VALUE self)
+{
+ ruby_whisper_token *rwt;
+ GetToken(self, rwt);
+ return DBL2NUM(rwt->token_data->plog);
+}
+
+/*
+ * Probability of the timestamp token.
+ *
+ * call-seq:
+ * pt -> Float
+ */
+static VALUE
+ruby_whisper_token_get_pt(VALUE self)
+{
+ ruby_whisper_token *rwt;
+ GetToken(self, rwt);
+ return DBL2NUM(rwt->token_data->pt);
+}
+
+/*
+ * Sum of probability of all timestamp tokens.
+ *
+ * call-seq:
+ * ptsum -> Float
+ */
+static VALUE
+ruby_whisper_token_get_ptsum(VALUE self)
+{
+ ruby_whisper_token *rwt;
+ GetToken(self, rwt);
+ return DBL2NUM(rwt->token_data->ptsum);
+}
+
+/*
+ * [EXPERIMENTAL] Token-level timestamps with DTW
+ *
+ * Do not use if you haven't computed token-level timestamps with dtw.
+ * Roughly corresponds to the moment in audio in which the token was output.
+ *
+ * call-seq:
+ * t_dtw -> Integer
+ */
+static VALUE
+ruby_whisper_token_get_t_dtw(VALUE self)
+{
+ ruby_whisper_token *rwt;
+ GetToken(self, rwt);
+ return LONG2NUM(rwt->token_data->t_dtw);
+}
+
+/*
+ * Voice length of the token.
+ *
+ * call-seq:
+ * voice_length -> Float
+ */
+static VALUE
+ruby_whisper_token_get_vlen(VALUE self)
+{
+ ruby_whisper_token *rwt;
+ GetToken(self, rwt);
+ return DBL2NUM(rwt->token_data->vlen);
+}
+
+/*
+ * Get the token text of the token.
+ *
+ * call-seq:
+ * text -> String
+ */
+static VALUE
+ruby_whisper_token_get_text(VALUE self)
+{
+ ruby_whisper_token *rwt;
+ GetToken(self, rwt);
+ return rb_str_new2(rwt->text);
+}
+
+
+/*
+ * Start time of the token.
+ *
+ * Token-level timestamp data.
+ * Do not use if you haven't computed token-level timestamps.
+ *
+ * call-seq:
+ * start_time -> Integer
+ */
+static VALUE
+ruby_whisper_token_get_start_time(VALUE self)
+{
+ ruby_whisper_token *rwt;
+ GetToken(self, rwt);
+ return LONG2NUM(rwt->token_data->t0 * 10);
+}
+
+/*
+ * End time of the token.
+ *
+ * Token-level timestamp data.
+ * Do not use if you haven't computed token-level timestamps.
+ *
+ * call-seq:
+ * end_time -> Integer
+ */
+static VALUE
+ruby_whisper_token_get_end_time(VALUE self)
+{
+ ruby_whisper_token *rwt;
+ GetToken(self, rwt);
+ return LONG2NUM(rwt->token_data->t1 * 10);
+}
+
+/*
+ * call-seq:
+ * deconstruct_keys(keys) -> hash
+ *
+ * Possible keys: :id, :tid, :probability, :log_probability, :pt, :ptsum,
+ *   :t_dtw, :voice_length, :start_time, :end_time, :text
+ *
+ *   segment.each_token do |token|
+ *     token => {text:, probability:}
+ *     puts "#{text} (#{probability})"
+ *   end
+ */
+static VALUE ruby_whisper_token_deconstruct_keys(VALUE self, VALUE keys)
+{
+ ruby_whisper_token *rwt;
+ GetToken(self, rwt);
+ VALUE hash = rb_hash_new();
+ long n_keys = 0;
+
+ if (NIL_P(keys)) {
+ keys = key_names;
+ n_keys = N_KEY_NAMES;
+ } else {
+ n_keys = RARRAY_LEN(keys);
+ if (n_keys > N_KEY_NAMES) {
+ return hash;
+ }
+ }
+
+ for (int i = 0; i < n_keys; i++) {
+ VALUE key = rb_ary_entry(keys, i);
+ if (key == sym_start_time) {
+ rb_hash_aset(hash, key, ruby_whisper_token_get_start_time(self));
+ continue;
+ }
+ if (key == sym_end_time) {
+ rb_hash_aset(hash, key, ruby_whisper_token_get_end_time(self));
+ continue;
+ }
+ if (key == sym_text) {
+ rb_hash_aset(hash, key, ruby_whisper_token_get_text(self));
+ continue;
+ }
+ if (key == sym_probability) {
+ rb_hash_aset(hash, key, ruby_whisper_token_get_p(self));
+ continue;
+ }
+ if (key == sym_id) {
+ rb_hash_aset(hash, key, ruby_whisper_token_get_id(self));
+ continue;
+ }
+ if (key == sym_tid) {
+ rb_hash_aset(hash, key, ruby_whisper_token_get_tid(self));
+ continue;
+ }
+ if (key == sym_log_probability) {
+ rb_hash_aset(hash, key, ruby_whisper_token_get_plog(self));
+ continue;
+ }
+ if (key == sym_pt) {
+ rb_hash_aset(hash, key, ruby_whisper_token_get_pt(self));
+ continue;
+ }
+ if (key == sym_ptsum) {
+ rb_hash_aset(hash, key, ruby_whisper_token_get_ptsum(self));
+ continue;
+ }
+ if (key == sym_t_dtw) {
+ rb_hash_aset(hash, key, ruby_whisper_token_get_t_dtw(self));
+ continue;
+ }
+ if (key == sym_voice_length) {
+ rb_hash_aset(hash, key, ruby_whisper_token_get_vlen(self));
+ continue;
+ }
+ }
+
+ return hash;
+}
+
+
+/*
+ * Defines Whisper::Token under *mWhisper: registers the allocator, interns
+ * the symbols accepted by #deconstruct_keys, builds the list of all key
+ * names and defines the reader methods.
+ */
+void
+init_ruby_whisper_token(VALUE *mWhisper)
+{
+  cToken = rb_define_class_under(*mWhisper, "Token", rb_cObject);
+
+  rb_define_alloc_func(cToken, ruby_whisper_token_allocate);
+
+  sym_id = ID2SYM(rb_intern("id"));
+  sym_tid = ID2SYM(rb_intern("tid"));
+  sym_probability = ID2SYM(rb_intern("probability"));
+  sym_log_probability = ID2SYM(rb_intern("log_probability"));
+  sym_pt = ID2SYM(rb_intern("pt"));
+  sym_ptsum = ID2SYM(rb_intern("ptsum"));
+  sym_t_dtw = ID2SYM(rb_intern("t_dtw"));
+  sym_voice_length = ID2SYM(rb_intern("voice_length"));
+  sym_start_time = ID2SYM(rb_intern("start_time"));
+  sym_end_time = ID2SYM(rb_intern("end_time"));
+  sym_text = ID2SYM(rb_intern("text"));
+
+  /* key_names is an Array referenced only from this static C variable, so
+   * the GC cannot see it unless the address is registered. Without this the
+   * array may be collected and deconstruct_keys(nil) would read freed memory. */
+  rb_global_variable(&key_names);
+  key_names = rb_ary_new3(
+    N_KEY_NAMES,
+    sym_id,
+    sym_tid,
+    sym_probability,
+    sym_log_probability,
+    sym_pt,
+    sym_ptsum,
+    sym_t_dtw,
+    sym_voice_length,
+    sym_start_time,
+    sym_end_time,
+    sym_text
+  );
+
+  rb_define_method(cToken, "id", ruby_whisper_token_get_id, 0);
+  rb_define_method(cToken, "tid", ruby_whisper_token_get_tid, 0);
+  rb_define_method(cToken, "probability", ruby_whisper_token_get_p, 0);
+  rb_define_method(cToken, "log_probability", ruby_whisper_token_get_plog, 0);
+  rb_define_method(cToken, "pt", ruby_whisper_token_get_pt, 0);
+  rb_define_method(cToken, "ptsum", ruby_whisper_token_get_ptsum, 0);
+  rb_define_method(cToken, "t_dtw", ruby_whisper_token_get_t_dtw, 0);
+  rb_define_method(cToken, "voice_length", ruby_whisper_token_get_vlen, 0);
+  rb_define_method(cToken, "start_time", ruby_whisper_token_get_start_time, 0);
+  rb_define_method(cToken, "end_time", ruby_whisper_token_get_end_time, 0);
+  rb_define_method(cToken, "text", ruby_whisper_token_get_text, 0);
+
+  rb_define_method(cToken, "deconstruct_keys", ruby_whisper_token_deconstruct_keys, 1);
+}
+
+#undef N_KEY_NAMES
int n_processors = opts[0] == Qundef ? 1 : NUM2INT(opts[0]);
- TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+ GetContext(self, rw);
TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
if (!rb_respond_to(wave_file_path, id_to_s)) {
return rb_funcall(self, id_to_enum, 1, method_name);
}
- TypedData_Get_Struct(self, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, rwvss);
- if (rwvss->segments == NULL) {
- rb_raise(rb_eRuntimeError, "Doesn't have reference to segments internally");
- }
+ GetVADSegments(self, rwvss);
n_segments = whisper_vad_segments_n_segments(rwvss->segments);
for (i = 0; i < n_segments; ++i) {
rb_yield(rb_whisper_vad_segment_s_new(self, i));
ruby_whisper_vad_segments *rwvss;
int n_segments;
- TypedData_Get_Struct(self, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, rwvss);
- if (rwvss->segments == NULL) {
- rb_raise(rb_eRuntimeError, "Doesn't have reference to segments internally");
- }
+ GetVADSegments(self, rwvss);
n_segments = whisper_vad_segments_n_segments(rwvss->segments);
return INT2NUM(n_segments);
base-q8_0
small
small.en
- small.en-tdrz
small-q5_1
small.en-q5_1
small-q8_0
models[name] = URI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}.bin")
}
+ %w[
+ small.en-tdrz
+ ].each do |name|
+ @pre_converted_models[name] = URI.new("https://huggingface.co/akashmjn/tinydiarize-whisper.cpp/resolve/main/ggml-#{name}.bin")
+ end
+
%w[
silero-v5.1.2
silero-v6.2.0
@pre_converted_models[name] = URI.new("https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-#{name}.bin")
end
- @coreml_compiled_models = %w[
- tiny
- tiny.en
- base
- base.en
- small
- small.en
- medium
- medium.en
- large-v1
- large-v2
- large-v3
- large-v3-turbo
- ].each_with_object({}) do |name, models|
- models[@pre_converted_models[name]] = ZipURI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}-encoder.mlmodelc.zip")
- end
+ @coreml_compiled_models = @pre_converted_models.each_with_object({}) {|(name, uri), models|
+ next if name.end_with?("-tdrz") || name.start_with?("silero-")
+
+ if matched = name.match(/\A(?<name>.*)-q\d_\d\z/)
+ name = matched[:name]
+ end
+ models[uri] = ZipURI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}-encoder.mlmodelc.zip")
+ }
class << self
attr_reader :pre_converted_models, :coreml_compiled_models
end_time: (Integer | nil),
text: (String | nil),
no_speech_prob: (Float | nil),
- speaker_turn_next: (true | false | nil)
+ speaker_turn_next: (true | false | nil),
+ n_tokens: (Integer | nil)
}
# Start time in milliseconds.
def end_time: () -> Integer
# Whether the next segment is predicted as a speaker turn.
+ #
def speaker_turn_next?: () -> (true | false)
def text: () -> String
def no_speech_prob: () -> Float
+
+ # Get number of tokens in the segment
+ #
+ def n_tokens: () -> Integer
+
+ # Yields each Whisper::Token:
+ #
+ # whisper.each_segment.first.each_token do |token|
+ # p token
+ # end
+ #
+ # Returns an Enumerator if no block is given:
+ #
+ # whisper.each_segment.first.each_token.to_a # => [#<Whisper::Token>, ...]
+ #
+ def each_token: { (Token) -> void } -> void
+ | () -> Enumerator[Token]
def to_srt_cue: () -> String
def to_webvtt_cue: () -> String
+
# Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next, :n_tokens
#
# whisper.each_segment do |segment|
#   segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
#
# puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
# end
- def deconstruct_keys: (Array[:start_time | :end_time | :text | :no_speech_prob | :speaker_turn_next] | nil) -> deconstructed_keys
+ def deconstruct_keys: (Array[:start_time | :end_time | :text | :no_speech_prob | :speaker_turn_next | :n_tokens] | nil) -> deconstructed_keys
+ end
+
+ # A single token of a transcribed segment, as yielded by each_token.
+ #
+ # Declared as a class (not a module): instances are created by the library
+ # and Whisper::Token.allocate is callable.
+ #
+ class Token
+   type deconstructed_keys = {
+     id: (Integer | nil),
+     tid: (Integer | nil),
+     probability: (Float | nil),
+     log_probability: (Float | nil),
+     pt: (Float | nil),
+     ptsum: (Float | nil),
+     t_dtw: (Integer | nil),
+     voice_length: (Float | nil),
+     text: (String | nil),
+     start_time: (Integer | nil),
+     end_time: (Integer | nil)
+   }
+
+   # Token ID.
+   #
+   def id: () -> Integer
+
+   # Forced timestamp token ID.
+   #
+   def tid: () -> Integer
+
+   # Probability of the token.
+   #
+   def probability: () -> Float
+
+   # Log probability of the token.
+   #
+   def log_probability: () -> Float
+
+   # Probability of the timestamp token.
+   #
+   def pt: () -> Float
+
+   # Sum of probability of all timestamp tokens.
+   #
+   def ptsum: () -> Float
+
+   # [EXPERIMENTAL] Token-level timestamps with DTW
+   #
+   # Do not use if you haven't computed token-level timestamps with dtw.
+   # Roughly corresponds to the moment in audio in which the token was output.
+   #
+   def t_dtw: () -> Integer
+
+   # Voice length of the token.
+   #
+   def voice_length: () -> Float
+
+   # Start time of the token.
+   #
+   # Token-level timestamp data.
+   # Do not use if you haven't computed token-level timestamps.
+   #
+   def start_time: () -> Integer
+
+   # End time of the token.
+   #
+   # Token-level timestamp data.
+   # Do not use if you haven't computed token-level timestamps.
+   #
+   def end_time: () -> Integer
+
+   # Text of the token.
+   #
+   def text: () -> String
+
+   # Possible keys: :id, :tid, :probability, :log_probability, :pt, :ptsum, :t_dtw, :voice_length, :start_time, :end_time, :text
+   #
+   #   whisper.each_segment.first.each_token do |token|
+   #     token => {text:, probability:}
+   #
+   #     puts "#{text} (probability: #{probability})"
+   #   end
+   #
+   def deconstruct_keys: (Array[:id | :tid | :probability | :log_probability | :pt | :ptsum | :t_dtw | :voice_length | :start_time | :end_time | :text] | nil) -> deconstructed_keys
+ end
module VAD
--- /dev/null
+require_relative "helper"
+
+class TestToken < TestBase
+ def setup
+ @segment = whisper.each_segment.first
+ @token = @segment.each_token.first
+ end
+
+ def test_n_tokens
+ assert_equal 27, @segment.n_tokens
+ end
+
+ def test_allocate
+ token = Whisper::Token.allocate
+ assert_raise do
+ token.id
+ end
+ end
+
+ def test_each_token
+ i = 0
+ @segment.each_token do |token|
+ i += 1
+ assert_instance_of Whisper::Token, token
+ end
+ assert_equal 27, i
+ end
+
+ def test_each_token_without_block
+ assert_instance_of Enumerator, @segment.each_token
+ end
+
+ def test_token
+ assert_instance_of Whisper::Token, @token
+
+ assert_instance_of Integer, @token.id
+ assert_instance_of Float, @token.probability
+ assert_instance_of Float, @token.log_probability
+
+ assert_instance_of Integer, @token.tid
+ assert_instance_of Float, @token.pt
+ assert_instance_of Float, @token.ptsum
+
+ assert_instance_of Integer, @token.start_time
+ assert_instance_of Integer, @token.end_time
+
+ assert_instance_of Integer, @token.t_dtw
+
+ assert_instance_of Float, @token.voice_length
+
+ assert_instance_of String, @token.text
+ end
+
+ def test_text
+ assert_equal ["[_BEG_]", " And", " so", " my", " fellow", " Americans", ",", " ask", " not", " what", " your", " country", " can", " do", " for", " you", ",", " ask", " what", " you", " can", " do", " for", " your", " country", ".", "[_TT_550]"],
+ @segment.each_token.collect(&:text)
+ end
+
+ def test_deconstruct_keys_with_nil
+ assert_equal({}, @token.deconstruct_keys(nil))
+ end
+
+ def test_deconstruct_keys_with_keys
+ keys = %i[id tid probability log_probability pt ptsum t_dtw voice_length start_time end_time text]
+ expected = keys.collect {|key| [key, @token.send(key)] }.to_h
+ assert_equal expected, @token.deconstruct_keys(keys)
+ end
+end
$stderr = stderr
end
+ def test_access_attribute_without_initialization
+ whisper = Whisper::Context.allocate
+ assert_raise do
+ whisper.model_type
+ end
+ end
+
sub_test_case "full" do
def setup
super