ruby : add Whisper::Token, fix model URI (#3575)

author KITAITI Makoto <redacted>

Wed, 24 Dec 2025 07:52:16 +0000 (16:52 +0900)

committer GitHub <redacted>

Wed, 24 Dec 2025 07:52:16 +0000 (16:52 +0900)
author KITAITI Makoto <redacted>
Wed, 24 Dec 2025 07:52:16 +0000 (16:52 +0900)
committer GitHub <redacted>
Wed, 24 Dec 2025 07:52:16 +0000 (16:52 +0900)
diff --git a/bindings/ruby/README.md b/bindings/ruby/README.md

index 45218667d9613a0fc3c6759ac7fe211bdeafea90..ea202753b677fd3f986362fe447ce3b4247ec33f 100644 (file)
--- a/bindings/ruby/README.md
+++ b/bindings/ruby/README.md
@@ -5,43 +5,6 @@ whispercpp
  
  Ruby bindings for [whisper.cpp][], an interface of automatic speech recognition model.
  
-Installation
-------------
-
-Install the gem and add to the application's Gemfile by executing:
-
-    $ bundle add whispercpp
-
-If bundler is not being used to manage dependencies, install the gem by executing:
-
-    $ gem install whispercpp
-
-You can pass build options for whisper.cpp, for instance:
-
-    $ bundle config build.whispercpp --enable-ggml-cuda
-
-or,
-
-    $ gem install whispercpp -- --enable-ggml-cuda
-
-See whisper.cpp's [README](https://github.com/ggml-org/whisper.cpp/blob/master/README.md) for available options. You need convert options present the README to Ruby-style options, for example:
-
-Boolean options:
-
-* `-DGGML_BLAS=1` -> `--enable-ggml-blas`
-* `-DWHISER_COREML=OFF` -> `--disable-whisper-coreml`
-
-Argument options:
-
-* `-DGGML_CUDA_COMPRESSION_MODE=size` -> `--ggml-cuda-compression-mode=size`
-
-Combination:
-
-* `-DGGML_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES="86"` -> `--enable-ggml-cuda --cmake_cuda-architectures="86"`
-
-For boolean options like `GGML_CUDA`, the README says `-DGGML_CUDA=1`. You need strip `-D`, prepend `--enable-` for `1` or `ON` (`--disable-` for `0` or `OFF`) and make it kebab-case: `--enable-ggml-cuda`.  
-For options which require arguments like `CMAKE_CUDA_ARCHITECTURES`, the README says `-DCMAKE_CUDA_ARCHITECTURES="86"`. You need strip `-D`, prepend `--`, make it kebab-case, append `=` and append argument: `--cmake-cuda-architectures="86"`.
-
  Usage
  -----
  
@@ -57,7 +20,8 @@ params = Whisper::Params.new(
    max_text_tokens: 300,
    translate: true,
    print_timestamps: false,
-  initial_prompt: "Initial prompt here."
+  initial_prompt: "Initial prompt here.",
+  carry_initial_prompt: true
  )
  
  whisper.transcribe("path/to/audio.wav", params) do |whole_text|
@@ -118,7 +82,8 @@ Or, you can download model files:
  ```ruby
  whisper = Whisper::Context.new("https://example.net/uri/of/your/model.bin")
  # Or
-whisper = Whisper::Context.new(URI("https://example.net/uri/of/your/model.bin"))
+uri = URI("https://example.net/uri/of/your/model.bin")
+whisper = Whisper::Context.new(uri)
  ```
  
  See [models][] page for details.
@@ -187,6 +152,42 @@ WEBVTT
  
  You may call `#to_srt`, too
  
+Installation
+------------
+
+Install the gem and add to the application's Gemfile by executing:
+
+    $ bundle add whispercpp
+
+If bundler is not being used to manage dependencies, install the gem by executing:
+
+    $ gem install whispercpp
+
+You can pass build options for whisper.cpp, for instance:
+
+    $ bundle config build.whispercpp --enable-ggml-cuda
+
+or,
+
+    $ gem install whispercpp -- --enable-ggml-cuda
+
+See whisper.cpp's [README](https://github.com/ggml-org/whisper.cpp/blob/master/README.md) for available options. You need to convert options present in the README to Ruby-style options, for example:
+
+Boolean options:
+
+* `-DGGML_BLAS=1` -> `--enable-ggml-blas`
+* `-DWHISPER_COREML=OFF` -> `--disable-whisper-coreml`
+
+Argument options:
+
+* `-DGGML_CUDA_COMPRESSION_MODE=size` -> `--ggml-cuda-compression-mode=size`
+
+Combination:
+
+* `-DGGML_CUDA=1 -DCMAKE_CUDA_ARCHITECTURES="86"` -> `--enable-ggml-cuda --cmake-cuda-architectures="86"`
+
+For boolean options like `GGML_CUDA`, the README says `-DGGML_CUDA=1`. You need to strip `-D`, prepend `--enable-` for `1` or `ON` (`--disable-` for `0` or `OFF`) and make it kebab-case: `--enable-ggml-cuda`.  
+For options which require arguments like `CMAKE_CUDA_ARCHITECTURES`, the README says `-DCMAKE_CUDA_ARCHITECTURES="86"`. You need to strip `-D`, prepend `--`, make it kebab-case, append `=` and append argument: `--cmake-cuda-architectures="86"`.
  
  API
  ---
diff --git a/bindings/ruby/ext/extconf.rb b/bindings/ruby/ext/extconf.rb

index edb7b82ff97c738f24af720827398369259dabf9..8a5ac67457b048cb29e6d1d2ea3e1603f8bd012e 100644 (file)
--- a/bindings/ruby/ext/extconf.rb
+++ b/bindings/ruby/ext/extconf.rb
@@ -3,9 +3,9 @@ require_relative "options"
  require_relative "dependencies"
  
  cmake = find_executable("cmake") || abort
-options = Options.new(cmake)
+options = Options.new(cmake).to_s
  have_library("gomp") rescue nil
-libs = Dependencies.new(cmake, options)
+libs = Dependencies.new(cmake, options).to_s
  
  $INCFLAGS << " -Isources/include -Isources/ggml/include -Isources/examples"
  $LOCAL_LIBS << " #{libs}"
diff --git a/bindings/ruby/ext/ruby_whisper.c b/bindings/ruby/ext/ruby_whisper.c

index 59c7818e674e7c74d492d9d7be248ab0947b186f..ac677e9e3df8992d94806bee5913a7247a92f150 100644 (file)
--- a/bindings/ruby/ext/ruby_whisper.c
+++ b/bindings/ruby/ext/ruby_whisper.c
@@ -13,6 +13,7 @@ VALUE cVADSegment;
  VALUE eError;
  
  VALUE cSegment;
+VALUE cToken;
  VALUE cModel;
  
  ID id_to_s;
@@ -37,7 +38,8 @@ extern VALUE ruby_whisper_segment_allocate(VALUE klass);
  extern void init_ruby_whisper_context(VALUE *mWhisper);
  extern void init_ruby_whisper_params(VALUE *mWhisper);
  extern void init_ruby_whisper_error(VALUE *mWhisper);
-extern void init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cSegment);
+extern void init_ruby_whisper_segment(VALUE *mWhisper);
+extern void init_ruby_whisper_token(VALUE *mWhisper);
  extern void init_ruby_whisper_model(VALUE *mWhisper);
  extern void init_ruby_whisper_vad_params(VALUE *mVAD);
  extern void init_ruby_whisper_vad_context(VALUE *mVAD);
@@ -173,7 +175,8 @@ void Init_whisper() {
    init_ruby_whisper_context(&mWhisper);
    init_ruby_whisper_params(&mWhisper);
    init_ruby_whisper_error(&mWhisper);
-  init_ruby_whisper_segment(&mWhisper, &cContext);
+  init_ruby_whisper_segment(&mWhisper);
+  init_ruby_whisper_token(&mWhisper);
    init_ruby_whisper_model(&mWhisper);
    init_ruby_whisper_vad_params(&mVAD);
    init_ruby_whisper_vad_segment(&mVAD);
diff --git a/bindings/ruby/ext/ruby_whisper.h b/bindings/ruby/ext/ruby_whisper.h

index ff8591aaa619ca2525e647d72d8e1715325b3256..3f5660c374dcc31551f4f192c5aaba2a6ea5ac42 100644 (file)
--- a/bindings/ruby/ext/ruby_whisper.h
+++ b/bindings/ruby/ext/ruby_whisper.h
@@ -33,6 +33,11 @@ typedef struct {
    int index;
  } ruby_whisper_segment;
  
+typedef struct {
+  whisper_token_data *token_data;
+  const char *text;
+} ruby_whisper_token;
+
  typedef struct {
    VALUE context;
  } ruby_whisper_model;
@@ -50,4 +55,25 @@ typedef struct {
    struct whisper_vad_context *context;
  } ruby_whisper_vad_context;
  
+#define GetContext(obj, rw) do { \
+  TypedData_Get_Struct((obj), ruby_whisper, &ruby_whisper_type, (rw)); \
+  if ((rw)->context == NULL) { \
+    rb_raise(rb_eRuntimeError, "Not initialized"); \
+  } \
+} while (0)
+
+#define GetToken(obj, rwt) do {                                             \
+  TypedData_Get_Struct((obj), ruby_whisper_token, &ruby_whisper_token_type, (rwt)); \
+  if ((rwt)->token_data == NULL) { \
+    rb_raise(rb_eRuntimeError, "Not initialized"); \
+  } \
+} while (0)
+
+#define GetVADSegments(obj, rwvss) do { \
+  TypedData_Get_Struct((obj), ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, (rwvss)); \
+  if ((rwvss)->segments == NULL) { \
+    rb_raise(rb_eRuntimeError, "Not initialized"); \
+  } \
+} while (0)
+
  #endif
diff --git a/bindings/ruby/ext/ruby_whisper_context.c b/bindings/ruby/ext/ruby_whisper_context.c

index bc0c6e99194a50255e8341a8aa497c6333fe6465..a7b5f8513db56200955fa36257d584e58afa2760 100644 (file)
--- a/bindings/ruby/ext/ruby_whisper_context.c
+++ b/bindings/ruby/ext/ruby_whisper_context.c
@@ -147,7 +147,7 @@ ruby_whisper_initialize(int argc, VALUE *argv, VALUE self)
  VALUE ruby_whisper_model_n_vocab(VALUE self)
  {
    ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    return INT2NUM(whisper_model_n_vocab(rw->context));
  }
  
@@ -158,7 +158,7 @@ VALUE ruby_whisper_model_n_vocab(VALUE self)
  VALUE ruby_whisper_model_n_audio_ctx(VALUE self)
  {
    ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    return INT2NUM(whisper_model_n_audio_ctx(rw->context));
  }
  
@@ -169,7 +169,7 @@ VALUE ruby_whisper_model_n_audio_ctx(VALUE self)
  VALUE ruby_whisper_model_n_audio_state(VALUE self)
  {
    ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    return INT2NUM(whisper_model_n_audio_state(rw->context));
  }
  
@@ -180,7 +180,7 @@ VALUE ruby_whisper_model_n_audio_state(VALUE self)
  VALUE ruby_whisper_model_n_audio_head(VALUE self)
  {
    ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    return INT2NUM(whisper_model_n_audio_head(rw->context));
  }
  
@@ -191,7 +191,7 @@ VALUE ruby_whisper_model_n_audio_head(VALUE self)
  VALUE ruby_whisper_model_n_audio_layer(VALUE self)
  {
    ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    return INT2NUM(whisper_model_n_audio_layer(rw->context));
  }
  
@@ -202,7 +202,7 @@ VALUE ruby_whisper_model_n_audio_layer(VALUE self)
  VALUE ruby_whisper_model_n_text_ctx(VALUE self)
  {
    ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    return INT2NUM(whisper_model_n_text_ctx(rw->context));
  }
  
@@ -213,7 +213,7 @@ VALUE ruby_whisper_model_n_text_ctx(VALUE self)
  VALUE ruby_whisper_model_n_text_state(VALUE self)
  {
    ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    return INT2NUM(whisper_model_n_text_state(rw->context));
  }
  
@@ -224,7 +224,7 @@ VALUE ruby_whisper_model_n_text_state(VALUE self)
  VALUE ruby_whisper_model_n_text_head(VALUE self)
  {
    ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    return INT2NUM(whisper_model_n_text_head(rw->context));
  }
  
@@ -235,7 +235,7 @@ VALUE ruby_whisper_model_n_text_head(VALUE self)
  VALUE ruby_whisper_model_n_text_layer(VALUE self)
  {
    ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    return INT2NUM(whisper_model_n_text_layer(rw->context));
  }
  
@@ -246,7 +246,7 @@ VALUE ruby_whisper_model_n_text_layer(VALUE self)
  VALUE ruby_whisper_model_n_mels(VALUE self)
  {
    ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    return INT2NUM(whisper_model_n_mels(rw->context));
  }
  
@@ -257,7 +257,7 @@ VALUE ruby_whisper_model_n_mels(VALUE self)
  VALUE ruby_whisper_model_ftype(VALUE self)
  {
    ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    return INT2NUM(whisper_model_ftype(rw->context));
  }
  
@@ -268,7 +268,7 @@ VALUE ruby_whisper_model_ftype(VALUE self)
  VALUE ruby_whisper_model_type(VALUE self)
  {
    ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    return rb_str_new2(whisper_model_type_readable(rw->context));
  }
  
@@ -291,7 +291,7 @@ VALUE ruby_whisper_full(int argc, VALUE *argv, VALUE self)
  
    ruby_whisper *rw;
    ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    VALUE params = argv[0];
    TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
    VALUE samples = argv[1];
@@ -377,7 +377,7 @@ ruby_whisper_full_parallel(int argc, VALUE *argv,VALUE self)
  
    ruby_whisper *rw;
    ruby_whisper_params *rwp;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    VALUE params = argv[0];
    TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
    VALUE samples = argv[1];
@@ -463,7 +463,7 @@ static VALUE
  ruby_whisper_full_n_segments(VALUE self)
  {
    ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    return INT2NUM(whisper_full_n_segments(rw->context));
  }
  
@@ -477,7 +477,7 @@ static VALUE
  ruby_whisper_full_lang_id(VALUE self)
  {
    ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    return INT2NUM(whisper_full_lang_id(rw->context));
  }
  
@@ -502,7 +502,7 @@ static VALUE
  ruby_whisper_full_get_segment_t0(VALUE self, VALUE i_segment)
  {
    ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
    const int64_t t0 = whisper_full_get_segment_t0(rw->context, c_i_segment);
    return LONG2NUM(t0);
@@ -520,7 +520,7 @@ static VALUE
  ruby_whisper_full_get_segment_t1(VALUE self, VALUE i_segment)
  {
    ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
    const int64_t t1 = whisper_full_get_segment_t1(rw->context, c_i_segment);
    return LONG2NUM(t1);
@@ -538,7 +538,7 @@ static VALUE
  ruby_whisper_full_get_segment_speaker_turn_next(VALUE self, VALUE i_segment)
  {
    ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
    const bool speaker_turn_next = whisper_full_get_segment_speaker_turn_next(rw->context, c_i_segment);
    return speaker_turn_next ? Qtrue : Qfalse;
@@ -556,7 +556,7 @@ static VALUE
  ruby_whisper_full_get_segment_text(VALUE self, VALUE i_segment)
  {
    ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
    const char * text = whisper_full_get_segment_text(rw->context, c_i_segment);
    return rb_str_new2(text);
@@ -570,7 +570,7 @@ static VALUE
  ruby_whisper_full_get_segment_no_speech_prob(VALUE self, VALUE i_segment)
  {
    ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    const int c_i_segment = ruby_whisper_full_check_segment_index(rw, i_segment);
    const float no_speech_prob = whisper_full_get_segment_no_speech_prob(rw->context, c_i_segment);
    return DBL2NUM(no_speech_prob);
@@ -611,7 +611,7 @@ ruby_whisper_each_segment(VALUE self)
    }
  
    ruby_whisper *rw;
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
  
    const int n_segments = whisper_full_n_segments(rw->context);
    for (int i = 0; i < n_segments; ++i) {
diff --git a/bindings/ruby/ext/ruby_whisper_model.c b/bindings/ruby/ext/ruby_whisper_model.c

index c6f3351e62296e519aa06ce75812dc3070e105a8..b196a8b5cb5248b93651d670aef653cc283d4cfc 100644 (file)
--- a/bindings/ruby/ext/ruby_whisper_model.c
+++ b/bindings/ruby/ext/ruby_whisper_model.c
@@ -53,7 +53,7 @@ ruby_whisper_model_n_vocab(VALUE self)
    ruby_whisper_model *rwm;
    TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
    ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(rwm->context, rw);
    return INT2NUM(whisper_model_n_vocab(rw->context));
  }
  
@@ -67,7 +67,7 @@ ruby_whisper_model_n_audio_ctx(VALUE self)
    ruby_whisper_model *rwm;
    TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
    ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(rwm->context, rw);
    return INT2NUM(whisper_model_n_audio_ctx(rw->context));
  }
  
@@ -81,7 +81,7 @@ ruby_whisper_model_n_audio_state(VALUE self)
    ruby_whisper_model *rwm;
    TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
    ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(rwm->context, rw);
    return INT2NUM(whisper_model_n_audio_state(rw->context));
  }
  
@@ -95,7 +95,7 @@ ruby_whisper_model_n_audio_head(VALUE self)
    ruby_whisper_model *rwm;
    TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
    ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(rwm->context, rw);
    return INT2NUM(whisper_model_n_audio_head(rw->context));
  }
  
@@ -109,7 +109,7 @@ ruby_whisper_model_n_audio_layer(VALUE self)
    ruby_whisper_model *rwm;
    TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
    ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(rwm->context, rw);
    return INT2NUM(whisper_model_n_audio_layer(rw->context));
  }
  
@@ -123,7 +123,7 @@ ruby_whisper_model_n_text_ctx(VALUE self)
    ruby_whisper_model *rwm;
    TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
    ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(rwm->context, rw);
    return INT2NUM(whisper_model_n_text_ctx(rw->context));
  }
  
@@ -137,7 +137,7 @@ ruby_whisper_model_n_text_state(VALUE self)
    ruby_whisper_model *rwm;
    TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
    ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(rwm->context, rw);
    return INT2NUM(whisper_model_n_text_state(rw->context));
  }
  
@@ -151,7 +151,7 @@ ruby_whisper_model_n_text_head(VALUE self)
    ruby_whisper_model *rwm;
    TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
    ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(rwm->context, rw);
    return INT2NUM(whisper_model_n_text_head(rw->context));
  }
  
@@ -165,7 +165,7 @@ ruby_whisper_model_n_text_layer(VALUE self)
    ruby_whisper_model *rwm;
    TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
    ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(rwm->context, rw);
    return INT2NUM(whisper_model_n_text_layer(rw->context));
  }
  
@@ -179,7 +179,7 @@ ruby_whisper_model_n_mels(VALUE self)
    ruby_whisper_model *rwm;
    TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
    ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(rwm->context, rw);
    return INT2NUM(whisper_model_n_mels(rw->context));
  }
  
@@ -193,7 +193,7 @@ ruby_whisper_model_ftype(VALUE self)
    ruby_whisper_model *rwm;
    TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
    ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(rwm->context, rw);
    return INT2NUM(whisper_model_ftype(rw->context));
  }
  
@@ -207,7 +207,7 @@ ruby_whisper_model_type(VALUE self)
    ruby_whisper_model *rwm;
    TypedData_Get_Struct(self, ruby_whisper_model, &rb_whisper_model_type, rwm);
    ruby_whisper *rw;
-  TypedData_Get_Struct(rwm->context, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(rwm->context, rw);
    return rb_str_new2(whisper_model_type_readable(rw->context));
  }
  
diff --git a/bindings/ruby/ext/ruby_whisper_params.c b/bindings/ruby/ext/ruby_whisper_params.c

index 70417cb166449e9e844a6a629e206e7be9da452c..4dfe2575a39d78ee5245e60d8c94318b23040259 100644 (file)
--- a/bindings/ruby/ext/ruby_whisper_params.c
+++ b/bindings/ruby/ext/ruby_whisper_params.c
@@ -428,6 +428,7 @@ ruby_whisper_params_set_print_realtime(VALUE self, VALUE value)
  }
  /*
   * If true, prints results from within whisper.cpp. (avoid it, use callback instead)
+ *
   * call-seq:
   *   print_realtime -> bool
   */
diff --git a/bindings/ruby/ext/ruby_whisper_segment.c b/bindings/ruby/ext/ruby_whisper_segment.c

index c05632c77cb9b44dfc80bdb31e5b75d77d014d47..74221790928d5d37b10c6a1291b51c6a0a25e995 100644 (file)
--- a/bindings/ruby/ext/ruby_whisper_segment.c
+++ b/bindings/ruby/ext/ruby_whisper_segment.c
@@ -1,19 +1,24 @@
  #include <ruby.h>
  #include "ruby_whisper.h"
  
-#define N_KEY_NAMES 5
+#define N_KEY_NAMES 6
  
+extern ID id___method__;
+extern ID id_to_enum;
  static VALUE sym_start_time;
  static VALUE sym_end_time;
  static VALUE sym_text;
  static VALUE sym_no_speech_prob;
  static VALUE sym_speaker_turn_next;
+static VALUE sym_n_tokens;
  static VALUE key_names;
  
  extern const rb_data_type_t ruby_whisper_type;
  
  extern VALUE cSegment;
  
+extern VALUE ruby_whisper_token_s_init(struct whisper_context *context, int i_segment, int index);
+
  static void
  rb_whisper_segment_mark(void *p)
  {
@@ -72,7 +77,7 @@ ruby_whisper_segment_get_start_time(VALUE self)
    ruby_whisper_segment *rws;
    TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
    ruby_whisper *rw;
-  TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(rws->context, rw);
    const int64_t t0 = whisper_full_get_segment_t0(rw->context, rws->index);
    // able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
    return LONG2NUM(t0 * 10);
@@ -90,7 +95,7 @@ ruby_whisper_segment_get_end_time(VALUE self)
    ruby_whisper_segment *rws;
    TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
    ruby_whisper *rw;
-  TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(rws->context, rw);
    const int64_t t1 = whisper_full_get_segment_t1(rw->context, rws->index);
    // able to multiply 10 without overflow because to_timestamp() in whisper.cpp does it
    return LONG2NUM(t1 * 10);
@@ -108,7 +113,7 @@ ruby_whisper_segment_get_speaker_turn_next(VALUE self)
    ruby_whisper_segment *rws;
    TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
    ruby_whisper *rw;
-  TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(rws->context, rw);
    return whisper_full_get_segment_speaker_turn_next(rw->context, rws->index) ? Qtrue : Qfalse;
  }
  
@@ -122,7 +127,7 @@ ruby_whisper_segment_get_text(VALUE self)
    ruby_whisper_segment *rws;
    TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
    ruby_whisper *rw;
-  TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(rws->context, rw);
    const char * text = whisper_full_get_segment_text(rw->context, rws->index);
    return rb_str_new2(text);
  }
@@ -137,15 +142,67 @@ ruby_whisper_segment_get_no_speech_prob(VALUE self)
    ruby_whisper_segment *rws;
    TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
    ruby_whisper *rw;
-  TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(rws->context, rw);
    return DBL2NUM(whisper_full_get_segment_no_speech_prob(rw->context, rws->index));
  }
  
+/*
+ * Get number of tokens in the segment
+ *
+ * call-seq:
+ *   n_tokens -> Integer
+ */
+static VALUE
+ruby_whisper_segment_get_n_tokens(VALUE self)
+{
+  ruby_whisper_segment *rws;
+  TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
+  ruby_whisper *rw;
+  GetContext(rws->context, rw);
+  return INT2NUM(whisper_full_n_tokens(rw->context, rws->index));
+}
+
+/*
+ * Yields each Whisper::Token:
+ *
+ *   whisper.each_segment.first.each_token do |token|
+ *     p token
+ *   end
+ *
+ * Returns an Enumerator if no block is given:
+ *
+ *   whisper.each_segment.first.each_token.to_a # => [#<Whisper::Token>, ...]
+ *
+ * call-seq:
+ *   each_token {|token| ... }
+ *   each_token -> Enumerator
+ */
+static VALUE
+ruby_whisper_segment_each_token(VALUE self)
+{
+  if (!rb_block_given_p()) {
+    const VALUE method_name = rb_funcall(self, id___method__, 0);
+    return rb_funcall(self, id_to_enum, 1, method_name);
+  }
+
+  ruby_whisper_segment *rws;
+  TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
+  ruby_whisper *rw;
+  GetContext(rws->context, rw);
+
+  const int n_tokens = whisper_full_n_tokens(rw->context, rws->index);
+  for (int i = 0; i < n_tokens; ++i) {
+    rb_yield(ruby_whisper_token_s_init(rw->context, rws->index, i));
+  }
+
+  return self;
+}
+
  /*
   * call-seq:
   *   deconstruct_keys(keys) -> hash
   *
- *  Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
+ *  Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next, :n_tokens
   *
   *   whisper.each_segment do |segment|
   *     segment => {start_time:, end_time:, text:, no_speech_prob:, speaker_turn_next:}
@@ -159,7 +216,7 @@ ruby_whisper_segment_deconstruct_keys(VALUE self, VALUE keys)
    ruby_whisper_segment *rws;
    TypedData_Get_Struct(self, ruby_whisper_segment, &ruby_whisper_segment_type, rws);
    ruby_whisper *rw;
-  TypedData_Get_Struct(rws->context, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(rws->context, rw);
  
    VALUE hash = rb_hash_new();
    long n_keys;
@@ -189,13 +246,16 @@ ruby_whisper_segment_deconstruct_keys(VALUE self, VALUE keys)
      if (key == sym_speaker_turn_next) {
        rb_hash_aset(hash, key, ruby_whisper_segment_get_speaker_turn_next(self));
      }
+    if (key == sym_n_tokens) {
+      rb_hash_aset(hash, key, ruby_whisper_segment_get_n_tokens(self));
+    }
    }
  
    return hash;
  }
  
  void
-init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cContext)
+init_ruby_whisper_segment(VALUE *mWhisper)
  {
    cSegment  = rb_define_class_under(*mWhisper, "Segment", rb_cObject);
  
@@ -204,6 +264,7 @@ init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cContext)
    sym_text = ID2SYM(rb_intern("text"));
    sym_no_speech_prob = ID2SYM(rb_intern("no_speech_prob"));
    sym_speaker_turn_next = ID2SYM(rb_intern("speaker_turn_next"));
+  sym_n_tokens = ID2SYM(rb_intern("n_tokens"));
    key_names = rb_ary_new3(
      N_KEY_NAMES,
      sym_start_time,
@@ -219,5 +280,8 @@ init_ruby_whisper_segment(VALUE *mWhisper, VALUE *cContext)
    rb_define_method(cSegment, "speaker_turn_next?", ruby_whisper_segment_get_speaker_turn_next, 0);
    rb_define_method(cSegment, "text", ruby_whisper_segment_get_text, 0);
    rb_define_method(cSegment, "no_speech_prob", ruby_whisper_segment_get_no_speech_prob, 0);
+  rb_define_method(cSegment, "n_tokens", ruby_whisper_segment_get_n_tokens, 0);
+  rb_define_method(cSegment, "each_token", ruby_whisper_segment_each_token, 0);
    rb_define_method(cSegment, "deconstruct_keys", ruby_whisper_segment_deconstruct_keys, 1);
  }
+#undef N_KEY_NAMES
diff --git a/bindings/ruby/ext/ruby_whisper_token.c b/bindings/ruby/ext/ruby_whisper_token.c

new file mode 100644 (file)

index 0000000..e0dbb79
--- /dev/null
+++ b/bindings/ruby/ext/ruby_whisper_token.c
@@ -0,0 +1,353 @@
+#include <ruby.h>
+#include "ruby_whisper.h"
+
+#define N_KEY_NAMES 11
+
+extern VALUE cToken;
+extern const rb_data_type_t ruby_whisper_type;
+
+static VALUE key_names;
+static VALUE sym_id;
+static VALUE sym_tid;
+static VALUE sym_probability;
+static VALUE sym_log_probability;
+static VALUE sym_pt;
+static VALUE sym_ptsum;
+static VALUE sym_t_dtw;
+static VALUE sym_voice_length;
+static VALUE sym_start_time;
+static VALUE sym_end_time;
+static VALUE sym_text;
+
+/*
+ * Free function for Whisper::Token wrappers.
+ *
+ * The wrapper owns a heap copy of the token data (made in
+ * ruby_whisper_token_s_init), so that copy is released together with the
+ * wrapper struct. The text pointer is not freed here.
+ * NOTE(review): text is presumably owned by the whisper context that
+ * produced it -- confirm its lifetime against whisper.h.
+ */
+static void
+ruby_whisper_token_free(void *p)
+{
+  ruby_whisper_token *rwt = (ruby_whisper_token *)p;
+  if (!rwt) {
+    return;
+  }
+  if (rwt->token_data) {
+    xfree(rwt->token_data);
+  }
+  xfree(rwt);
+}
+
+/*
+ * Reports the memory held by a token wrapper: the wrapper struct itself
+ * plus the owned copy of the token data, when present.
+ */
+static size_t
+ruby_whisper_token_memsize(const void *p)
+{
+  const ruby_whisper_token *rwt = (const ruby_whisper_token *)p;
+  if (!rwt) {
+    return 0;
+  }
+  size_t size = sizeof(*rwt);
+  if (rwt->token_data) {
+    size += sizeof(*rwt->token_data);
+  }
+  return size;
+}
+
+static const rb_data_type_t ruby_whisper_token_type = {
+  "ruby_whisper_token",
+  {0, ruby_whisper_token_free, ruby_whisper_token_memsize,},
+  0, 0,
+  0
+};
+
+/*
+ * Allocates an uninitialized Whisper::Token. Both fields start as NULL;
+ * GetToken raises "Not initialized" while token_data is NULL.
+ */
+static VALUE
+ruby_whisper_token_allocate(VALUE klass)
+{
+  ruby_whisper_token *rwt;
+  VALUE token = TypedData_Make_Struct(klass, ruby_whisper_token, &ruby_whisper_token_type, rwt);
+  rwt->token_data = NULL;
+  rwt->text = NULL;
+  return token;
+}
+
+/*
+ * Builds a Whisper::Token for token i_token of segment i_segment.
+ *
+ * whisper_full_get_token_data returns the data by value, so it is copied
+ * into heap memory owned by the wrapper. Storing the address of a local
+ * variable here would leave token_data dangling once this function returns.
+ */
+VALUE
+ruby_whisper_token_s_init(struct whisper_context *context, int i_segment, int i_token)
+{
+  const VALUE token = ruby_whisper_token_allocate(cToken);
+  ruby_whisper_token *rwt;
+  TypedData_Get_Struct(token, ruby_whisper_token, &ruby_whisper_token_type, rwt);
+  /* Assign the allocation to the wrapper first so the free function releases it on any later failure. */
+  rwt->token_data = ALLOC(whisper_token_data);
+  *rwt->token_data = whisper_full_get_token_data(context, i_segment, i_token);
+  rwt->text = whisper_full_get_token_text(context, i_segment, i_token);
+  return token;
+}
+
+/*
+ * Token ID.
+ *
+ * call-seq:
+ *   id -> Integer
+ */
+static VALUE
+ruby_whisper_token_get_id(VALUE self)
+{
+  ruby_whisper_token *rwt;
+  GetToken(self, rwt);
+  return INT2NUM(rwt->token_data->id);
+}
+
+/*
+ * Forced timestamp token ID.
+ *
+ * call-seq:
+ *   tid -> Integer
+ */
+static VALUE
+ruby_whisper_token_get_tid(VALUE self)
+{
+  ruby_whisper_token *rwt;
+  GetToken(self, rwt);
+  return INT2NUM(rwt->token_data->tid);
+}
+
+/*
+ * Probability of the token.
+ *
+ * call-seq:
+ *   probability -> Float
+ */
+static VALUE
+ruby_whisper_token_get_p(VALUE self)
+{
+  ruby_whisper_token *rwt;
+  GetToken(self, rwt);
+  return DBL2NUM(rwt->token_data->p);
+}
+
+/*
+ * Log probability of the token.
+ *
+ * call-seq:
+ *   log_probability -> Float
+ */
+static VALUE
+ruby_whisper_token_get_plog(VALUE self)
+{
+  ruby_whisper_token *rwt;
+  GetToken(self, rwt);
+  return DBL2NUM(rwt->token_data->plog);
+}
+
+/*
+ * Probability of the timestamp token.
+ *
+ * call-seq:
+ *   pt -> Float
+ */
+static VALUE
+ruby_whisper_token_get_pt(VALUE self)
+{
+  ruby_whisper_token *rwt;
+  GetToken(self, rwt);
+  return DBL2NUM(rwt->token_data->pt);
+}
+
+/*
+ * Sum of probability of all timestamp tokens.
+ *
+ * call-seq:
+ *   ptsum -> Float
+ */
+static VALUE
+ruby_whisper_token_get_ptsum(VALUE self)
+{
+  ruby_whisper_token *rwt;
+  GetToken(self, rwt);
+  return DBL2NUM(rwt->token_data->ptsum);
+}
+
+/*
+ * [EXPERIMENTAL] Token-level timestamps with DTW
+ *
+ * Do not use if you haven't computed token-level timestamps with dtw.
+ * Roughly corresponds to the moment in audio in which the token was output.
+ *
+ * call-seq:
+ *   t_dtw -> Integer
+ */
+static VALUE
+ruby_whisper_token_get_t_dtw(VALUE self)
+{
+  ruby_whisper_token *rwt;
+  GetToken(self, rwt);
+  return LONG2NUM(rwt->token_data->t_dtw);
+}
+
+/*
+ * Voice length of the token.
+ *
+ * call-seq:
+ *   voice_length -> Float
+ */
+static VALUE
+ruby_whisper_token_get_vlen(VALUE self)
+{
+  ruby_whisper_token *rwt;
+  GetToken(self, rwt);
+  return DBL2NUM(rwt->token_data->vlen);
+}
+
+/*
+ * Get the token text of the token.
+ *
+ * call-seq:
+ *   text -> String
+ */
+static VALUE
+ruby_whisper_token_get_text(VALUE self)
+{
+  ruby_whisper_token *rwt;
+  GetToken(self, rwt);
+  return rb_str_new2(rwt->text);
+}
+
+
+/*
+ * Start time of the token.
+ *
+ * Token-level timestamp data.
+ * Do not use if you haven't computed token-level timestamps.
+ *
+ * The native t0 value is multiplied by 10 before being returned
+ * (presumably converting whisper.cpp's 10 ms units to milliseconds;
+ * confirm against whisper.h).
+ *
+ * call-seq:
+ *   start_time -> Integer
+ */
+static VALUE
+ruby_whisper_token_get_start_time(VALUE self)
+{
+  ruby_whisper_token *rwt;
+  GetToken(self, rwt);
+  /*
+   * whisper.h declares t0 as int64_t. LL2NUM keeps all 64 bits, whereas
+   * LONG2NUM would truncate on platforms where long is 32 bits wide.
+   */
+  return LL2NUM(rwt->token_data->t0 * 10);
+}
+
+/*
+ * End time of the token.
+ *
+ * Token-level timestamp data.
+ * Do not use if you haven't computed token-level timestamps.
+ *
+ * The native t1 value is multiplied by 10 before being returned
+ * (presumably converting whisper.cpp's 10 ms units to milliseconds;
+ * confirm against whisper.h).
+ *
+ * call-seq:
+ *   end_time -> Integer
+ */
+static VALUE
+ruby_whisper_token_get_end_time(VALUE self)
+{
+  ruby_whisper_token *rwt;
+  GetToken(self, rwt);
+  /*
+   * whisper.h declares t1 as int64_t. LL2NUM keeps all 64 bits, whereas
+   * LONG2NUM would truncate on platforms where long is 32 bits wide.
+   */
+  return LL2NUM(rwt->token_data->t1 * 10);
+}
+
+/*
+ * Builds the Hash used by Ruby pattern matching.
+ *
+ * When +keys+ is +nil+, every supported key is included. Otherwise only the
+ * supported keys listed in +keys+ are included; other entries are ignored.
+ * An Array with more entries than there are supported keys yields an empty
+ * Hash. Raises TypeError if +keys+ is neither +nil+ nor an Array.
+ *
+ * call-seq:
+ *   deconstruct_keys(keys) -> hash
+ *
+ *  Possible keys: :id, :tid, :probability, :log_probability, :pt, :ptsum,
+ *                 :t_dtw, :voice_length, :start_time, :end_time, :text
+ *
+ *    segment.each_token do |token|
+ *      token => {text:, probability:}
+ *      puts "#{text} (#{probability})"
+ *    end
+ */
+static VALUE
+ruby_whisper_token_deconstruct_keys(VALUE self, VALUE keys)
+{
+  ruby_whisper_token *rwt;
+  GetToken(self, rwt);
+  VALUE hash = rb_hash_new();
+  long n_keys = 0;
+
+  if (NIL_P(keys)) {
+    /* nil means "all keys": fall back to the full list built at init time. */
+    keys = key_names;
+    n_keys = N_KEY_NAMES;
+  } else {
+    /* RARRAY_LEN is only valid on an Array, so reject anything else first. */
+    Check_Type(keys, T_ARRAY);
+    n_keys = RARRAY_LEN(keys);
+    if (n_keys > N_KEY_NAMES) {
+      /* More keys requested than exist: at least one cannot be satisfied. */
+      return hash;
+    }
+  }
+
+  /* The index is long to match the type of n_keys / RARRAY_LEN. */
+  for (long i = 0; i < n_keys; i++) {
+    VALUE key = rb_ary_entry(keys, i);
+    if (key == sym_start_time) {
+      rb_hash_aset(hash, key, ruby_whisper_token_get_start_time(self));
+    } else if (key == sym_end_time) {
+      rb_hash_aset(hash, key, ruby_whisper_token_get_end_time(self));
+    } else if (key == sym_text) {
+      rb_hash_aset(hash, key, ruby_whisper_token_get_text(self));
+    } else if (key == sym_probability) {
+      rb_hash_aset(hash, key, ruby_whisper_token_get_p(self));
+    } else if (key == sym_id) {
+      rb_hash_aset(hash, key, ruby_whisper_token_get_id(self));
+    } else if (key == sym_tid) {
+      rb_hash_aset(hash, key, ruby_whisper_token_get_tid(self));
+    } else if (key == sym_log_probability) {
+      rb_hash_aset(hash, key, ruby_whisper_token_get_plog(self));
+    } else if (key == sym_pt) {
+      rb_hash_aset(hash, key, ruby_whisper_token_get_pt(self));
+    } else if (key == sym_ptsum) {
+      rb_hash_aset(hash, key, ruby_whisper_token_get_ptsum(self));
+    } else if (key == sym_t_dtw) {
+      rb_hash_aset(hash, key, ruby_whisper_token_get_t_dtw(self));
+    } else if (key == sym_voice_length) {
+      rb_hash_aset(hash, key, ruby_whisper_token_get_vlen(self));
+    }
+    /* Any other key is silently skipped. */
+  }
+
+  return hash;
+}
+
+
+/*
+ * Defines Whisper::Token under the given module: registers its allocator,
+ * caches the Symbols used as deconstruct_keys keys, builds the list of all
+ * supported keys, and binds the attribute readers and deconstruct_keys.
+ */
+void
+init_ruby_whisper_token(VALUE *mWhisper)
+{
+  cToken = rb_define_class_under(*mWhisper, "Token", rb_cObject);
+
+  rb_define_alloc_func(cToken, ruby_whisper_token_allocate);
+
+  sym_id = ID2SYM(rb_intern("id"));
+  sym_tid = ID2SYM(rb_intern("tid"));
+  sym_probability = ID2SYM(rb_intern("probability"));
+  sym_log_probability = ID2SYM(rb_intern("log_probability"));
+  sym_pt = ID2SYM(rb_intern("pt"));
+  sym_ptsum = ID2SYM(rb_intern("ptsum"));
+  sym_t_dtw = ID2SYM(rb_intern("t_dtw"));
+  sym_voice_length = ID2SYM(rb_intern("voice_length"));
+  sym_start_time = ID2SYM(rb_intern("start_time"));
+  sym_end_time = ID2SYM(rb_intern("end_time"));
+  sym_text = ID2SYM(rb_intern("text"));
+
+  /*
+   * key_names is a heap-allocated Array referenced only from a C variable.
+   * Register the variable with the GC before assigning it so the Array is
+   * not collected while deconstruct_keys(nil) still relies on it.
+   */
+  rb_global_variable(&key_names);
+  key_names = rb_ary_new3(
+    N_KEY_NAMES,
+    sym_id,
+    sym_tid,
+    sym_probability,
+    sym_log_probability,
+    sym_pt,
+    sym_ptsum,
+    sym_t_dtw,
+    sym_voice_length,
+    sym_start_time,
+    sym_end_time,
+    sym_text
+  );
+
+  rb_define_method(cToken, "id", ruby_whisper_token_get_id, 0);
+  rb_define_method(cToken, "tid", ruby_whisper_token_get_tid, 0);
+  rb_define_method(cToken, "probability", ruby_whisper_token_get_p, 0);
+  rb_define_method(cToken, "log_probability", ruby_whisper_token_get_plog, 0);
+  rb_define_method(cToken, "pt", ruby_whisper_token_get_pt, 0);
+  rb_define_method(cToken, "ptsum", ruby_whisper_token_get_ptsum, 0);
+  rb_define_method(cToken, "t_dtw", ruby_whisper_token_get_t_dtw, 0);
+  rb_define_method(cToken, "voice_length", ruby_whisper_token_get_vlen, 0);
+  rb_define_method(cToken, "start_time", ruby_whisper_token_get_start_time, 0);
+  rb_define_method(cToken, "end_time", ruby_whisper_token_get_end_time, 0);
+  rb_define_method(cToken, "text", ruby_whisper_token_get_text, 0);
+
+  rb_define_method(cToken, "deconstruct_keys", ruby_whisper_token_deconstruct_keys, 1);
+}
+
+#undef N_KEY_NAMES
diff --git a/bindings/ruby/ext/ruby_whisper_transcribe.cpp b/bindings/ruby/ext/ruby_whisper_transcribe.cpp

index dc64af00808fdb6d8bedbda6759bbb4ab293e984..594b2db90e3b33635180ab909c0850fe6c810bfe 100644 (file)
--- a/bindings/ruby/ext/ruby_whisper_transcribe.cpp
+++ b/bindings/ruby/ext/ruby_whisper_transcribe.cpp
@@ -43,7 +43,7 @@ ruby_whisper_transcribe(int argc, VALUE *argv, VALUE self) {
  
    int n_processors = opts[0] == Qundef ? 1 : NUM2INT(opts[0]);
  
-  TypedData_Get_Struct(self, ruby_whisper, &ruby_whisper_type, rw);
+  GetContext(self, rw);
    TypedData_Get_Struct(params, ruby_whisper_params, &ruby_whisper_params_type, rwp);
  
    if (!rb_respond_to(wave_file_path, id_to_s)) {
diff --git a/bindings/ruby/ext/ruby_whisper_vad_segments.c b/bindings/ruby/ext/ruby_whisper_vad_segments.c

index ae1c21b66135ab84b5434702f9da4c0093920361..1bb375937a43e3a556532d259bdf40d31c2ec1b8 100644 (file)
--- a/bindings/ruby/ext/ruby_whisper_vad_segments.c
+++ b/bindings/ruby/ext/ruby_whisper_vad_segments.c
@@ -74,10 +74,7 @@ ruby_whisper_vad_segments_each(VALUE self)
      return rb_funcall(self, id_to_enum, 1, method_name);
    }
  
-  TypedData_Get_Struct(self, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, rwvss);
-  if (rwvss->segments == NULL) {
-    rb_raise(rb_eRuntimeError, "Doesn't have reference to segments internally");
-  }
+  GetVADSegments(self, rwvss);
    n_segments = whisper_vad_segments_n_segments(rwvss->segments);
    for (i = 0; i < n_segments; ++i) {
      rb_yield(rb_whisper_vad_segment_s_new(self, i));
@@ -92,10 +89,7 @@ ruby_whisper_vad_segments_get_length(VALUE self)
    ruby_whisper_vad_segments *rwvss;
    int n_segments;
  
-  TypedData_Get_Struct(self, ruby_whisper_vad_segments, &ruby_whisper_vad_segments_type, rwvss);
-  if (rwvss->segments == NULL) {
-    rb_raise(rb_eRuntimeError, "Doesn't have reference to segments internally");
-  }
+  GetVADSegments(self, rwvss);
    n_segments = whisper_vad_segments_n_segments(rwvss->segments);
  
    return INT2NUM(n_segments);
diff --git a/bindings/ruby/lib/whisper/model/uri.rb b/bindings/ruby/lib/whisper/model/uri.rb

index 765f78652c2f9b620c90e6343df7f460025b6d16..8eb57e5e8cf270c4433e822a22346f4ee0f94d2c 100644 (file)
--- a/bindings/ruby/lib/whisper/model/uri.rb
+++ b/bindings/ruby/lib/whisper/model/uri.rb
@@ -182,7 +182,6 @@ module Whisper
        base-q8_0
        small
        small.en
-      small.en-tdrz
        small-q5_1
        small.en-q5_1
        small-q8_0
@@ -204,6 +203,12 @@ module Whisper
        models[name] = URI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}.bin")
      }
  
+    %w[
+      small.en-tdrz
+    ].each do |name|
+      @pre_converted_models[name] = URI.new("https://huggingface.co/akashmjn/tinydiarize-whisper.cpp/resolve/main/ggml-#{name}.bin")
+    end
+
      %w[
        silero-v5.1.2
        silero-v6.2.0
@@ -211,22 +216,14 @@ module Whisper
        @pre_converted_models[name] = URI.new("https://huggingface.co/ggml-org/whisper-vad/resolve/main/ggml-#{name}.bin")
      end
  
-    @coreml_compiled_models = %w[
-      tiny
-      tiny.en
-      base
-      base.en
-      small
-      small.en
-      medium
-      medium.en
-      large-v1
-      large-v2
-      large-v3
-      large-v3-turbo
-    ].each_with_object({}) do |name, models|
-      models[@pre_converted_models[name]] = ZipURI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}-encoder.mlmodelc.zip")
-    end
+    @coreml_compiled_models = @pre_converted_models.each_with_object({}) {|(name, uri), models|
+      next if name.end_with?("-tdrz") || name.start_with?("silero-")
+
+      if matched = name.match(/\A(?<name>.*)-q\d_\d\z/)
+        name = matched[:name]
+      end
+      models[uri] = ZipURI.new("https://huggingface.co/ggerganov/whisper.cpp/resolve/main/ggml-#{name}-encoder.mlmodelc.zip")
+    }
  
      class << self
        attr_reader :pre_converted_models, :coreml_compiled_models
diff --git a/bindings/ruby/sig/whisper.rbs b/bindings/ruby/sig/whisper.rbs

index dcb387a25a9494f41dc2868f75a939f9705c6b14..1137e3f36abcb85cf4b889ffa947e6a405d519e8 100644 (file)
--- a/bindings/ruby/sig/whisper.rbs
+++ b/bindings/ruby/sig/whisper.rbs
@@ -434,7 +434,8 @@ module Whisper
        end_time: (Integer | nil),
        text: (String | nil),
        no_speech_prob: (Float | nil),
-      speaker_turn_next: (true | false | nil)
+      speaker_turn_next: (true | false | nil),
+      n_tokens: (Integer | nil)
      }
  
      # Start time in milliseconds.
@@ -446,13 +447,32 @@ module Whisper
      def end_time: () -> Integer
  
      # Whether the next segment is predicted as a speaker turn.
+    #
      def speaker_turn_next?: () -> (true | false)
  
      def text: () -> String
      def no_speech_prob: () -> Float
+
+    # Get number of tokens in the segment
+    #
+    def n_tokens: () -> Integer
+
+    # Yields each Whisper::Token:
+    #
+    #   whisper.each_segment.first.each_token do |token|
+    #     p token
+    #   end
+    #
+    # Returns an Enumerator if no block is given:
+    #
+    #   whisper.each_segment.first.each_token.to_a # => [#<Whisper::Token>, ...]
+    #
+    def each_token: { (Token) -> void } -> void
+                  | () -> Enumerator[Token]
      def to_srt_cue: () -> String
      def to_webvtt_cue: () -> String
  
+
      #  Possible keys: :start_time, :end_time, :text, :no_speech_prob, :speaker_turn_next
      #
      #      whisper.each_segment do |segment|
@@ -460,7 +480,77 @@ module Whisper
      #
      #        puts "[#{start_time} --> #{end_time}] #{text} (no speech prob: #{no_speech_prob}#{speaker_turn_next ? ', speaker turns next' : ''})"
      #      end
-    def deconstruct_keys: (Array[:start_time | :end_time | :text | :no_speech_prob | :speaker_turn_next] | nil) -> deconstructed_keys
+    def deconstruct_keys: (Array[:start_time | :end_time | :text | :no_speech_prob | :speaker_turn_next | :n_tokens] | nil) -> deconstructed_keys
+  end
+
+  # A single token of a transcribed segment.
+  #
+  # Declared as a class because the extension defines Whisper::Token with
+  # rb_define_class_under and instances are yielded by Segment#each_token.
+  class Token
+    type deconstructed_keys = {
+      id: (Integer | nil),
+      tid: (Integer | nil),
+      probability: (Float | nil),
+      log_probability: (Float | nil),
+      pt: (Float | nil),
+      ptsum: (Float | nil),
+      t_dtw: (Integer | nil),
+      voice_length: (Float | nil),
+      text: (String | nil),
+      start_time: (Integer | nil),
+      end_time: (Integer | nil)
+    }
+
+    # Token ID.
+    #
+    def id: () -> Integer
+
+    # Forced timestamp token ID.
+    #
+    def tid: () -> Integer
+
+    # Probability of the token.
+    #
+    def probability: () -> Float
+
+    # Log probability of the token.
+    #
+    def log_probability: () -> Float
+
+    # Probability of the timestamp token.
+    #
+    def pt: () -> Float
+
+    # Sum of probability of all timestamp tokens.
+    #
+    def ptsum: () -> Float
+
+    # [EXPERIMENTAL] Token-level timestamps with DTW
+    #
+    # Do not use if you haven't computed token-level timestamps with dtw.
+    # Roughly corresponds to the moment in audio in which the token was output.
+    #
+    def t_dtw: () -> Integer
+
+    # Voice length of the token.
+    #
+    def voice_length: () -> Float
+
+    # Start time of the token.
+    #
+    # Token-level timestamp data.
+    # Do not use if you haven't computed token-level timestamps.
+    #
+    def start_time: () -> Integer
+
+    # End time of the token.
+    #
+    # Token-level timestamp data.
+    # Do not use if you haven't computed token-level timestamps.
+    #
+    def end_time: () -> Integer
+
+    # Get the token text of the token.
+    #
+    def text: () -> String
+
+    #  Possible keys: :id, :tid, :probability, :log_probability, :pt, :ptsum,
+    #                 :t_dtw, :voice_length, :start_time, :end_time, :text
+    #
+    #      segment.each_token do |token|
+    #        token => {text:, probability:}
+    #        puts "#{text} (#{probability})"
+    #      end
+    def deconstruct_keys: (Array[:id | :tid | :probability | :log_probability | :pt | :ptsum | :t_dtw | :voice_length | :start_time | :end_time | :text] | nil) -> deconstructed_keys
    end
  
    module VAD
diff --git a/bindings/ruby/test/test_token.rb b/bindings/ruby/test/test_token.rb

new file mode 100644 (file)

index 0000000..214b355
--- /dev/null
+++ b/bindings/ruby/test/test_token.rb
@@ -0,0 +1,68 @@
+require_relative "helper"
+
+class TestToken < TestBase
+  def setup
+    @segment = whisper.each_segment.first
+    @token = @segment.each_token.first
+  end
+
+  def test_n_tokens
+    assert_equal 27, @segment.n_tokens
+  end
+
+  def test_allocate
+    token = Whisper::Token.allocate
+    assert_raise  do
+      token.id
+    end
+  end
+
+  def test_each_token
+    i = 0
+    @segment.each_token do |token|
+      i += 1
+      assert_instance_of Whisper::Token, token
+    end
+    assert_equal 27, i
+  end
+
+  def test_each_token_without_block
+    assert_instance_of Enumerator, @segment.each_token
+  end
+
+  def test_token
+    assert_instance_of Whisper::Token, @token
+
+    assert_instance_of Integer, @token.id
+    assert_instance_of Float, @token.probability
+    assert_instance_of Float, @token.log_probability
+
+    assert_instance_of Integer, @token.tid
+    assert_instance_of Float, @token.pt
+    assert_instance_of Float, @token.ptsum
+
+    assert_instance_of Integer, @token.start_time
+    assert_instance_of Integer, @token.end_time
+
+    assert_instance_of Integer, @token.t_dtw
+
+    assert_instance_of Float, @token.voice_length
+
+    assert_instance_of String, @token.text
+  end
+
+  def test_text
+    assert_equal ["[_BEG_]", " And", " so", " my", " fellow", " Americans", ",", " ask", " not", " what", " your", " country", " can", " do", " for", " you", ",", " ask", " what", " you", " can", " do", " for", " your", " country", ".", "[_TT_550]"],
+                 @segment.each_token.collect(&:text)
+  end
+
+  def test_deconstruct_keys_with_nil
+    assert_equal({}, @token.deconstruct_keys(nil))
+  end
+
+  def test_deconstruct_keys_with_keys
+    keys = %i[id tid probability log_probability pt ptsum t_dtw voice_length start_time end_time text]
+    expected = keys.collect {|key| [key, @token.send(key)] }.to_h
+    assert_equal expected, @token.deconstruct_keys(keys)
+  end
+end
diff --git a/bindings/ruby/test/test_whisper.rb b/bindings/ruby/test/test_whisper.rb

index 23479b7ae7a2f153ff464bd1e91b98cd7f449e65..96e248aca3a9513bc1606fcfc2e0a3503bc796e6 100644 (file)
--- a/bindings/ruby/test/test_whisper.rb
+++ b/bindings/ruby/test/test_whisper.rb
@@ -149,6 +149,13 @@ class TestWhisper < TestBase
      $stderr = stderr
    end
  
+  def test_access_attribute_without_initialization
+    whisper = Whisper::Context.allocate
+    assert_raise do
+      whisper.model_type
+    end
+  end
+
    sub_test_case "full" do
      def setup
        super
author	KITAITI Makoto <redacted>
	Wed, 24 Dec 2025 07:52:16 +0000 (16:52 +0900)
committer	GitHub <redacted>
	Wed, 24 Dec 2025 07:52:16 +0000 (16:52 +0900)
bindings/ruby/README.md		patch \| blob \| history
bindings/ruby/ext/extconf.rb		patch \| blob \| history
bindings/ruby/ext/ruby_whisper.c		patch \| blob \| history
bindings/ruby/ext/ruby_whisper.h		patch \| blob \| history
bindings/ruby/ext/ruby_whisper_context.c		patch \| blob \| history
bindings/ruby/ext/ruby_whisper_model.c		patch \| blob \| history
bindings/ruby/ext/ruby_whisper_params.c		patch \| blob \| history
bindings/ruby/ext/ruby_whisper_segment.c		patch \| blob \| history
bindings/ruby/ext/ruby_whisper_token.c	[new file with mode: 0644]	patch \| blob
bindings/ruby/ext/ruby_whisper_transcribe.cpp		patch \| blob \| history
bindings/ruby/ext/ruby_whisper_vad_segments.c		patch \| blob \| history
bindings/ruby/lib/whisper/model/uri.rb		patch \| blob \| history
bindings/ruby/sig/whisper.rbs		patch \| blob \| history
bindings/ruby/test/test_token.rb	[new file with mode: 0644]	patch \| blob
bindings/ruby/test/test_whisper.rb		patch \| blob \| history