whisper.objc : add real-time processing (#97)
author     Georgi Gerganov <redacted>
           Sat, 26 Nov 2022 15:28:28 +0000 (17:28 +0200)
committer  Georgi Gerganov <redacted>
           Sat, 26 Nov 2022 16:32:46 +0000 (18:32 +0200)
Similar to the "stream" app
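In short: the app gets a new "Real-time" button; while capturing, the audio callback re-triggers onTranscribe:, which now runs whisper_full() on a background GCD queue (with single_segment enabled) and posts the text back to the main thread. A condensed, untested sketch of that pattern follows; transcribe_async and the on_result block are illustrative names, not part of the app:

    #import <Foundation/Foundation.h>
    #include "whisper.h"

    // run transcription off the main thread; `samples` must stay valid until the block runs
    void transcribe_async(struct whisper_context * ctx,
                          const float * samples, int n_samples,
                          void (^on_result)(NSString * text)) {
        dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
            struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
            params.n_threads      = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]);
            params.single_segment = true; // one segment per pass keeps streaming output snappy
            params.language       = "en";

            NSString * result = @"";
            if (whisper_full(ctx, params, samples, n_samples) == 0) {
                const int n_segments = whisper_full_n_segments(ctx);
                for (int i = 0; i < n_segments; i++) {
                    const char * text = whisper_full_get_segment_text(ctx, i);
                    result = [result stringByAppendingString:[NSString stringWithUTF8String:text]];
                }
            } else {
                result = @"Failed to run the model";
            }

            // UIKit must only be touched on the main thread, so hand the text back there
            dispatch_async(dispatch_get_main_queue(), ^{
                on_result(result);
            });
        });
    }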

examples/whisper.objc/whisper.objc/Base.lproj/Main.storyboard
examples/whisper.objc/whisper.objc/ViewController.h
examples/whisper.objc/whisper.objc/ViewController.m
whisper.cpp
whisper.h

index 5c92ba8b09f8927a3bc55fa4327c197e244b02e4..065ccac2ef09c70dc389570aff24f9ca4d750fcb 100644
--- a/examples/whisper.objc/whisper.objc/Base.lproj/Main.storyboard
+++ b/examples/whisper.objc/whisper.objc/Base.lproj/Main.storyboard
@@ -1,8 +1,8 @@
 <?xml version="1.0" encoding="UTF-8"?>
-<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21225" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21507" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
     <device id="retina6_0" orientation="portrait" appearance="light"/>
     <dependencies>
-        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21207"/>
+        <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21505"/>
         <capability name="Safe area layout guides" minToolsVersion="9.0"/>
         <capability name="System colors in document resources" minToolsVersion="11.0"/>
         <capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
@@ -40,7 +40,7 @@
                                 <autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
                                 <color key="backgroundColor" systemColor="systemBackgroundColor"/>
                                 <color key="textColor" systemColor="labelColor"/>
-                                <fontDescription key="fontDescription" type="system" pointSize="20"/>
+                                <fontDescription key="fontDescription" name="Georgia" family="Georgia" pointSize="16"/>
                                 <textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
                             </textView>
                             <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="Brs-xi-o8i">
                                     <action selector="onTranscribePrepare:" destination="BYZ-38-t0r" eventType="touchDown" id="16T-dN-dfB"/>
                                 </connections>
                             </button>
+                            <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="AaW-T2-Ndw">
+                                <rect key="frame" x="199" y="191" width="156" height="49"/>
+                                <autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
+                                <color key="backgroundColor" systemColor="opaqueSeparatorColor"/>
+                                <color key="tintColor" systemColor="opaqueSeparatorColor"/>
+                                <state key="normal" title="Real-time">
+                                    <color key="titleColor" systemColor="labelColor"/>
+                                </state>
+                                <connections>
+                                    <action selector="onRealtime:" destination="BYZ-38-t0r" eventType="touchUpInside" id="nhn-jT-aQJ"/>
+                                </connections>
+                            </button>
                         </subviews>
                         <viewLayoutGuide key="safeArea" id="6Tk-OE-BBY"/>
                         <color key="backgroundColor" systemColor="systemBackgroundColor"/>
@@ -64,6 +76,7 @@
                         </constraints>
                     </view>
                     <connections>
+                        <outlet property="buttonRealtime" destination="AaW-T2-Ndw" id="gcU-Ol-BOo"/>
                         <outlet property="buttonToggleCapture" destination="VOi-PT-Rbu" id="nis-VC-DQO"/>
                         <outlet property="buttonTranscribe" destination="Brs-xi-o8i" id="N8h-9W-ywb"/>
                         <outlet property="labelStatusInp" destination="Tgu-2q-eHQ" id="1hH-Ql-K6j"/>
index 3595518c2727e8d2c1ff70667e14b26a882258b6..e32a326e0b0305e26366c76a1a522807b55f552c 100644
--- a/examples/whisper.objc/whisper.objc/ViewController.h
+++ b/examples/whisper.objc/whisper.objc/ViewController.h
@@ -20,6 +20,8 @@ typedef struct
 {
     int ggwaveId;
     bool isCapturing;
+    bool isTranscribing;
+    bool isRealtime;
     UILabel * labelReceived;
 
     AudioQueueRef queue;
@@ -31,6 +33,8 @@ typedef struct
     float   * audioBufferF32;
 
     struct whisper_context * ctx;
+
+    void * vc;
 } StateInp;
 
 @interface ViewController : UIViewController
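The new void * vc member lets the plain C audio callback reach back into the Objective-C controller. A minimal sketch of that round-trip, with DemoController, DemoState, and notify_controller as illustrative stand-ins for the real ViewController, StateInp, and AudioInputCallback:

    #import <Foundation/Foundation.h>

    @interface DemoController : NSObject
    - (void)onTranscribe:(id)sender;
    @end

    @implementation DemoController
    - (void)onTranscribe:(id)sender { NSLog(@"transcribe requested"); }
    @end

    typedef struct {
        void * vc; // non-retaining back-pointer, set from Objective-C as (__bridge void *)self
    } DemoState;

    // callable from plain C code (e.g. an AudioQueue callback): recover the object
    // and hop to the main queue before touching it
    void notify_controller(DemoState * state) {
        dispatch_async(dispatch_get_main_queue(), ^{
            DemoController * vc = (__bridge DemoController *)(state->vc);
            [vc onTranscribe:nil];
        });
    }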
index 4804471e81a2753142d72bb2ae3ab9457d295589..d294178bcbb9a8f362659a3882c0668b49d6c964 100644
--- a/examples/whisper.objc/whisper.objc/ViewController.m
+++ b/examples/whisper.objc/whisper.objc/ViewController.m
@@ -24,6 +24,7 @@ void AudioInputCallback(void * inUserData,
 @property (weak, nonatomic) IBOutlet UILabel    *labelStatusInp;
 @property (weak, nonatomic) IBOutlet UIButton   *buttonToggleCapture;
 @property (weak, nonatomic) IBOutlet UIButton   *buttonTranscribe;
+@property (weak, nonatomic) IBOutlet UIButton   *buttonRealtime;
 @property (weak, nonatomic) IBOutlet UITextView *textviewResult;
 
 @end
@@ -77,6 +78,9 @@ void AudioInputCallback(void * inUserData,
         stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t));
         stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float));
     }
+
+    stateInp.isTranscribing = false;
+    stateInp.isRealtime = false;
 }
 
 -(IBAction) stopCapturing {
@@ -109,6 +113,7 @@ void AudioInputCallback(void * inUserData,
     NSLog(@"Start capturing");
 
     stateInp.n_samples = 0;
+    stateInp.vc = (__bridge void *)(self);
 
     OSStatus status = AudioQueueNewInput(&stateInp.dataFormat,
                                          AudioInputCallback,
@@ -141,67 +146,101 @@ void AudioInputCallback(void * inUserData,
 - (IBAction)onTranscribePrepare:(id)sender {
     _textviewResult.text = @"Processing - please wait ...";
 
+    if (stateInp.isRealtime) {
+        [self onRealtime:(id)sender];
+    }
+
     if (stateInp.isCapturing) {
-        // stop capturing
         [self stopCapturing];
+    }
+}
 
-        return;
+- (IBAction)onRealtime:(id)sender {
+    stateInp.isRealtime = !stateInp.isRealtime;
+
+    if (stateInp.isRealtime) {
+        [_buttonRealtime setBackgroundColor:[UIColor greenColor]];
+    } else {
+        [_buttonRealtime setBackgroundColor:[UIColor grayColor]];
     }
+
+    NSLog(@"Realtime: %@", stateInp.isRealtime ? @"ON" : @"OFF");
 }
 
 - (IBAction)onTranscribe:(id)sender {
+    if (stateInp.isTranscribing) {
+        return;
+    }
+
     NSLog(@"Processing %d samples", stateInp.n_samples);
 
-    // process captured audio
-    // convert I16 to F32
-    for (int i = 0; i < stateInp.n_samples; i++) {
-        stateInp.audioBufferF32[i] = (float)stateInp.audioBufferI16[i] / 32768.0f;
-    }
+    stateInp.isTranscribing = true;
+
+    // dispatch the model to a background thread
+    dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+        // process captured audio
+        // convert I16 to F32
+        for (int i = 0; i < self->stateInp.n_samples; i++) {
+            self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[i] / 32768.0f;
+        }
 
-    // run the model
-    struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+        // run the model
+        struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
 
-    params.print_realtime   = true;
-    params.print_progress   = false;
-    params.print_timestamps = true;
-    params.print_special    = false;
-    params.translate        = false;
-    params.language         = "en";
-    params.n_threads        = 4;
-    params.offset_ms        = 0;
+        // get maximum number of threads on this device (max 8)
+        const int max_threads = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]);
 
-    CFTimeInterval startTime = CACurrentMediaTime();
+        params.print_realtime   = true;
+        params.print_progress   = false;
+        params.print_timestamps = true;
+        params.print_special    = false;
+        params.translate        = false;
+        params.language         = "en";
+        params.n_threads        = max_threads;
+        params.offset_ms        = 0;
+        params.single_segment   = self->stateInp.isRealtime;
 
-    if (whisper_full(stateInp.ctx, params, stateInp.audioBufferF32, stateInp.n_samples) != 0) {
-        NSLog(@"Failed to run the model");
-        _textviewResult.text = @"Failed to run the model";
+        CFTimeInterval startTime = CACurrentMediaTime();
 
-        return;
-    }
+        whisper_reset_timings(self->stateInp.ctx);
 
-    CFTimeInterval endTime = CACurrentMediaTime();
+        if (whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, self->stateInp.n_samples) != 0) {
+            NSLog(@"Failed to run the model");
+            self->_textviewResult.text = @"Failed to run the model";
 
-    // clear the text in the textview
-    _textviewResult.text = @"";
+            return;
+        }
 
-    int n_segments = whisper_full_n_segments(stateInp.ctx);
-    for (int i = 0; i < n_segments; i++) {
-        const char * text_cur = whisper_full_get_segment_text(stateInp.ctx, i);
+        whisper_print_timings(self->stateInp.ctx);
 
-        // append the text to the textview
-        _textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
-    }
+        CFTimeInterval endTime = CACurrentMediaTime();
+
+        NSLog(@"\nProcessing time: %5.3f, on %d threads", endTime - startTime, params.n_threads);
 
-    // internal model timing
-    whisper_print_timings(stateInp.ctx);
+        // result text
+        NSString *result = @"";
 
-    NSLog(@"\nProcessing time: %5.3f", endTime - startTime);
+        int n_segments = whisper_full_n_segments(self->stateInp.ctx);
+        for (int i = 0; i < n_segments; i++) {
+            const char * text_cur = whisper_full_get_segment_text(self->stateInp.ctx, i);
 
-    _textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]];
+            // append the text to the result
+            result = [result stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
+        }
+
+        // append processing time
+        result = [result stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]];
+
+        // dispatch the result to the main thread
+        dispatch_async(dispatch_get_main_queue(), ^{
+            self->_textviewResult.text = result;
+            self->stateInp.isTranscribing = false;
+        });
+    });
 }
 
 //
-// Callback implmentation
+// Callback implementation
 //
 
 void AudioInputCallback(void * inUserData,
@@ -224,6 +263,12 @@ void AudioInputCallback(void * inUserData,
 
     if (stateInp->n_samples + n > MAX_AUDIO_SEC*SAMPLE_RATE) {
         NSLog(@"Too much audio data, ignoring");
+
+        dispatch_async(dispatch_get_main_queue(), ^{
+            ViewController * vc = (__bridge ViewController *)(stateInp->vc);
+            [vc stopCapturing];
+        });
+
         return;
     }
 
@@ -235,6 +280,14 @@ void AudioInputCallback(void * inUserData,
 
     // put the buffer back in the queue
     AudioQueueEnqueueBuffer(stateInp->queue, inBuffer, 0, NULL);
+
+    if (stateInp->isRealtime) {
+        // dispatch onTranscribe() to the main thread
+        dispatch_async(dispatch_get_main_queue(), ^{
+            ViewController * vc = (__bridge ViewController *)(stateInp->vc);
+            [vc onTranscribe:nil];
+        });
+    }
 }
 
 @end
index 9e27ab13e86072e24452a2f146ee64ef1494d0ce..2daf41165d704c91e6fdba7490cbfb34592d014e 100644
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -2386,6 +2386,21 @@ void whisper_reset_timings(struct whisper_context * ctx) {
     ctx->t_decode_us = 0;
 }
 
+const char * whisper_print_system_info(void) {
+    static std::string s;
+
+    s  = "";
+    s += "AVX = "       + std::to_string(ggml_cpu_has_avx())       + " | ";
+    s += "AVX2 = "      + std::to_string(ggml_cpu_has_avx2())      + " | ";
+    s += "AVX512 = "    + std::to_string(ggml_cpu_has_avx512())    + " | ";
+    s += "NEON = "      + std::to_string(ggml_cpu_has_neon())      + " | ";
+    s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
+    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
+    s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
+
+    return s.c_str();
+}
+
 ////////////////////////////////////////////////////////////////////////////
 
 struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
@@ -2863,7 +2878,7 @@ int whisper_full_parallel(
         struct whisper_full_params params,
         const float * samples,
         int n_samples,
-        const int n_processors) {
+        int n_processors) {
     if (n_processors == 1) {
         return whisper_full(ctx, params, samples, n_samples);
     }
@@ -3040,21 +3055,6 @@ float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int
     return ctx->result_all[i_segment].tokens[i_token].p;
 }
 
-const char * whisper_print_system_info(void) {
-    static std::string s;
-
-    s  = "";
-    s += "AVX = "       + std::to_string(ggml_cpu_has_avx())       + " | ";
-    s += "AVX2 = "      + std::to_string(ggml_cpu_has_avx2())      + " | ";
-    s += "AVX512 = "    + std::to_string(ggml_cpu_has_avx512())    + " | ";
-    s += "NEON = "      + std::to_string(ggml_cpu_has_neon())      + " | ";
-    s += "FP16_VA = "   + std::to_string(ggml_cpu_has_fp16_va())   + " | ";
-    s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
-    s += "BLAS = "      + std::to_string(ggml_cpu_has_blas())      + " | ";
-
-    return s.c_str();
-}
-
 // =================================================================================================
 
 //
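whisper_print_system_info() is only relocated here (moved above whisper_full_default_params()), not changed. For reference, a hypothetical call site simply logs the returned static string:

    #import <Foundation/Foundation.h>
    #include "whisper.h"

    int main(void) {
        // prints something like "AVX = 0 | ... | NEON = 1 | BLAS = 1 | " depending on the build
        NSLog(@"system info: %s", whisper_print_system_info());
        return 0;
    }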
index b0fb2d9ac28c920f97af8f200b3f44c4dbb2b4ed..4b5fbccd4e3c0e35b6ddddef3ad3ac5b9cee6d57 100644
--- a/whisper.h
+++ b/whisper.h
@@ -72,16 +72,16 @@ extern "C" {
         whisper_token id;  // token id
         whisper_token tid; // forced timestamp token id
 
-        float p;     // probability of the token
-        float pt;    // probability of the timestamp token
-        float ptsum; // sum of probabilities of all timestamp tokens
+        float p;           // probability of the token
+        float pt;          // probability of the timestamp token
+        float ptsum;       // sum of probabilities of all timestamp tokens
 
         // token-level timestamp data
         // do not use if you haven't computed token-level timestamps
-        int64_t t0; // start time of the token
-        int64_t t1; //   end time of the token
+        int64_t t0;        // start time of the token
+        int64_t t1;        //   end time of the token
 
-        float vlen; // voice length of the token
+        float vlen;        // voice length of the token
     } whisper_token_data;
 
     // Allocates all memory needed for the model and loads the model from the given file.
@@ -96,9 +96,9 @@ extern "C" {
     // Returns 0 on success
     WHISPER_API int whisper_pcm_to_mel(
             struct whisper_context * ctx,
-            const float * samples,
-            int n_samples,
-            int n_threads);
+                       const float * samples,
+                               int   n_samples,
+                               int   n_threads);
 
     // This can be used to set a custom log mel spectrogram inside the provided whisper context.
     // Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
@@ -106,9 +106,9 @@ extern "C" {
     // Returns 0 on success
     WHISPER_API int whisper_set_mel(
             struct whisper_context * ctx,
-            const float * data,
-            int n_len,
-            int n_mel);
+                       const float * data,
+                               int   n_len,
+                               int   n_mel);
 
     // Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
     // Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
@@ -116,8 +116,8 @@ extern "C" {
     // Returns 0 on success
     WHISPER_API int whisper_encode(
             struct whisper_context * ctx,
-            int offset,
-            int n_threads);
+                               int   offset,
+                               int   n_threads);
 
     // Run the Whisper decoder to obtain the logits and probabilities for the next token.
     // Make sure to call whisper_encode() first.
@@ -126,10 +126,10 @@ extern "C" {
     // Returns 0 on success
     WHISPER_API int whisper_decode(
             struct whisper_context * ctx,
-            const whisper_token * tokens,
-            int n_tokens,
-            int n_past,
-            int n_threads);
+               const whisper_token * tokens,
+                               int   n_tokens,
+                               int   n_past,
+                               int   n_threads);
 
     // Token sampling methods.
     // These are provided for convenience and can be used after each call to whisper_decode().
@@ -169,6 +169,9 @@ extern "C" {
     WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
     WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
 
+    // Print system information
+    WHISPER_API const char * whisper_print_system_info(void);
+
     ////////////////////////////////////////////////////////////////////////////
 
     // Available sampling strategies
@@ -187,12 +190,12 @@ extern "C" {
 
         int n_threads;
         int n_max_text_ctx;
-        int offset_ms;      // start offset in ms
-        int duration_ms;    // audio duration to process in ms
+        int offset_ms;          // start offset in ms
+        int duration_ms;        // audio duration to process in ms
 
         bool translate;
         bool no_context;
-        bool single_segment; // force single segment output (useful for streaming)
+        bool single_segment;    // force single segment output (useful for streaming)
         bool print_special;
         bool print_progress;
         bool print_realtime;
@@ -206,8 +209,8 @@ extern "C" {
         int   max_tokens;       // max tokens per segment (0 = no limit)
 
         // [EXPERIMENTAL] speed-up techniques
-        bool speed_up;  // speed-up the audio by 2x using Phase Vocoder
-        int  audio_ctx; // overwrite the audio context size (0 = use default)
+        bool speed_up;          // speed-up the audio by 2x using Phase Vocoder
+        int  audio_ctx;         // overwrite the audio context size (0 = use default)
 
         // tokens to provide the whisper model as initial prompt
         // these are prepended to any existing text context from a previous call
@@ -235,20 +238,20 @@ extern "C" {
     // Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
     // Uses the specified decoding strategy to obtain the text.
     WHISPER_API int whisper_full(
-            struct whisper_context * ctx,
-            struct whisper_full_params params,
-            const float * samples,
-            int n_samples);
+                struct whisper_context * ctx,
+            struct whisper_full_params   params,
+                           const float * samples,
+                                   int   n_samples);
 
     // Split the input audio in chunks and process each chunk separately using whisper_full()
     // It seems this approach can offer some speedup in some cases.
     // However, the transcription accuracy can be worse at the beginning and end of each chunk.
     WHISPER_API int whisper_full_parallel(
-            struct whisper_context * ctx,
-            struct whisper_full_params params,
-            const float * samples,
-            int n_samples,
-            const int n_processors);
+                struct whisper_context * ctx,
+            struct whisper_full_params   params,
+                           const float * samples,
+                                   int   n_samples,
+                                   int   n_processors);
 
     // Number of generated text segments.
     // A segment can be a few words, a sentence, or even a paragraph.
@@ -275,9 +278,6 @@ extern "C" {
     // Get the probability of the specified token in the specified segment.
     WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
 
-    // Print system information
-    WHISPER_API const char * whisper_print_system_info(void);
-
 #ifdef __cplusplus
 }
 #endif
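For completeness, the whisper_full_parallel() change above only drops the const qualifier on n_processors and realigns the parameter comments. A hypothetical caller splitting one clip across two processors might look like this (transcribe_clip is an illustrative name):

    #include "whisper.h"

    // returns 0 on success, mirroring whisper_full()
    int transcribe_clip(struct whisper_context * ctx, const float * pcm_f32, int n_samples) {
        struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
        params.language  = "en";
        params.n_threads = 4;

        // splits the audio into chunks and runs whisper_full() on each one;
        // accuracy can suffer near chunk boundaries, as the header comment notes
        return whisper_full_parallel(ctx, params, pcm_f32, n_samples, 2);
    }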