<?xml version="1.0" encoding="UTF-8"?>
-<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21225" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
+<document type="com.apple.InterfaceBuilder3.CocoaTouch.Storyboard.XIB" version="3.0" toolsVersion="21507" targetRuntime="iOS.CocoaTouch" propertyAccessControl="none" useAutolayout="YES" useTraitCollections="YES" useSafeAreas="YES" colorMatched="YES" initialViewController="BYZ-38-t0r">
<device id="retina6_0" orientation="portrait" appearance="light"/>
<dependencies>
- <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21207"/>
+ <plugIn identifier="com.apple.InterfaceBuilder.IBCocoaTouchPlugin" version="21505"/>
<capability name="Safe area layout guides" minToolsVersion="9.0"/>
<capability name="System colors in document resources" minToolsVersion="11.0"/>
<capability name="documents saved in the Xcode 8 format" minToolsVersion="8.0"/>
<autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
<color key="backgroundColor" systemColor="systemBackgroundColor"/>
<color key="textColor" systemColor="labelColor"/>
- <fontDescription key="fontDescription" type="system" pointSize="20"/>
+ <fontDescription key="fontDescription" name="Georgia" family="Georgia" pointSize="16"/>
<textInputTraits key="textInputTraits" autocapitalizationType="sentences"/>
</textView>
<button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="Brs-xi-o8i">
<action selector="onTranscribePrepare:" destination="BYZ-38-t0r" eventType="touchDown" id="16T-dN-dfB"/>
</connections>
</button>
+ <button opaque="NO" contentMode="scaleToFill" contentHorizontalAlignment="center" contentVerticalAlignment="center" lineBreakMode="middleTruncation" id="AaW-T2-Ndw">
+ <rect key="frame" x="199" y="191" width="156" height="49"/>
+ <autoresizingMask key="autoresizingMask" flexibleMaxX="YES" flexibleMaxY="YES"/>
+ <color key="backgroundColor" systemColor="opaqueSeparatorColor"/>
+ <color key="tintColor" systemColor="opaqueSeparatorColor"/>
+ <state key="normal" title="Real-time">
+ <color key="titleColor" systemColor="labelColor"/>
+ </state>
+ <connections>
+ <action selector="onRealtime:" destination="BYZ-38-t0r" eventType="touchUpInside" id="nhn-jT-aQJ"/>
+ </connections>
+ </button>
</subviews>
<viewLayoutGuide key="safeArea" id="6Tk-OE-BBY"/>
<color key="backgroundColor" systemColor="systemBackgroundColor"/>
</constraints>
</view>
<connections>
+ <outlet property="buttonRealtime" destination="AaW-T2-Ndw" id="gcU-Ol-BOo"/>
<outlet property="buttonToggleCapture" destination="VOi-PT-Rbu" id="nis-VC-DQO"/>
<outlet property="buttonTranscribe" destination="Brs-xi-o8i" id="N8h-9W-ywb"/>
<outlet property="labelStatusInp" destination="Tgu-2q-eHQ" id="1hH-Ql-K6j"/>
{
int ggwaveId;
bool isCapturing;
+ bool isTranscribing;
+ bool isRealtime;
UILabel * labelReceived;
AudioQueueRef queue;
float * audioBufferF32;
struct whisper_context * ctx;
+
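+ // unretained reference to the ViewController, used by the C audio callback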
+ void * vc;
} StateInp;
@interface ViewController : UIViewController
@property (weak, nonatomic) IBOutlet UILabel *labelStatusInp;
@property (weak, nonatomic) IBOutlet UIButton *buttonToggleCapture;
@property (weak, nonatomic) IBOutlet UIButton *buttonTranscribe;
+@property (weak, nonatomic) IBOutlet UIButton *buttonRealtime;
@property (weak, nonatomic) IBOutlet UITextView *textviewResult;
@end
stateInp.audioBufferI16 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(int16_t));
stateInp.audioBufferF32 = malloc(MAX_AUDIO_SEC*SAMPLE_RATE*sizeof(float));
}
+
+ stateInp.isTranscribing = false;
+ stateInp.isRealtime = false;
}
-(IBAction) stopCapturing {
NSLog(@"Start capturing");
stateInp.n_samples = 0;
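+ // store an unretained pointer to self so the audio callback can call back into the controller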
+ stateInp.vc = (__bridge void *)(self);
OSStatus status = AudioQueueNewInput(&stateInp.dataFormat,
AudioInputCallback,
- (IBAction)onTranscribePrepare:(id)sender {
_textviewResult.text = @"Processing - please wait ...";
+ if (stateInp.isRealtime) {
+ [self onRealtime:(id)sender];
+ }
+
if (stateInp.isCapturing) {
- // stop capturing
[self stopCapturing];
+ }
+}
- return;
+- (IBAction)onRealtime:(id)sender {
+ stateInp.isRealtime = !stateInp.isRealtime;
+
+ if (stateInp.isRealtime) {
+ [_buttonRealtime setBackgroundColor:[UIColor greenColor]];
+ } else {
+ [_buttonRealtime setBackgroundColor:[UIColor grayColor]];
}
+
+ NSLog(@"Realtime: %@", stateInp.isRealtime ? @"ON" : @"OFF");
}
- (IBAction)onTranscribe:(id)sender {
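+ // ignore the request if a transcription is already in progress (e.g. queued real-time callbacks)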
+ if (stateInp.isTranscribing) {
+ return;
+ }
+
NSLog(@"Processing %d samples", stateInp.n_samples);
- // process captured audio
- // convert I16 to F32
- for (int i = 0; i < stateInp.n_samples; i++) {
- stateInp.audioBufferF32[i] = (float)stateInp.audioBufferI16[i] / 32768.0f;
- }
+ stateInp.isTranscribing = true;
+
+ // dispatch the model to a background thread
+ dispatch_async(dispatch_get_global_queue(DISPATCH_QUEUE_PRIORITY_DEFAULT, 0), ^{
+ // process captured audio
+ // convert I16 to F32
+ for (int i = 0; i < self->stateInp.n_samples; i++) {
+ self->stateInp.audioBufferF32[i] = (float)self->stateInp.audioBufferI16[i] / 32768.0f;
+ }
- // run the model
- struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
+ // run the model
+ struct whisper_full_params params = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
- params.print_realtime = true;
- params.print_progress = false;
- params.print_timestamps = true;
- params.print_special = false;
- params.translate = false;
- params.language = "en";
- params.n_threads = 4;
- params.offset_ms = 0;
+ // get maximum number of threads on this device (max 8)
+ const int max_threads = MIN(8, (int)[[NSProcessInfo processInfo] processorCount]);
- CFTimeInterval startTime = CACurrentMediaTime();
+ params.print_realtime = true;
+ params.print_progress = false;
+ params.print_timestamps = true;
+ params.print_special = false;
+ params.translate = false;
+ params.language = "en";
+ params.n_threads = max_threads;
+ params.offset_ms = 0;
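+ // real-time mode: force single-segment output (useful for streaming, see single_segment in whisper.h)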
+ params.single_segment = self->stateInp.isRealtime;
- if (whisper_full(stateInp.ctx, params, stateInp.audioBufferF32, stateInp.n_samples) != 0) {
- NSLog(@"Failed to run the model");
- _textviewResult.text = @"Failed to run the model";
+ CFTimeInterval startTime = CACurrentMediaTime();
- return;
- }
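+ // reset the accumulated timing counters so whisper_print_timings reports only this run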
+ whisper_reset_timings(self->stateInp.ctx);
- CFTimeInterval endTime = CACurrentMediaTime();
+ if (whisper_full(self->stateInp.ctx, params, self->stateInp.audioBufferF32, self->stateInp.n_samples) != 0) {
+ NSLog(@"Failed to run the model");
+ // report the failure on the main thread - UIKit must not be touched from a background queue
+ dispatch_async(dispatch_get_main_queue(), ^{
+ self->_textviewResult.text = @"Failed to run the model";
+ self->stateInp.isTranscribing = false;
+ });
- // clear the text in the textview
- _textviewResult.text = @"";
+ return;
+ }
- int n_segments = whisper_full_n_segments(stateInp.ctx);
- for (int i = 0; i < n_segments; i++) {
- const char * text_cur = whisper_full_get_segment_text(stateInp.ctx, i);
+ whisper_print_timings(self->stateInp.ctx);
- // append the text to the textview
- _textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
- }
+ CFTimeInterval endTime = CACurrentMediaTime();
+
+ NSLog(@"\nProcessing time: %5.3f, on %d threads", endTime - startTime, params.n_threads);
- // internal model timing
- whisper_print_timings(stateInp.ctx);
+ // result text
+ NSString *result = @"";
- NSLog(@"\nProcessing time: %5.3f", endTime - startTime);
+ int n_segments = whisper_full_n_segments(self->stateInp.ctx);
+ for (int i = 0; i < n_segments; i++) {
+ const char * text_cur = whisper_full_get_segment_text(self->stateInp.ctx, i);
- _textviewResult.text = [_textviewResult.text stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]];
+ // append the text to the result
+ result = [result stringByAppendingString:[NSString stringWithUTF8String:text_cur]];
+ }
+
+ // append processing time
+ result = [result stringByAppendingString:[NSString stringWithFormat:@"\n\n[processing time: %5.3f s]", endTime - startTime]];
+
+ // dispatch the result to the main thread
+ dispatch_async(dispatch_get_main_queue(), ^{
+ self->_textviewResult.text = result;
+ self->stateInp.isTranscribing = false;
+ });
+ });
}
//
-// Callback implmentation
+// Callback implementation
//
void AudioInputCallback(void * inUserData,
if (stateInp->n_samples + n > MAX_AUDIO_SEC*SAMPLE_RATE) {
NSLog(@"Too much audio data, ignoring");
+
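+ // stop capturing via the main thread, since stopCapturing is a UI action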
+ dispatch_async(dispatch_get_main_queue(), ^{
+ ViewController * vc = (__bridge ViewController *)(stateInp->vc);
+ [vc stopCapturing];
+ });
+
return;
}
// put the buffer back in the queue
AudioQueueEnqueueBuffer(stateInp->queue, inBuffer, 0, NULL);
+
+ if (stateInp->isRealtime) {
+ // dispatch onTranscribe() to the main thread
+ dispatch_async(dispatch_get_main_queue(), ^{
+ ViewController * vc = (__bridge ViewController *)(stateInp->vc);
+ [vc onTranscribe:nil];
+ });
+ }
}
@end
ctx->t_decode_us = 0;
}
+const char * whisper_print_system_info(void) {
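+ // keep the string static so the returned C pointer remains valid after the call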
+ static std::string s;
+
+ s = "";
+ s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
+ s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
+ s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
+ s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
+ s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
+ s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
+ s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
+
+ return s.c_str();
+}
+
////////////////////////////////////////////////////////////////////////////
struct whisper_full_params whisper_full_default_params(enum whisper_sampling_strategy strategy) {
struct whisper_full_params params,
const float * samples,
int n_samples,
- const int n_processors) {
+ int n_processors) {
if (n_processors == 1) {
return whisper_full(ctx, params, samples, n_samples);
}
return ctx->result_all[i_segment].tokens[i_token].p;
}
-const char * whisper_print_system_info(void) {
- static std::string s;
-
- s = "";
- s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
- s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
- s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
- s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
- s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
- s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
- s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
-
- return s.c_str();
-}
-
// =================================================================================================
//
whisper_token id; // token id
whisper_token tid; // forced timestamp token id
- float p; // probability of the token
- float pt; // probability of the timestamp token
- float ptsum; // sum of probabilities of all timestamp tokens
+ float p; // probability of the token
+ float pt; // probability of the timestamp token
+ float ptsum; // sum of probabilities of all timestamp tokens
// token-level timestamp data
// do not use if you haven't computed token-level timestamps
- int64_t t0; // start time of the token
- int64_t t1; // end time of the token
+ int64_t t0; // start time of the token
+ int64_t t1; // end time of the token
- float vlen; // voice length of the token
+ float vlen; // voice length of the token
} whisper_token_data;
// Allocates all memory needed for the model and loads the model from the given file.
// Returns 0 on success
WHISPER_API int whisper_pcm_to_mel(
struct whisper_context * ctx,
- const float * samples,
- int n_samples,
- int n_threads);
+ const float * samples,
+ int n_samples,
+ int n_threads);
// This can be used to set a custom log mel spectrogram inside the provided whisper context.
// Use this instead of whisper_pcm_to_mel() if you want to provide your own log mel spectrogram.
// Returns 0 on success
WHISPER_API int whisper_set_mel(
struct whisper_context * ctx,
- const float * data,
- int n_len,
- int n_mel);
+ const float * data,
+ int n_len,
+ int n_mel);
// Run the Whisper encoder on the log mel spectrogram stored inside the provided whisper context.
// Make sure to call whisper_pcm_to_mel() or whisper_set_mel() first.
// Returns 0 on success
WHISPER_API int whisper_encode(
struct whisper_context * ctx,
- int offset,
- int n_threads);
+ int offset,
+ int n_threads);
// Run the Whisper decoder to obtain the logits and probabilities for the next token.
// Make sure to call whisper_encode() first.
// Returns 0 on success
WHISPER_API int whisper_decode(
struct whisper_context * ctx,
- const whisper_token * tokens,
- int n_tokens,
- int n_past,
- int n_threads);
+ const whisper_token * tokens,
+ int n_tokens,
+ int n_past,
+ int n_threads);
// Token sampling methods.
// These are provided for convenience and can be used after each call to whisper_decode().
WHISPER_API void whisper_print_timings(struct whisper_context * ctx);
WHISPER_API void whisper_reset_timings(struct whisper_context * ctx);
+ // Print system information
+ WHISPER_API const char * whisper_print_system_info(void);
+
////////////////////////////////////////////////////////////////////////////
// Available sampling strategies
int n_threads;
int n_max_text_ctx;
- int offset_ms; // start offset in ms
- int duration_ms; // audio duration to process in ms
+ int offset_ms; // start offset in ms
+ int duration_ms; // audio duration to process in ms
bool translate;
bool no_context;
- bool single_segment; // force single segment output (useful for streaming)
+ bool single_segment; // force single segment output (useful for streaming)
bool print_special;
bool print_progress;
bool print_realtime;
int max_tokens; // max tokens per segment (0 = no limit)
// [EXPERIMENTAL] speed-up techniques
- bool speed_up; // speed-up the audio by 2x using Phase Vocoder
- int audio_ctx; // overwrite the audio context size (0 = use default)
+ bool speed_up; // speed-up the audio by 2x using Phase Vocoder
+ int audio_ctx; // overwrite the audio context size (0 = use default)
// tokens to provide the whisper model as initial prompt
// these are prepended to any existing text context from a previous call
// Run the entire model: PCM -> log mel spectrogram -> encoder -> decoder -> text
// Uses the specified decoding strategy to obtain the text.
WHISPER_API int whisper_full(
- struct whisper_context * ctx,
- struct whisper_full_params params,
- const float * samples,
- int n_samples);
+ struct whisper_context * ctx,
+ struct whisper_full_params params,
+ const float * samples,
+ int n_samples);
// Split the input audio in chunks and process each chunk separately using whisper_full()
// It seems this approach can offer some speedup in some cases.
// However, the transcription accuracy can be worse at the beginning and end of each chunk.
WHISPER_API int whisper_full_parallel(
- struct whisper_context * ctx,
- struct whisper_full_params params,
- const float * samples,
- int n_samples,
- const int n_processors);
+ struct whisper_context * ctx,
+ struct whisper_full_params params,
+ const float * samples,
+ int n_samples,
+ int n_processors);
// Number of generated text segments.
// A segment can be a few words, a sentence, or even a paragraph.
// Get the probability of the specified token in the specified segment.
WHISPER_API float whisper_full_get_token_p(struct whisper_context * ctx, int i_segment, int i_token);
- // Print system information
- WHISPER_API const char * whisper_print_system_info(void);
-
#ifdef __cplusplus
}
#endif