--- /dev/null
+#include "ggml.h"
+#include "whisper.h"
+
+#include <emscripten.h>
+#include <emscripten/bind.h>
+
+#include <algorithm>
+#include <atomic>
+#include <chrono>
+#include <cmath>
+#include <cstdio>
+#include <mutex>
+#include <string>
+#include <thread>
+#include <vector>
+
+constexpr int N_THREAD = 8;
+
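+// pool of whisper contexts; the JS side refers to a context by (slot index + 1)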
+std::vector<struct whisper_context *> g_contexts(4, nullptr);
+
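+// g_mutex guards the shared state below; g_worker runs stream_main()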
+std::mutex g_mutex;
+std::thread g_worker;
+
+std::atomic<bool> g_running(false);
+
+std::string g_status = "";
+std::string g_status_forced = "";
+std::string g_transcribed = "";
+
+std::vector<float> g_pcmf32;
+
+void stream_set_status(const std::string & status) {
+ std::lock_guard<std::mutex> lock(g_mutex);
+ g_status = status;
+}
+
+void stream_main(size_t index) {
+ stream_set_status("loading data ...");
+
+ struct whisper_full_params wparams = whisper_full_default_params(whisper_sampling_strategy::WHISPER_SAMPLING_GREEDY);
+
+ wparams.n_threads = std::min(N_THREAD, (int) std::thread::hardware_concurrency());
+ wparams.offset_ms = 0;
+ wparams.translate = false;
+ wparams.no_context = true;
+ wparams.single_segment = true;
+ wparams.print_realtime = false;
+ wparams.print_progress = false;
+ wparams.print_timestamps = true;
+ wparams.print_special = false;
+
+ wparams.max_tokens = 32;
+ wparams.audio_ctx = 768; // partial encoder context for better performance
+
+ wparams.language = "en";
+
+ printf("stream: using %d threads\n", N_THREAD);
+
+ std::vector<float> pcmf32;
+
+ // whisper context
+ auto & ctx = g_contexts[index];
+
+ // process at most the last 5 seconds of audio per iteration
+ const int64_t window_samples = 5*WHISPER_SAMPLE_RATE;
+
+ while (g_running) {
+ stream_set_status("waiting for audio ...");
+
+ {
+ std::unique_lock<std::mutex> lock(g_mutex);
+
+ if (g_pcmf32.size() < 1024) { // wait for at least ~64 ms of audio (1024 samples at 16 kHz)
+ lock.unlock();
+
+ std::this_thread::sleep_for(std::chrono::milliseconds(10));
+
+ continue;
+ }
+
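+ // keep only the most recent window_samples samples and consume the shared buffer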
+ pcmf32 = std::vector<float>(g_pcmf32.end() - std::min((int64_t) g_pcmf32.size(), window_samples), g_pcmf32.end());
+ g_pcmf32.clear();
+ }
+
+ {
+ const auto t_start = std::chrono::high_resolution_clock::now();
+
+ stream_set_status("running whisper ...");
+
+ int ret = whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size());
+ if (ret != 0) {
+ printf("whisper_full() failed: %d\n", ret);
+ break;
+ }
+
+ const auto t_end = std::chrono::high_resolution_clock::now();
+
+ printf("stream: whisper_full() returned %d in %f seconds\n", ret, std::chrono::duration<double>(t_end - t_start).count());
+ }
+
+ {
+ std::string text_heard;
+
+ {
+ const int n_segments = whisper_full_n_segments(ctx);
+
+ // with single_segment enabled, only the last segment is of interest;
+ // starting at max(0, n_segments - 1) guards against indexing segment -1 when there are no segments
+ for (int i = std::max(0, n_segments - 1); i < n_segments; ++i) {
+ const char * text = whisper_full_get_segment_text(ctx, i);
+
+ printf("transcribed: %s\n", text);
+
+ text_heard += text;
+ }
+ }
+
+ {
+ std::lock_guard<std::mutex> lock(g_mutex);
+ g_transcribed = text_heard;
+ }
+ }
+ }
+
+ if (index < g_contexts.size()) {
+ whisper_free(g_contexts[index]);
+ g_contexts[index] = nullptr;
+ }
+}
+
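+// JS-facing API, exposed via embind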
+EMSCRIPTEN_BINDINGS(stream) {
+ emscripten::function("init", emscripten::optional_override([](const std::string & path_model) {
+ for (size_t i = 0; i < g_contexts.size(); ++i) {
+ if (g_contexts[i] == nullptr) {
+ g_contexts[i] = whisper_init(path_model.c_str());
+ if (g_contexts[i] != nullptr) {
+ g_running = true;
+ if (g_worker.joinable()) {
+ g_worker.join();
+ }
+ g_worker = std::thread([i]() {
+ stream_main(i);
+ });
+
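+ // return a 1-based handle so that 0 can signal failure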
+ return i + 1;
+ } else {
+ return (size_t) 0;
+ }
+ }
+ }
+
+ return (size_t) 0;
+ }));
+
+ emscripten::function("free", emscripten::optional_override([](size_t index) {
+ if (g_running) {
+ g_running = false;
+ }
+ }));
+
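+ // `index` is the 1-based handle returned by init()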
+ emscripten::function("set_audio", emscripten::optional_override([](size_t index, const emscripten::val & audio) {
+ --index;
+
+ if (index >= g_contexts.size()) {
+ return -1;
+ }
+
+ if (g_contexts[index] == nullptr) {
+ return -2;
+ }
+
+ {
+ std::lock_guard<std::mutex> lock(g_mutex);
+ const int n = audio["length"].as<int>();
+
+ emscripten::val heap = emscripten::val::module_property("HEAPU8");
+ emscripten::val memory = heap["buffer"];
+
+ g_pcmf32.resize(n);
+
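+ // construct a typed-array view (same type as `audio`) over the module heap,
+ // starting at the address of g_pcmf32's storage, and copy the samples into
+ // it with a single set() call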
+ emscripten::val memoryView = audio["constructor"].new_(memory, reinterpret_cast<uintptr_t>(g_pcmf32.data()), n);
+ memoryView.call<void>("set", audio);
+ }
+
+ return 0;
+ }));
+
+ emscripten::function("get_transcribed", emscripten::optional_override([]() {
+ std::string transcribed;
+
+ {
+ std::lock_guard<std::mutex> lock(g_mutex);
+ transcribed = std::move(g_transcribed);
+ }
+
+ return transcribed;
+ }));
+
+ emscripten::function("get_status", emscripten::optional_override([]() {
+ std::string status;
+
+ {
+ std::lock_guard<std::mutex> lock(g_mutex);
+ status = g_status_forced.empty() ? g_status : g_status_forced;
+ }
+
+ return status;
+ }));
+
+ emscripten::function("set_status", emscripten::optional_override([](const std::string & status) {
+ {
+ std::lock_guard<std::mutex> lock(g_mutex);
+ g_status_forced = status;
+ }
+ }));
+}
--- /dev/null
+<!doctype html>
+<html lang="en-us">
+ <head>
+ <title>stream : Real-time Whisper transcription in WebAssembly</title>
+
+ <style>
+ #output {
+ width: 100%;
+ height: 100%;
+ margin: 0 auto;
+ margin-top: 10px;
+ border-left: 0px;
+ border-right: 0px;
+ padding-left: 0px;
+ padding-right: 0px;
+ display: block;
+ background-color: black;
+ color: white;
+ font-size: 10px;
+ font-family: 'Lucida Console', Monaco, monospace;
+ outline: none;
+ white-space: pre;
+ overflow-wrap: normal;
+ overflow-x: scroll;
+ }
+ </style>
+ </head>
+ <body>
+ <div id="main-container">
+ <b>stream : Real-time Whisper transcription in WebAssembly</b>
+
+ <br><br>
+
+ You can find more information about this project on <a href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/stream.wasm">GitHub</a>.
+
+ <br><br>
+
+ <hr>
+
+ Select the model you would like to use, click the "Start" button and start speaking.
+
+ <br><br>
+
+ <div id="model-whisper">
+ Whisper model: <span id="model-whisper-status"></span>
+ <button id="fetch-whisper-tiny-en" onclick="loadWhisper('tiny.en')">tiny.en (75 MB)</button>
+ <button id="fetch-whisper-base-en" onclick="loadWhisper('base.en')">base.en (142 MB)</button>
+ <span id="fetch-whisper-progress"></span>
+
+ <!--
+ <input type="file" id="file" name="file" onchange="loadFile(event, 'whisper.bin')" />
+ -->
+ </div>
+
+ <br>
+
+ <div id="input">
+ <button id="start" onclick="onStart()" disabled>Start</button>
+ <button id="stop" onclick="onStop()" disabled>Stop</button>
+ <button id="clear" onclick="clearCache()">Clear Cache</button>
+ </div>
+
+ <br>
+
+ <div id="state">
+ Status: <b><span id="state-status">not started</span></b>
+
+ <pre id="state-transcribed">[The transcribed text will be displayed here]</pre>
+ </div>
+
+ <hr>
+
+ Debug output:
+ <textarea id="output" rows="20"></textarea>
+
+ <br>
+
+ <b>Troubleshooting</b>
+
+ <br><br>
+
+ The page does some heavy computations, so make sure to:
+
+ <ul>
+ <li>Use a modern web browser (e.g. Chrome, Firefox)</li>
+ <li>Use a fast desktop or laptop computer (i.e. not a mobile phone)</li>
+ <li>Check that your browser supports WASM <a href="https://webassembly.org/roadmap/">Fixed-width SIMD</a></li>
+ </ul>
+
+ <div class="cell-version">
+ <span>
+ |
+ Build time: <span class="nav-link">@GIT_DATE@</span> |
+ Commit hash: <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/commit/@GIT_SHA1@">@GIT_SHA1@</a> |
+ Commit subject: <span class="nav-link">@GIT_COMMIT_SUBJECT@</span> |
+ <a class="nav-link" href="https://github.com/ggerganov/whisper.cpp/tree/master/examples/stream.wasm">Source Code</a> |
+ </span>
+ </div>
+ </div>
+
+ <script type="text/javascript" src="helpers.js"></script>
+ <script type="text/javascript">
+ const kRestartRecording_s = 15; // restart the recorder after this much audio so the re-decoded blob stays small
+ const kSampleRate = 16000; // whisper.cpp expects 16 kHz mono samples
+
+ window.AudioContext = window.AudioContext || window.webkitAudioContext;
+ window.OfflineAudioContext = window.OfflineAudioContext || window.webkitOfflineAudioContext;
+
+ // web audio context
+ var context = null;
+
+ // audio data
+ var audio = null;
+ var audio0 = null;
+
+ // the stream instance
+ var instance = null;
+
+ // model name
+ var model_whisper = null;
+
+ var Module = {
+ print: printTextarea,
+ printErr: printTextarea,
+ setStatus: function(text) {
+ printTextarea('js: ' + text);
+ },
+ monitorRunDependencies: function(left) {
+ },
+ preRun: function() {
+ printTextarea('js: Preparing ...');
+ },
+ postRun: function() {
+ printTextarea('js: Initialized successfully!');
+ }
+ };
+
+ //
+ // fetch models
+ //
+
+ let dbVersion = 1;
+ let dbName = 'whisper.ggerganov.com';
+ let indexedDB = window.indexedDB || window.mozIndexedDB || window.webkitIndexedDB || window.msIndexedDB;
+
+ function storeFS(fname, buf) {
+ // write to WASM file using FS_createDataFile
+ // if the file exists, delete it
+ try {
+ Module.FS_unlink(fname);
+ } catch (e) {
+ // ignore
+ }
+
+ Module.FS_createDataFile("/", fname, buf, true, true);
+
+ printTextarea('storeFS: stored model: ' + fname + ' size: ' + buf.length);
+
+ document.getElementById('model-whisper-status').innerHTML = 'loaded "' + model_whisper + '"!';
+
+ if (model_whisper != null) {
+ document.getElementById('start').disabled = false;
+ document.getElementById('stop' ).disabled = true;
+ }
+ }
+
+ function loadWhisper(model) {
+ let urls = {
+ 'tiny.en': 'https://whisper.ggerganov.com/ggml-model-whisper-tiny.en.bin',
+ 'base.en': 'https://whisper.ggerganov.com/ggml-model-whisper-base.en.bin',
+ };
+
+ let sizes = {
+ 'tiny.en': 75,
+ 'base.en': 142,
+ };
+
+ let url = urls[model];
+ let dst = 'whisper.bin';
+ let size_mb = sizes[model];
+
+ model_whisper = model;
+
+ document.getElementById('fetch-whisper-tiny-en').style.display = 'none';
+ document.getElementById('fetch-whisper-base-en').style.display = 'none';
+ document.getElementById('model-whisper-status').innerHTML = 'loading "' + model + '" ... ';
+
+ let cbProgress = function(p) {
+ let el = document.getElementById('fetch-whisper-progress');
+ el.innerHTML = Math.round(100*p) + '%';
+ };
+
+ let cbCancel = function() {
+ var el;
+ el = document.getElementById('fetch-whisper-tiny-en'); if (el) el.style.display = 'inline-block';
+ el = document.getElementById('fetch-whisper-base-en'); if (el) el.style.display = 'inline-block';
+ el = document.getElementById('model-whisper-status'); if (el) el.innerHTML = '';
+ };
+
+ loadRemote(url, dst, size_mb, cbProgress, storeFS, cbCancel, printTextarea);
+ }
+
+ //
+ // microphone
+ //
+
+ var mediaRecorder = null;
+ var doRecording = false;
+ var startTime = 0;
+
+ function stopRecording() {
+ Module.set_status("paused");
+ doRecording = false;
+ audio0 = null;
+ audio = null;
+ context = null;
+ }
+
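+ // record via MediaRecorder; an interval handler below restarts the session
+ // once more than kRestartRecording_s seconds have accumulated, carrying the
+ // audio recorded so far over in audio0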
+ function startRecording() {
+ if (!context) {
+ // sampleRate is the only relevant AudioContext option: decodeAudioData()
+ // resamples to the context's rate, giving us 16 kHz PCM; the microphone
+ // constraints belong to getUserMedia() and are passed there below
+ context = new AudioContext({ sampleRate: kSampleRate });
+ }
+
+ Module.set_status("");
+
+ document.getElementById('start').disabled = true;
+ document.getElementById('stop').disabled = false;
+
+ doRecording = true;
+ startTime = Date.now();
+
+ var chunks = [];
+ var stream = null;
+
+ navigator.mediaDevices.getUserMedia({audio: { channelCount: 1, echoCancellation: false, autoGainControl: true, noiseSuppression: true }, video: false})
+ .then(function(s) {
+ stream = s;
+ mediaRecorder = new MediaRecorder(stream);
+ mediaRecorder.ondataavailable = function(e) {
+ chunks.push(e.data);
+
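+ // decode the entire recording so far: MediaRecorder slices are not
+ // self-contained files, so the chunks are always concatenated before decoding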
+ var blob = new Blob(chunks, { 'type' : 'audio/ogg; codecs=opus' });
+ var reader = new FileReader();
+
+ reader.onload = function(event) {
+ var buf = new Uint8Array(reader.result);
+
+ if (!context) {
+ return;
+ }
+ context.decodeAudioData(buf.buffer, function(audioBuffer) {
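+ // render through an OfflineAudioContext to obtain the raw Float32 PCM samples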
+ var offlineContext = new OfflineAudioContext(audioBuffer.numberOfChannels, audioBuffer.length, audioBuffer.sampleRate);
+ var source = offlineContext.createBufferSource();
+ source.buffer = audioBuffer;
+ source.connect(offlineContext.destination);
+ source.start(0);
+
+ offlineContext.startRendering().then(function(renderedBuffer) {
+ audio = renderedBuffer.getChannelData(0);
+
+ //printTextarea('js: audio recorded, size: ' + audio.length + ', old size: ' + (audio0 == null ? 0 : audio0.length));
+
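+ // prepend the samples carried over from the previous recorder session, if any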
+ var audioAll = new Float32Array(audio0 == null ? audio.length : audio0.length + audio.length);
+ if (audio0 != null) {
+ audioAll.set(audio0, 0);
+ }
+ audioAll.set(audio, audio0 == null ? 0 : audio0.length);
+
+ if (instance) {
+ Module.set_audio(instance, audioAll);
+ }
+ });
+ }, function(e) {
+ printTextarea('js: failed to decode audio: ' + e);
+ audio = null;
+ });
+ }
+
+ reader.readAsArrayBuffer(blob);
+ };
+
+ mediaRecorder.onstop = function(e) {
+ if (doRecording) {
+ setTimeout(function() {
+ startRecording();
+ });
+ }
+ };
+
+ mediaRecorder.start(5000); // deliver a dataavailable event every 5 seconds
+ })
+ .catch(function(err) {
+ printTextarea('js: error getting audio stream: ' + err);
+ });
+
+ var interval = setInterval(function() {
+ if (!doRecording) {
+ clearInterval(interval);
+
+ // mediaRecorder is still null if getUserMedia() failed
+ if (mediaRecorder) {
+ mediaRecorder.stop();
+ stream.getTracks().forEach(function(track) {
+ track.stop();
+ });
+ }
+
+ document.getElementById('start').disabled = false;
+ document.getElementById('stop').disabled = true;
+
+ mediaRecorder = null;
+ }
+
+ // if audio length is more than kRestartRecording_s seconds, restart recording
+ if (audio != null && audio.length > kSampleRate*kRestartRecording_s) {
+ if (doRecording) {
+ //printTextarea('js: restarting recording');
+
+ clearInterval(interval);
+ audio0 = audio;
+ audio = null;
+ mediaRecorder.stop();
+ stream.getTracks().forEach(function(track) {
+ track.stop();
+ });
+ }
+ }
+ }, 250);
+ }
+
+ //
+ // main
+ //
+
+ var nLines = 0;
+ var intervalUpdate = null;
+ var transcribedAll = '';
+
+ function onStart() {
+ if (!instance) {
+ instance = Module.init('whisper.bin');
+
+ if (instance) {
+ printTextarea("js: whisper initialized, instance: " + instance);
+ }
+ }
+
+ if (!instance) {
+ printTextarea("js: failed to initialize whisper");
+ return;
+ }
+
+ startRecording();
+
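+ // poll the module every 100 ms for new transcription and status updates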
+ intervalUpdate = setInterval(function() {
+ var transcribed = Module.get_transcribed();
+
+ if (transcribed != null && transcribed.length > 1) {
+ transcribedAll += transcribed + '<br>';
+ nLines++;
+
+ // if more than 10 lines, remove the first line
+ if (nLines > 10) {
+ var i = transcribedAll.indexOf('<br>');
+ if (i > 0) {
+ transcribedAll = transcribedAll.substring(i + 4);
+ nLines--;
+ }
+ }
+ }
+
+ document.getElementById('state-status').innerHTML = Module.get_status();
+ document.getElementById('state-transcribed').innerHTML = transcribedAll;
+ }, 100);
+ }
+
+ function onStop() {
+ stopRecording();
+ }
+
+ </script>
+ <script type="text/javascript" src="stream.js"></script>
+ </body>
+</html>