#define _USE_MATH_DEFINES
#include <cmath>
#include <cstdio>
+#include <cstdarg>
#include <cstring>
#include <fstream>
#include <map>
} while (0)
#define BYTESWAP_TENSOR(t) \
do { \
- byteswap_tensor(tensor); \
+ byteswap_tensor(t); \
} while (0)
#else
#define BYTESWAP_VALUE(d) do {} while (0)
#define WHISPER_ASSERT(x) \
do { \
if (!(x)) { \
- fprintf(stderr, "WHISPER_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
+ log("WHISPER_ASSERT: %s:%d: %s\n", __FILE__, __LINE__, #x); \
abort(); \
} \
} while (0)
struct whisper_sequence {
std::vector<whisper_token_data> tokens;
- // the accumulated transcription in the current interation (used to truncate the tokens array)
+ // the accumulated transcription in the current iteration (used to truncate the tokens array)
int result_len;
double sum_logprobs_all; // the sum of the log probabilities of the tokens
std::string path_model; // populated by whisper_init_from_file()
};
+static void whisper_default_log(const char * text) {
+ fprintf(stderr, "%s", text);
+}
+
+static whisper_log_callback whisper_log = whisper_default_log;
+
+// TODO: fix compile warning about "format string is not a string literal"
+static void log(const char * fmt, ...) {
+ if (!whisper_log) return;
+ char buf[1024];
+ va_list args;
+ va_start(args, fmt);
+ vsnprintf(buf, sizeof(buf), fmt, args);
+ va_end(args);
+ whisper_log(buf);
+}
+
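+// Example (illustrative sketch): an application can redirect all library
+// output by installing its own callback via whisper_set_log_callback(),
+// defined at the end of this file. The callback receives the fully formatted
+// message; anything beyond the 1024-byte buffer above is truncated. A
+// hypothetical file-based sink could look like:
+//
+//   static void my_file_log(const char * text) {
+//       fprintf(my_log_file, "%s", text); // my_log_file: an app-provided FILE*
+//   }
+//   whisper_set_log_callback(my_file_log);
+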
template<typename T>
static void read_safe(whisper_model_loader * loader, T & dest) {
loader->read(loader->context, &dest, sizeof(T));
cache.ctx = ggml_init(params);
if (!cache.ctx) {
- fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
+ log("%s: failed to allocate memory for kv cache\n", __func__);
return false;
}
cache.ctx = ggml_init(params);
if (!cache.ctx) {
- fprintf(stderr, "%s: failed to allocate memory for kv cache\n", __func__);
+ log("%s: failed to allocate memory for kv cache\n", __func__);
return false;
}
// see the convert-pt-to-ggml.py script for details
//
static bool whisper_model_load(struct whisper_model_loader * loader, whisper_context & wctx) {
- fprintf(stderr, "%s: loading model\n", __func__);
+ log("%s: loading model\n", __func__);
const int64_t t_start_us = ggml_time_us();
uint32_t magic;
read_safe(loader, magic);
if (magic != GGML_FILE_MAGIC) {
- fprintf(stderr, "%s: invalid model data (bad magic)\n", __func__);
+ log("%s: invalid model data (bad magic)\n", __func__);
return false;
}
}
// in order to save memory and also to speed up the computation
wctx.wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
if (wctx.wtype == GGML_TYPE_COUNT) {
- fprintf(stderr, "%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
+ log("%s: invalid model (bad ftype value %d)\n", __func__, model.hparams.ftype);
return false;
}
const size_t scale = model.hparams.ftype ? 1 : 2;
- fprintf(stderr, "%s: n_vocab = %d\n", __func__, hparams.n_vocab);
- fprintf(stderr, "%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
- fprintf(stderr, "%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
- fprintf(stderr, "%s: n_audio_head = %d\n", __func__, hparams.n_audio_head);
- fprintf(stderr, "%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
- fprintf(stderr, "%s: n_text_ctx = %d\n", __func__, hparams.n_text_ctx);
- fprintf(stderr, "%s: n_text_state = %d\n", __func__, hparams.n_text_state);
- fprintf(stderr, "%s: n_text_head = %d\n", __func__, hparams.n_text_head);
- fprintf(stderr, "%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
- fprintf(stderr, "%s: n_mels = %d\n", __func__, hparams.n_mels);
- fprintf(stderr, "%s: ftype = %d\n", __func__, model.hparams.ftype);
- fprintf(stderr, "%s: qntvr = %d\n", __func__, qntvr);
- fprintf(stderr, "%s: type = %d\n", __func__, model.type);
+ log("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+ log("%s: n_audio_ctx = %d\n", __func__, hparams.n_audio_ctx);
+ log("%s: n_audio_state = %d\n", __func__, hparams.n_audio_state);
+ log("%s: n_audio_head = %d\n", __func__, hparams.n_audio_head);
+ log("%s: n_audio_layer = %d\n", __func__, hparams.n_audio_layer);
+ log("%s: n_text_ctx = %d\n", __func__, hparams.n_text_ctx);
+ log("%s: n_text_state = %d\n", __func__, hparams.n_text_state);
+ log("%s: n_text_head = %d\n", __func__, hparams.n_text_head);
+ log("%s: n_text_layer = %d\n", __func__, hparams.n_text_layer);
+ log("%s: n_mels = %d\n", __func__, hparams.n_mels);
+ log("%s: ftype = %d\n", __func__, model.hparams.ftype);
+ log("%s: qntvr = %d\n", __func__, qntvr);
+ log("%s: type = %d\n", __func__, model.type);
// print memory requirements
{
const size_t mem_required_decoder =
scale*MEM_REQ_KV_SELF.at(model.type);
- fprintf(stderr, "%s: mem required = %7.2f MB (+ %7.2f MB per decoder)\n", __func__,
+ log("%s: mem required = %7.2f MB (+ %7.2f MB per decoder)\n", __func__,
mem_required / 1024.0 / 1024.0, mem_required_decoder / 1024.0 / 1024.0);
}
read_safe(loader, n_vocab);
//if (n_vocab != model.hparams.n_vocab) {
- // fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
+ // log("%s: invalid model file '%s' (bad vocab size %d != %d)\n",
// __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
// return false;
//}
word.assign(&tmp[0], tmp.size());
} else {
// seems like we have an empty-string token in multi-language models (i = 50256)
- //fprintf(stderr, "%s: warning: empty-string token in vocab, i = %d\n", __func__, i);
+ //log("%s: warning: empty-string token in vocab, i = %d\n", __func__, i);
word = "";
}
}
if (n_vocab < model.hparams.n_vocab) {
- fprintf(stderr, "%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
+ log("%s: adding %d extra tokens\n", __func__, model.hparams.n_vocab - n_vocab);
for (int i = n_vocab; i < model.hparams.n_vocab; i++) {
if (i > vocab.token_beg) {
word = "[_TT_" + std::to_string(i - vocab.token_beg) + "]";
ctx_size += (15 + 15*n_audio_layer + 24*n_text_layer)*512; // object overhead
- fprintf(stderr, "%s: model ctx = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+ log("%s: model ctx = %7.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
}
// create the ggml context
model.ctx = ggml_init(params);
if (!model.ctx) {
- fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+ log("%s: ggml_init() failed\n", __func__);
return false;
}
}
name.assign(&tmp[0], tmp.size());
if (model.tensors.find(name) == model.tensors.end()) {
- fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.data());
+ log("%s: unknown tensor '%s' in model file\n", __func__, name.data());
return false;
}
auto tensor = model.tensors[name.data()];
if (ggml_nelements(tensor) != nelements) {
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
- fprintf(stderr, "%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n",
+ log("%s: tensor '%s' has wrong size in model file\n", __func__, name.data());
+ log("%s: shape: [%d, %d, %d], expected: [%d, %d, %d]\n",
__func__, ne[0], ne[1], ne[2], (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2]);
return false;
}
if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1] || tensor->ne[2] != ne[2]) {
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
+ log("%s: tensor '%s' has wrong shape in model file: got [%d, %d, %d], expected [%d, %d, %d]\n",
__func__, name.data(), (int) tensor->ne[0], (int) tensor->ne[1], (int) tensor->ne[2], ne[0], ne[1], ne[2]);
return false;
}
const size_t bpe = ggml_type_size(ggml_type(ttype));
if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+ log("%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
__func__, name.data(), ggml_nbytes(tensor), nelements*bpe);
return false;
}
model.n_loaded++;
}
- fprintf(stderr, "%s: model size = %7.2f MB\n", __func__, total_size/1024.0/1024.0);
+ log("%s: model size = %7.2f MB\n", __func__, total_size/1024.0/1024.0);
if (model.n_loaded == 0) {
- fprintf(stderr, "%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
+ log("%s: WARN no tensors loaded from model file - assuming empty model for testing\n", __func__);
} else if (model.n_loaded != (int) model.tensors.size()) {
- fprintf(stderr, "%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
+ log("%s: ERROR not all tensors loaded from model file - expected %zu, got %d\n", __func__, model.tensors.size(), model.n_loaded);
return false;
}
}
{
struct ggml_cgraph gf = {};
- ggml_build_forward_expand(&gf, cur);
+ ggml_build_forward_expand (&gf, cur);
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
//ggml_graph_print(&gf);
// run the computation
{
- ggml_build_forward_expand(&gf, logits);
+ ggml_build_forward_expand (&gf, logits);
ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
}
return std::string(buf);
}
+#define SIN_COS_N_COUNT WHISPER_N_FFT
+static float sin_vals[SIN_COS_N_COUNT];
+static float cos_vals[SIN_COS_N_COUNT];
+
+// In FFT, we frequently use sine and cosine operations with the same values.
+// We can use precalculated values to speed up the process.
+static void fill_sin_cos_table() {
+ static bool is_filled = false;
+ if (is_filled) return;
+ for (int i = 0; i < SIN_COS_N_COUNT; i++) {
+ double theta = (2*M_PI*i)/SIN_COS_N_COUNT;
+ sin_vals[i] = sinf(theta);
+ cos_vals[i] = cosf(theta);
+ }
+ is_filled = true;
+}
+
// naive Discrete Fourier Transform
// input is real-valued
// output is complex-valued
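+// X[k] = sum_n in[n]*(cos(2*pi*k*n/N) - i*sin(2*pi*k*n/N))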
int N = in.size();
out.resize(N*2);
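+ // table lookup: theta = 2*pi*k*n/N corresponds to index k*n*(SIN_COS_N_COUNT/N);
+ // this assumes N divides SIN_COS_N_COUNT, which holds for the sizes that the
+ // radix-2 fft() below feeds into this naive DFT in the default path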
+ const int sin_cos_step = SIN_COS_N_COUNT / N;
for (int k = 0; k < N; k++) {
float re = 0;
float im = 0;
for (int n = 0; n < N; n++) {
- float angle = 2*M_PI*k*n/N;
- re += in[n]*cos(angle);
- im -= in[n]*sin(angle);
+ int idx = (k * n * sin_cos_step) % SIN_COS_N_COUNT; // t = 2*M_PI*k*n/N
+ re += in[n]*cos_vals[idx]; // cos(t)
+ im -= in[n]*sin_vals[idx]; // sin(t)
}
out[k*2 + 0] = re;
fft(even, even_fft);
fft(odd, odd_fft);
+ const int sin_cos_step = SIN_COS_N_COUNT / N;
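+ // Cooley-Tukey butterfly: out[k] = even[k] + w^k*odd[k] and
+ // out[k + N/2] = even[k] - w^k*odd[k], where w = exp(-2*pi*i/N);
+ // re/im below are the real and imaginary parts of w^k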
for (int k = 0; k < N/2; k++) {
- float theta = 2*M_PI*k/N;
-
- float re = cos(theta);
- float im = -sin(theta);
+ int idx = k * sin_cos_step; // t = 2*M_PI*k/N
+ float re = cos_vals[idx]; // cos(t)
+ float im = -sin_vals[idx]; // sin(t)
float re_odd = odd_fft[2*k + 0];
float im_odd = odd_fft[2*k + 1];
}
}
-static void log_mel_spectrogram_worker_thread(int ith, const std::vector<float> &hann, const float *samples,
- int n_samples, int fft_size, int fft_step, int n_threads,
- const whisper_filters &filters, bool speed_up, whisper_mel &mel) {
- std::vector<float> fft_in(fft_size, 0.0);
- std::vector<float> fft_out(2 * fft_size);
- int n_fft = 1 + (speed_up ? fft_size / 4 : fft_size / 2);
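+// Hann window, following torch.hann_window(window_length, periodic):
+// a periodic window divides by length, a symmetric one by length - 1
+// (hence the offset below)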
+static bool hann_window(int length, bool periodic, std::vector<float> & output) {
+ if (output.size() < static_cast<size_t>(length)) {
+ output.resize(length);
+ }
+ int offset = -1;
+ if (periodic) {
+ offset = 0;
+ }
+ for (int i = 0; i < length; i++) {
+ output[i] = 0.5*(1.0 - cosf((2.0*M_PI*i)/(length + offset)));
+ }
- for (int i = ith; i < mel.n_len; i += n_threads) {
- const int offset = i * fft_step;
+ return true;
+}
- // apply Hanning window
- for (int j = 0; j < fft_size; j++) {
- if (offset + j < n_samples) {
- fft_in[j] = hann[j] * samples[offset + j];
- } else {
- fft_in[j] = 0.0;
- }
+static void log_mel_spectrogram_worker_thread(int ith, const std::vector<float> & hann, const std::vector<float> & samples,
+ int n_samples, int frame_size, int frame_step, int n_threads,
+ const whisper_filters & filters, whisper_mel & mel) {
+ std::vector<float> fft_in(frame_size, 0.0);
+ std::vector<float> fft_out(2 * frame_size); // interleaved re/im pairs; fft() resizes as needed
+ // n_fft == 1 + (frame_size / 2) == 1 + (WHISPER_N_FFT / 2): the bins from DC (bin_0) up to Nyquist (bin_nyquist)
+ int n_fft = 1 + (frame_size / 2);
+ int i = ith;
+
+ // compute the FFT only for frames that overlap the input signal;
+ // the all-zero frames past the end are handled in the loop below
+ for (; i < std::min(n_samples / frame_step + 1, mel.n_len); i += n_threads) {
+ const int offset = i * frame_step;
+
+ // apply Hann window (handling the zero tail separately below is ~10% faster than a per-sample bounds check)
+ for (int j = 0; j < std::min(frame_size, n_samples - offset); j++) {
+ fft_in[j] = hann[j] * samples[offset + j];
+ }
+ // fill the rest with zeros
+ if (n_samples - offset < frame_size) {
+ std::fill(fft_in.begin() + (n_samples - offset), fft_in.end(), 0.0);
}
- // FFT -> mag^2
+ // FFT
fft(fft_in, fft_out);
- for (int j = 0; j < fft_size; j++) {
+ // calculate modulus^2 of the complex FFT output
+ // note: plain multiplications are used on purpose; computing the same value with
+ // pow(fft_out[2 * j + 0], 2) + pow(fft_out[2 * j + 1], 2) was observed to degrade inference quality
+ for (int j = 0; j < frame_size; j++) {
fft_out[j] = (fft_out[2 * j + 0] * fft_out[2 * j + 0] + fft_out[2 * j + 1] * fft_out[2 * j + 1]);
}
- for (int j = 1; j < fft_size / 2; j++) {
- fft_out[j] += fft_out[fft_size - j];
- }
-
- if (speed_up) {
- // scale down in the frequency domain results in a speed up in the time domain
- for (int j = 0; j < n_fft; j++) {
- fft_out[j] = 0.5 * (fft_out[2 * j] + fft_out[2 * j + 1]);
- }
- }
// mel spectrogram
for (int j = 0; j < mel.n_mel; j++) {
int k = 0;
for (k = 0; k < n_fft - 3; k += 4) {
sum +=
- fft_out[k + 0] * filters.data[j*n_fft + k + 0] +
- fft_out[k + 1] * filters.data[j*n_fft + k + 1] +
- fft_out[k + 2] * filters.data[j*n_fft + k + 2] +
- fft_out[k + 3] * filters.data[j*n_fft + k + 3];
+ fft_out[k + 0] * filters.data[j * n_fft + k + 0] +
+ fft_out[k + 1] * filters.data[j * n_fft + k + 1] +
+ fft_out[k + 2] * filters.data[j * n_fft + k + 2] +
+ fft_out[k + 3] * filters.data[j * n_fft + k + 3];
}
// handle n_fft remainder
mel.data[j * mel.n_len + i] = sum;
}
}
+
+ // the remaining frames lie entirely in the zero padding, so their spectrum
+ // is all zero; store log10 of the 1e-10 floor for every mel band directly
+ double sum = log10(1e-10);
+ for (; i < mel.n_len; i += n_threads) {
+ for (int j = 0; j < mel.n_mel; j++) {
+ mel.data[j * mel.n_len + i] = sum;
+ }
+ }
}
-// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L92-L124
+// ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L110-L157
static bool log_mel_spectrogram(
- whisper_state & wstate,
- const float * samples,
+ whisper_state & wstate,
+ const float * samples,
const int n_samples,
const int /*sample_rate*/,
- const int fft_size,
- const int fft_step,
+ const int frame_size,
+ const int frame_step,
const int n_mel,
const int n_threads,
- const whisper_filters & filters,
- const bool speed_up,
- whisper_mel & mel) {
+ const whisper_filters & filters,
+ const bool debug,
+ whisper_mel & mel) {
const int64_t t_start_us = ggml_time_us();
- // Hanning window
+ // Hann window (cosf is used so the output matches the reference implementation exactly)
+ // ref: https://pytorch.org/docs/stable/generated/torch.hann_window.html
+ // ref: https://github.com/openai/whisper/blob/main/whisper/audio.py#L147
std::vector<float> hann;
- hann.resize(fft_size);
- for (int i = 0; i < fft_size; i++) {
- hann[i] = 0.5*(1.0 - cos((2.0*M_PI*i)/(fft_size)));
- }
+ hann_window(frame_size, true, hann);
- mel.n_mel = n_mel;
- mel.n_len = n_samples/fft_step;
- mel.n_len_org = mel.n_len;
- std::vector<float> samples_padded;
-
- // pad audio with at least one extra chunk of zeros
- {
- const int pad = (100*WHISPER_CHUNK_SIZE)/2;
+ // Calculate the length of padding
+ int64_t stage_1_pad = WHISPER_SAMPLE_RATE * 30;
+ int64_t stage_2_pad = frame_size / 2;
- if (mel.n_len % pad != 0) {
- mel.n_len = (mel.n_len/pad + 1)*pad;
- }
- mel.n_len += pad;
+ // Initialize a vector and copy data from C array to it.
+ std::vector<float> samples_padded;
+ samples_padded.resize(n_samples + stage_1_pad + stage_2_pad * 2);
+ std::copy(samples, samples + n_samples, samples_padded.begin() + stage_2_pad);
- samples_padded.resize(mel.n_len*fft_step);
- memcpy(samples_padded.data(), samples, n_samples*sizeof(float));
- memset(samples_padded.data() + n_samples, 0, (mel.n_len*fft_step - n_samples)*sizeof(float));
+ // zero-pad 30 seconds at the end of the audio (480,000 samples at 16 kHz), plus frame_size/2 trailing zeros
+ std::fill(samples_padded.begin() + n_samples + stage_2_pad, samples_padded.begin() + n_samples + stage_1_pad + 2 * stage_2_pad, 0);
- samples = samples_padded.data();
- }
+ // reflective pad 200 samples at the beginning of audio
+ std::reverse_copy(samples + 1, samples + 1 + stage_2_pad, samples_padded.begin());
- mel.data.resize(mel.n_mel*mel.n_len);
+ mel.n_mel = n_mel;
+ // https://github.com/pytorch/pytorch/blob/main/aten/src/ATen/native/SpectralOps.cpp#L936
+ // Calculate number of frames + remove the last frame
+ mel.n_len = (samples_padded.size() - frame_size) / frame_step;
+ // Calculate semi-padded sample length to ensure compatibility
+ mel.n_len_org = 1 + (n_samples + stage_2_pad - frame_size) / frame_step;
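+ // e.g. (illustrative): 1 s of 16 kHz audio with frame_size = 400 and frame_step = 160:
+ // padded length = 16000 + 480000 + 400 = 496400, n_len = (496400 - 400)/160 = 3100,
+ // n_len_org = 1 + (16000 + 200 - 400)/160 = 99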
+ mel.data.resize(mel.n_mel * mel.n_len);
- //printf("%s: n_samples = %d, n_len = %d\n", __func__, n_samples, mel.n_len);
- //printf("%s: recording length: %f s\n", __func__, (float) n_samples/sample_rate);
{
std::vector<std::thread> workers(n_threads - 1);
for (int iw = 0; iw < n_threads - 1; ++iw) {
workers[iw] = std::thread(
- log_mel_spectrogram_worker_thread, iw + 1, std::cref(hann), samples,
- n_samples, fft_size, fft_step, n_threads,
- std::cref(filters), speed_up, std::ref(mel));
+ log_mel_spectrogram_worker_thread, iw + 1, std::cref(hann), std::cref(samples_padded), // cref avoids copying the padded buffer per thread
+ n_samples + stage_2_pad, frame_size, frame_step, n_threads,
+ std::cref(filters), std::ref(mel));
}
// main thread
- log_mel_spectrogram_worker_thread(0, hann, samples, n_samples, fft_size, fft_step, n_threads, filters, speed_up, mel);
+ log_mel_spectrogram_worker_thread(0, hann, samples_padded, n_samples + stage_2_pad, frame_size, frame_step, n_threads, filters, mel);
for (int iw = 0; iw < n_threads - 1; ++iw) {
workers[iw].join();
mmax = mel.data[i];
}
}
- //printf("%s: max = %f\n", __func__, mmax);
mmax -= 8.0;
wstate.t_mel_us += ggml_time_us() - t_start_us;
- //printf("mel.n_len() = %d, divided by 1500: %f, n_samples / fft_step: %d\n", mel.n_len, mel.n_len / 1500.0, n_samples / fft_step);
+ // Dump log_mel_spectrogram
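+ // (a flat JSON array of mel.n_mel * mel.n_len values, row-major by mel band)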
+ if (debug) {
+ std::ofstream outFile("log_mel_spectrogram.json");
+ outFile << "[";
+ for (uint64_t i = 0; i < mel.data.size() - 1; i++) {
+ outFile << mel.data[i] << ", ";
+ }
+ outFile << mel.data[mel.data.size() - 1] << "]";
+ outFile.close();
+ }
return true;
}
--j;
}
if (!found) {
- fprintf(stderr, "unknown token \n");
+ log("unknown token\n");
++i;
}
}
#endif
struct whisper_state * whisper_init_state(whisper_context * ctx) {
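+ // build the sin/cos lookup tables used by fft()/dft() (one-time, idempotent)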
+ fill_sin_cos_table();
whisper_state * state = new whisper_state;
const size_t scale = ctx->model.hparams.ftype ? 1 : 2;
if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_SELF.at(ctx->model.type), state->decoders[0].kv_self, ctx->itype, ctx->model.hparams.n_text_ctx)) {
- fprintf(stderr, "%s: kv_cache_init() failed for self-attention cache\n", __func__);
+ log("%s: kv_cache_init() failed for self-attention cache\n", __func__);
delete state;
return nullptr;
}
{
const size_t memory_size = ggml_nbytes(state->decoders[0].kv_self.k) + ggml_nbytes(state->decoders[0].kv_self.v);
- fprintf(stderr, "%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+ log("%s: kv self size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
}
if (!kv_cache_init(ctx->model.hparams, scale * MEM_REQ_KV_CROSS.at(ctx->model.type), state->kv_cross, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
- fprintf(stderr, "%s: kv_cache_init() failed for cross-attention cache\n", __func__);
+ log("%s: kv_cache_init() failed for cross-attention cache\n", __func__);
delete state;
return nullptr;
}
{
const size_t memory_size = ggml_nbytes(state->kv_cross.k) + ggml_nbytes(state->kv_cross.v);
- fprintf(stderr, "%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
+ log("%s: kv cross size = %7.2f MB\n", __func__, memory_size / 1024.0 / 1024.0);
}
#ifdef WHISPER_USE_COREML
const auto path_coreml = whisper_get_coreml_path_encoder(ctx->path_model);
- fprintf(stderr, "%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
- fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
+ log("%s: loading Core ML model from '%s'\n", __func__, path_coreml.c_str());
+ log("%s: first run on a device may take a while ...\n", __func__);
state->ctx_coreml = whisper_coreml_init(path_coreml.c_str());
if (!state->ctx_coreml) {
- fprintf(stderr, "%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
+ log("%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
#ifndef WHISPER_COREML_ALLOW_FALLBACK
return nullptr;
#endif
} else {
- fprintf(stderr, "%s: Core ML model loaded\n", __func__);
+ log("%s: Core ML model loaded\n", __func__);
}
#endif
return 1;
#else
if (!model_path && ctx->path_model.empty()) {
- fprintf(stderr, "%s: model_path is nullptr, and ctx has no model_path set.\n", __func__);
+ log("%s: model_path is nullptr, and ctx has no model_path set.\n", __func__);
return 1;
}
path_cache = cache_dir;
}
- fprintf(stderr, "%s: loading OpenVINO model from '%s'\n", __func__, path_encoder.c_str());
- fprintf(stderr, "%s: first run on a device may take a while ...\n", __func__);
+ log("%s: loading OpenVINO model from '%s'\n", __func__, path_encoder.c_str());
+ log("%s: first run on a device may take a while ...\n", __func__);
ctx->state->ctx_openvino = whisper_openvino_init(path_encoder.c_str(), device, path_cache.c_str());
if (!ctx->state->ctx_openvino) {
- fprintf(stderr, "%s: failed to init OpenVINO encoder from '%s'\n", __func__, path_encoder.c_str());
+ log("%s: failed to init OpenVINO encoder from '%s'\n", __func__, path_encoder.c_str());
return 1;
} else {
- fprintf(stderr, "%s: OpenVINO model loaded\n", __func__);
+ log("%s: OpenVINO model loaded\n", __func__);
}
return 0;
struct whisper_context * whisper_init_from_file_no_state(const char * path_model) {
- fprintf(stderr, "%s: loading model from '%s'\n", __func__, path_model);
+ log("%s: loading model from '%s'\n", __func__, path_model);
auto fin = std::ifstream(path_model, std::ios::binary);
if (!fin) {
- fprintf(stderr, "%s: failed to open '%s'\n", __func__, path_model);
+ log("%s: failed to open '%s'\n", __func__, path_model);
return nullptr;
}
buf_context ctx = { reinterpret_cast<uint8_t*>(buffer), buffer_size, 0 };
- fprintf(stderr, "%s: loading model from buffer\n", __func__);
+ log("%s: loading model from buffer\n", __func__);
whisper_model_loader loader = {};
if (!whisper_model_load(loader, *ctx)) {
loader->close(loader->context);
- fprintf(stderr, "%s: failed to load model\n", __func__);
+ log("%s: failed to load model\n", __func__);
delete ctx;
return nullptr;
}
int whisper_pcm_to_mel_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, WHISPER_N_FFT, WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
- fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
+ log("%s: failed to compute mel spectrogram\n", __func__);
return -1;
}
return whisper_pcm_to_mel_with_state(ctx, ctx->state, samples, n_samples, n_threads);
}
-// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2
+// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (a PV without phase locking degrades quality)
int whisper_pcm_to_mel_phase_vocoder_with_state(struct whisper_context * ctx, struct whisper_state * state, const float * samples, int n_samples, int n_threads) {
- if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, true, state->mel)) {
- fprintf(stderr, "%s: failed to compute mel spectrogram\n", __func__);
+ if (!log_mel_spectrogram(*state, samples, n_samples, WHISPER_SAMPLE_RATE, 2 * WHISPER_N_FFT, 2 * WHISPER_HOP_LENGTH, WHISPER_N_MEL, n_threads, ctx->model.filters, false, state->mel)) {
+ log("%s: failed to compute mel spectrogram\n", __func__);
return -1;
}
return 0;
}
-// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2
+// same as whisper_pcm_to_mel, but applies a Phase Vocoder to speed up the audio x2 (a PV without phase locking degrades quality)
int whisper_pcm_to_mel_phase_vocoder(struct whisper_context * ctx, const float * samples, int n_samples, int n_threads) {
return whisper_pcm_to_mel_phase_vocoder_with_state(ctx, ctx->state, samples, n_samples, n_threads);
}
+// same as whisper_pcm_to_mel, but applies WSOLA to speed up the audio x2
+// TODO
+
+// same as whisper_pcm_to_mel, but applies HPTSM to speed up the audio x2
+// TODO
+
+// same as whisper_pcm_to_mel, but applies PV (with phase lock) to speed up the audio x2
+// TODO
+
int whisper_set_mel_with_state(
struct whisper_context * /*ctx*/,
struct whisper_state * state,
int n_len,
int n_mel) {
if (n_mel != WHISPER_N_MEL) {
- fprintf(stderr, "%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
+ log("%s: invalid number of mel bands: %d (expected %d)\n", __func__, n_mel, WHISPER_N_MEL);
return -1;
}
int whisper_encode_with_state(struct whisper_context * ctx, struct whisper_state * state, int offset, int n_threads) {
if (!whisper_encode_internal(*ctx, *state, offset, n_threads)) {
- fprintf(stderr, "%s: failed to eval\n", __func__);
+ log("%s: failed to eval\n", __func__);
return -1;
}
int whisper_encode(struct whisper_context * ctx, int offset, int n_threads) {
if (!whisper_encode_internal(*ctx, *ctx->state, offset, n_threads)) {
- fprintf(stderr, "%s: failed to eval\n", __func__);
+ log("%s: failed to eval\n", __func__);
return -1;
}
const int selected_decoder_id = 0;
if (!whisper_decode_internal(*ctx, *state, state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
- fprintf(stderr, "%s: failed to eval\n", __func__);
+ log("%s: failed to eval\n", __func__);
return 1;
}
const int selected_decoder_id = 0;
if (ctx->state == nullptr) {
- fprintf(stderr, "%s: ERROR state was not loaded.\n", __func__);
+ log("%s: ERROR state was not loaded.\n", __func__);
return false;
}
-
if (!whisper_decode_internal(*ctx, *ctx->state, ctx->state->decoders[selected_decoder_id], tokens, n_tokens, n_past, n_threads)) {
- fprintf(stderr, "%s: failed to eval\n", __func__);
+ log("%s: failed to eval\n", __func__);
return 1;
}
const auto res = tokenize(ctx->vocab, text);
if (n_max_tokens < (int) res.size()) {
- fprintf(stderr, "%s: too many resulting tokens: %d (max %d)\n", __func__, (int) res.size(), n_max_tokens);
+ log("%s: too many resulting tokens: %d (max %d)\n", __func__, (int) res.size(), n_max_tokens);
return -1;
}
}
}
- fprintf(stderr, "%s: unknown language '%s'\n", __func__, lang);
+ log("%s: unknown language '%s'\n", __func__, lang);
return -1;
}
return g_lang.at(lang).first;
}
}
- fprintf(stderr, "%s: unknown language id %d\n", __func__, id);
+ log("%s: unknown language id %d\n", __func__, id);
return nullptr;
}
const int seek = offset_ms/10;
if (seek < 0) {
- fprintf(stderr, "%s: offset %dms is before the start of the audio\n", __func__, offset_ms);
+ log("%s: offset %dms is before the start of the audio\n", __func__, offset_ms);
return -1;
}
if (seek >= state->mel.n_len_org) {
- fprintf(stderr, "%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len_org*10);
+ log("%s: offset %dms is past the end of the audio (%dms)\n", __func__, offset_ms, state->mel.n_len_org*10);
return -2;
}
// run the encoder
if (whisper_encode_with_state(ctx, state, seek, n_threads) != 0) {
- fprintf(stderr, "%s: failed to encode\n", __func__);
+ log("%s: failed to encode\n", __func__);
return -6;
}
const std::vector<whisper_token> prompt = { whisper_token_sot(ctx) };
if (whisper_decode_with_state(ctx, state, prompt.data(), prompt.size(), 0, n_threads) != 0) {
- fprintf(stderr, "%s: failed to decode\n", __func__);
+ log("%s: failed to decode\n", __func__);
return -7;
}
return ctx->state->logits.data();
}
-
float * whisper_get_logits_from_state(struct whisper_state * state) {
return state->logits.data();
}
void whisper_print_timings(struct whisper_context * ctx) {
const int64_t t_end_us = ggml_time_us();
- fprintf(stderr, "\n");
- fprintf(stderr, "%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
+ log("\n");
+ log("%s: load time = %8.2f ms\n", __func__, ctx->t_load_us / 1000.0f);
if (ctx->state != nullptr) {
const int32_t n_sample = std::max(1, ctx->state->n_sample);
const int32_t n_encode = std::max(1, ctx->state->n_encode);
const int32_t n_decode = std::max(1, ctx->state->n_decode);
- fprintf(stderr, "%s: fallbacks = %3d p / %3d h\n", __func__, ctx->state->n_fail_p, ctx->state->n_fail_h);
- fprintf(stderr, "%s: mel time = %8.2f ms\n", __func__, ctx->state->t_mel_us / 1000.0f);
- fprintf(stderr, "%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_sample_us, n_sample, 1e-3f * ctx->state->t_sample_us / n_sample);
- fprintf(stderr, "%s: encode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_encode_us, n_encode, 1e-3f * ctx->state->t_encode_us / n_encode);
- fprintf(stderr, "%s: decode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_decode_us, n_decode, 1e-3f * ctx->state->t_decode_us / n_decode);
+ log("%s: fallbacks = %3d p / %3d h\n", __func__, ctx->state->n_fail_p, ctx->state->n_fail_h);
+ log("%s: mel time = %8.2f ms\n", __func__, ctx->state->t_mel_us / 1000.0f);
+ log("%s: sample time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_sample_us, n_sample, 1e-3f * ctx->state->t_sample_us / n_sample);
+ log("%s: encode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_encode_us, n_encode, 1e-3f * ctx->state->t_encode_us / n_encode);
+ log("%s: decode time = %8.2f ms / %5d runs (%8.2f ms per run)\n", __func__, 1e-3f * ctx->state->t_decode_us, n_decode, 1e-3f * ctx->state->t_decode_us / n_decode);
}
- fprintf(stderr, "%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
+ log("%s: total time = %8.2f ms\n", __func__, (t_end_us - ctx->t_start_us)/1000.0f);
}
void whisper_reset_timings(struct whisper_context * ctx) {
s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
s += "BLAS = " + std::to_string(ggml_cpu_has_blas()) + " | ";
s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
+ s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
/*.max_tokens =*/ 0,
/*.speed_up =*/ false,
+ /*.debug_mode =*/ false,
/*.audio_ctx =*/ 0,
/*.tdrz_enable =*/ false,
WHISPER_ASSERT(n_logits == ctx.vocab.n_vocab);
// extract the logits for the last token
- // we will be mutating and therefore we don't want to use the ctx.logits buffer directly
+ // we will be mutating, and therefore we don't want to use the ctx.logits buffer directly
auto & probs = decoder.probs;
auto & logits = decoder.logits;
auto & logprobs = decoder.logprobs;
const bool last_was_timestamp = tokens_cur.size() > 0 && tokens_cur.back().id >= vocab.token_beg;
const bool penultimate_was_timestamp = tokens_cur.size() < 2 || tokens_cur[tokens_cur.size() - 2].id >= vocab.token_beg;
- //fprintf(stderr, "last_was_timestamp=%d penultimate_was_timestamp=%d\n", last_was_timestamp, penultimate_was_timestamp);
+ //log("last_was_timestamp=%d penultimate_was_timestamp=%d\n", last_was_timestamp, penultimate_was_timestamp);
if (last_was_timestamp) {
if (penultimate_was_timestamp) {
const float max_text_token_logprob = *std::max_element(logprobs.begin(), logprobs.begin() + vocab.token_beg);
- //fprintf(stderr, "timestamp_logprob=%f max_text_token_logprob=%f\n", timestamp_logprob, max_text_token_logprob);
+ //log("timestamp_logprob=%f max_text_token_logprob=%f\n", timestamp_logprob, max_text_token_logprob);
if (timestamp_logprob > max_text_token_logprob) {
for (int i = 0; i < vocab.token_beg; ++i) {
result_all.clear();
- // compute log mel spectrogram
- if (params.speed_up) {
- if (whisper_pcm_to_mel_phase_vocoder_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
- fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
+ if (n_samples > 0) {
+ // compute log mel spectrogram
+ if (params.speed_up) {
+ // TODO: replace the Phase Vocoder with a more advanced algorithm;
+ // the speed_up path is disabled for now, so report failure
+ log("%s: failed to compute log mel spectrogram\n", __func__);
return -1;
- }
- } else {
- if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
- fprintf(stderr, "%s: failed to compute log mel spectrogram\n", __func__);
- return -2;
+ } else {
+ if (whisper_pcm_to_mel_with_state(ctx, state, samples, n_samples, params.n_threads) != 0) {
+ log("%s: failed to compute log mel spectrogram\n", __func__);
+ return -2;
+ }
}
}
const auto lang_id = whisper_lang_auto_detect_with_state(ctx, state, 0, params.n_threads, probs.data());
if (lang_id < 0) {
- fprintf(stderr, "%s: failed to auto-detect language\n", __func__);
+ log("%s: failed to auto-detect language\n", __func__);
return -3;
}
state->lang_id = lang_id;
params.language = whisper_lang_str(lang_id);
- fprintf(stderr, "%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
+ log("%s: auto-detected language: %s (p = %f)\n", __func__, params.language, probs[whisper_lang_id(params.language)]);
if (params.detect_language) {
return 0;
}
state->t_beg = 0;
state->t_last = 0;
state->tid_last = 0;
- state->energy = get_signal_energy(samples, n_samples, 32);
+ if (n_samples > 0) {
+ state->energy = get_signal_energy(samples, n_samples, 32);
+ }
}
const int seek_start = params.offset_ms/10;
const int seek_end = params.duration_ms == 0 ? whisper_n_len_from_state(state) : seek_start + params.duration_ms/10;
- // if length of spectrogram is less than 1s (100 samples), then return
- // basically don't process anything that is less than 1s
+ // if length of spectrogram is less than 1.0s (100 frames), then return
+ // basically don't process anything that is less than 1.0s
// see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
if (seek_end < seek_start + (params.speed_up ? 50 : 100)) {
return 0;
if (decoder.kv_self.ctx == nullptr) {
decoder.kv_self = state->decoders[0].kv_self;
if (!kv_cache_reinit(decoder.kv_self)) {
- fprintf(stderr, "%s: kv_cache_reinit() failed for self-attention, decoder %d\n", __func__, j);
+ log("%s: kv_cache_reinit() failed for self-attention, decoder %d\n", __func__, j);
return -4;
}
// overwrite audio_ctx, max allowed is hparams.n_audio_ctx
if (params.audio_ctx > whisper_n_audio_ctx(ctx)) {
- fprintf(stderr, "%s: audio_ctx is larger than the maximum allowed (%d > %d)\n", __func__, params.audio_ctx, whisper_n_audio_ctx(ctx));
+ log("%s: audio_ctx is larger than the maximum allowed (%d > %d)\n", __func__, params.audio_ctx, whisper_n_audio_ctx(ctx));
return -5;
}
state->exp_n_audio_ctx = params.audio_ctx;
}
}
- int progress_prev = 0;
- int progress_step = 5;
-
int seek = seek_start;
std::vector<whisper_token> prompt;
// main loop
while (true) {
- const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
- while (progress_cur >= progress_prev + progress_step) {
- progress_prev += progress_step;
- if (params.print_progress) {
- fprintf(stderr, "%s: progress = %3d%%\n", __func__, progress_prev);
- }
- }
if (params.progress_callback) {
+ const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
+
params.progress_callback(
- ctx, ctx->state, progress_prev, params.progress_callback_user_data);
+ ctx, ctx->state, progress_cur, params.progress_callback_user_data);
}
// if only 1 second is left, then stop
if (params.encoder_begin_callback) {
if (params.encoder_begin_callback(ctx, state, params.encoder_begin_callback_user_data) == false) {
- fprintf(stderr, "%s: encoder_begin_callback returned false - aborting\n", __func__);
+ log("%s: encoder_begin_callback returned false - aborting\n", __func__);
break;
}
}
// encode audio features starting at offset seek
if (!whisper_encode_internal(*ctx, *state, seek, params.n_threads)) {
- fprintf(stderr, "%s: failed to encode\n", __func__);
+ log("%s: failed to encode\n", __func__);
return -6;
}
WHISPER_PRINT_DEBUG("\n\n");
if (!whisper_decode_internal(*ctx, *state, state->decoders[0], prompt.data(), prompt.size(), 0, params.n_threads)) {
- fprintf(stderr, "%s: failed to decode\n", __func__);
+ log("%s: failed to decode\n", __func__);
return -7;
}
//WHISPER_PRINT_DEBUG("%s: decoder %d: token %d, kv_self.n %d, seek_delta %d\n", __func__, j, decoder.tokens_tmp[0], decoder.kv_self.n, decoder.seek_delta);
if (!whisper_decode_internal(*ctx, *state, decoder, decoder.tokens_tmp.data(), decoder.tokens_tmp.size(), decoder.kv_self.n, params.n_threads)) {
- fprintf(stderr, "%s: failed to decode\n", __func__);
+ log("%s: failed to decode\n", __func__);
return -8;
}
return 0;
}
-
int whisper_full(
struct whisper_context * ctx,
struct whisper_full_params params,
result.t0 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;
result.t1 += 100 * ((i + 1) * n_samples_per_processor) / WHISPER_SAMPLE_RATE + offset_t;
-
// make sure that segments are not overlapping
if (!ctx->state->result_all.empty()) {
result.t0 = std::max(result.t0, ctx->state->result_all.back().t1);
ctx->state->t_decode_us /= n_processors;
// print information about the audio boundaries
- fprintf(stderr, "\n");
- fprintf(stderr, "%s: the audio has been split into %d chunks at the following times:\n", __func__, n_processors);
+ log("\n");
+ log("%s: the audio has been split into %d chunks at the following times:\n", __func__, n_processors);
for (int i = 0; i < n_processors - 1; ++i) {
- fprintf(stderr, "%s: split %d - %s\n", __func__, (i + 1), to_timestamp(100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t).c_str());
+ log("%s: split %d - %s\n", __func__, (i + 1), to_timestamp(100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t).c_str());
}
- fprintf(stderr, "%s: the transcription quality may be degraded near these boundaries\n", __func__);
+ log("%s: the transcription quality may be degraded near these boundaries\n", __func__);
return ret;
}
const int n_samples = state.energy.size();
if (n_samples == 0) {
- fprintf(stderr, "%s: no signal data available\n", __func__);
+ log("%s: no signal data available\n", __func__);
return;
}
// }
//}
}
+
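+// Install a custom callback for all library log output; the default writes to
+// stderr. Passing nullptr silences logging entirely (log() checks for this).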
+void whisper_set_log_callback(whisper_log_callback callback) {
+ whisper_log = callback;
+}