#include "whisper.h"
-#include "ggml-cpu.h"
-
#include "ggml.h"
+#include "ggml-cpp.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <cassert>
#define _USE_MATH_DEFINES
#include <cmath>
-#include <cstdio>
+#include <codecvt>
#include <cstdarg>
+#include <cstdio>
#include <cstring>
#include <fstream>
+#include <functional>
#include <map>
+#include <mutex>
+#include <random>
+#include <regex>
#include <set>
#include <string>
#include <thread>
#include <vector>
-#include <regex>
-#include <random>
-#include <functional>
-#include <codecvt>
// dummy
static bool ggml_graph_compute_helper(
struct ggml_cgraph * graph,
- std::vector<uint8_t> & buf,
int n_threads,
ggml_abort_callback abort_callback,
void * abort_callback_data) {
- struct ggml_cplan plan = ggml_graph_plan(graph, n_threads, nullptr);
- plan.abort_callback = abort_callback;
- plan.abort_callback_data = abort_callback_data;
+ ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
- if (plan.work_size > 0) {
- buf.resize(plan.work_size);
- plan.work_data = buf.data();
+ auto * reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend.get()));
+
+ auto * set_abort_callback_fn = (ggml_backend_set_abort_callback_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_abort_callback");
+ if (set_abort_callback_fn) {
+ set_abort_callback_fn(backend.get(), abort_callback, abort_callback_data);
+ }
+
+ auto ggml_backend_set_n_threads_fn = (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
+ if (ggml_backend_set_n_threads_fn) {
+ ggml_backend_set_n_threads_fn(backend.get(), n_threads);
}
- return ggml_graph_compute(graph, &plan);
+ return ggml_backend_graph_compute(backend.get(), graph) == GGML_STATUS_SUCCESS;
}
static bool ggml_graph_compute_helper(
return t;
}
+static void whisper_load_backends() {
+#ifdef GGML_BACKEND_DL
+ static std::once_flag flag;
+ std::call_once(flag, []() {
+ ggml_backend_load_all();
+ });
+#endif
+}
+
+// TODO: move these functions to ggml-base with support for ggml-backend?
+
+static ggml_tensor * whisper_set_f32(struct ggml_tensor * t, float v) {
+ GGML_ASSERT(t->type == GGML_TYPE_F32);
+ GGML_ASSERT(ggml_is_contiguous(t));
+ size_t nels = ggml_nelements(t);
+ for (int64_t i = 0; i < nels; ++i) {
+ ((float *) t->data)[i] = v;
+ }
+ return t;
+}
+
+static ggml_tensor * whisper_set_i32(struct ggml_tensor * t, int32_t v) {
+ GGML_ASSERT(t->type == GGML_TYPE_I32);
+ GGML_ASSERT(ggml_is_contiguous(t));
+ size_t nels = ggml_nelements(t);
+ for (int64_t i = 0; i < nels; ++i) {
+ ((int32_t *) t->data)[i] = v;
+ }
+ return t;
+}
+
+static float whisper_get_f32_nd(const struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+ GGML_ASSERT(t->type == GGML_TYPE_F32);
+ void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
+ return *(float *) data;
+}
+
+static void whisper_set_f32_nd(struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, float v) {
+ GGML_ASSERT(t->type == GGML_TYPE_F32);
+ void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
+ *(float *) data = v;
+}
+
+static int32_t whisper_get_i32_nd(const struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3) {
+ GGML_ASSERT(t->type == GGML_TYPE_I32);
+ void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
+ return *(int32_t *) data;
+}
+
+static void whisper_set_i32_nd(struct ggml_tensor * t, int64_t i0, int64_t i1, int64_t i2, int64_t i3, int32_t v) {
+ GGML_ASSERT(t->type == GGML_TYPE_I32);
+ void * data = (char *) t->data + i0*t->nb[0] + i1*t->nb[1] + i2*t->nb[2] + i3*t->nb[3];
+ *(int32_t *) data = v;
+}
+
// faster matrix multiplications for tensors that do not have dimension 0 divisible by "pad"
// the idea is to represent the original matrix multiplication:
//
static ggml_backend_t whisper_backend_init_gpu(const whisper_context_params & params) {
ggml_log_set(g_state.log_callback, g_state.log_callback_user_data);
+ whisper_load_backends();
+
ggml_backend_dev_t dev = nullptr;
int cnt = 0;
GGML_UNUSED(params);
- result.push_back(ggml_backend_cpu_init());
+ result.push_back(ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr));
return result;
}
const char * whisper_print_system_info(void) {
static std::string s;
+ whisper_load_backends();
+
s = "";
- s += "AVX = " + std::to_string(ggml_cpu_has_avx()) + " | ";
- s += "AVX2 = " + std::to_string(ggml_cpu_has_avx2()) + " | ";
- s += "AVX512 = " + std::to_string(ggml_cpu_has_avx512()) + " | ";
- s += "FMA = " + std::to_string(ggml_cpu_has_fma()) + " | ";
- s += "NEON = " + std::to_string(ggml_cpu_has_neon()) + " | ";
- s += "ARM_FMA = " + std::to_string(ggml_cpu_has_arm_fma()) + " | ";
- s += "F16C = " + std::to_string(ggml_cpu_has_f16c()) + " | ";
- s += "FP16_VA = " + std::to_string(ggml_cpu_has_fp16_va()) + " | ";
- s += "WASM_SIMD = " + std::to_string(ggml_cpu_has_wasm_simd()) + " | ";
- s += "SSE3 = " + std::to_string(ggml_cpu_has_sse3()) + " | ";
- s += "SSSE3 = " + std::to_string(ggml_cpu_has_ssse3()) + " | ";
- s += "VSX = " + std::to_string(ggml_cpu_has_vsx()) + " | ";
+ s += "WHISPER : ";
s += "COREML = " + std::to_string(whisper_has_coreml()) + " | ";
s += "OPENVINO = " + std::to_string(whisper_has_openvino()) + " | ";
+ for (size_t i = 0; i < ggml_backend_reg_count(); i++) {
+ auto * reg = ggml_backend_reg_get(i);
+ auto * get_features_fn = (ggml_backend_get_features_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_get_features");
+ if (get_features_fn) {
+ ggml_backend_feature * features = get_features_fn(reg);
+ s += ggml_backend_reg_name(reg);
+ s += " : ";
+ for (; features->name; features++) {
+ s += features->name;
+ s += " = ";
+ s += features->value;
+ s += " | ";
+ }
+ }
+ }
return s.c_str();
}
}
WHISPER_API const char * whisper_bench_ggml_mul_mat_str(int n_threads) {
+ whisper_load_backends();
+
static std::string s;
s = "";
char strbuf[256];
// c: N*N*sizeof(float)
// when F16 is used, there is an extra work buffer of size N*N*sizeof(float)
std::vector<uint8_t> buf(3llu*N_max*N_max*sizeof(float) + 3*ggml_tensor_overhead() + ggml_graph_overhead());
- std::vector<uint8_t> work;
// put a bunch of random data in the buffer
for (size_t i = 0; i < buf.size(); i++) buf[i] = i;
double tsum = 0.0;
// heat-up
- ggml_graph_compute_helper(gf, work, n_threads, nullptr, nullptr);
+ ggml_graph_compute_helper(gf, n_threads, nullptr, nullptr);
for (int i = 0; i < n_max; ++i) {
const int64_t t0 = ggml_time_us();
- ggml_graph_compute_helper(gf, work, n_threads, nullptr, nullptr);
+ ggml_graph_compute_helper(gf, n_threads, nullptr, nullptr);
const int64_t t1 = ggml_time_us();
struct ggml_tensor * cost = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, N + 1, M + 1);
struct ggml_tensor * trace = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, N + 1, M + 1);
- cost = ggml_set_f32(cost, INFINITY);
- trace = ggml_set_f32(trace, -1);
- ggml_set_f32_nd(cost, 0, 0, 0, 0, 0.0);
+ cost = whisper_set_f32(cost, INFINITY);
+ trace = whisper_set_i32(trace, -1);
+ whisper_set_f32_nd(cost, 0, 0, 0, 0, 0.0);
// dtw
// supposedly can be optmized by computing diagonals in parallel ?
// Not sure it is worth it since x will be GENERATED_TOKENS*1500 size at most.
for (int64_t j = 1; j < M + 1; ++j) {
for (int64_t i = 1; i < N + 1; ++i) {
- float c0 = ggml_get_f32_nd(cost, i - 1, j - 1, 0, 0);
- float c1 = ggml_get_f32_nd(cost, i - 1, j, 0, 0);
- float c2 = ggml_get_f32_nd(cost, i, j - 1, 0, 0);
+ float c0 = whisper_get_f32_nd(cost, i - 1, j - 1, 0, 0);
+ float c1 = whisper_get_f32_nd(cost, i - 1, j, 0, 0);
+ float c2 = whisper_get_f32_nd(cost, i, j - 1, 0, 0);
float c;
int32_t t;
t = 2;
}
- c = ggml_get_f32_nd(x, i - 1, j - 1, 0, 0) + c;
- ggml_set_f32_nd(cost, i, j, 0, 0, c);
- ggml_set_i32_nd(trace, i, j, 0, 0, t);
+ c = whisper_get_f32_nd(x, i - 1, j - 1, 0, 0) + c;
+ whisper_set_f32_nd(cost, i, j, 0, 0, c);
+ whisper_set_i32_nd(trace, i, j, 0, 0, t);
}
}
struct ggml_tensor * bt = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, BT_MAX_ROWS, 2);
// trace[0, :] = 2;
for (int64_t i = 0; i < M + 1; ++i)
- ggml_set_i32_nd(trace, 0, i, 0, 0, 2);
+ whisper_set_i32_nd(trace, 0, i, 0, 0, 2);
//trace[:, 0] = 1;
for (int64_t i = 0; i < N + 1; ++i)
- ggml_set_i32_nd(trace, i, 0, 0, 0, 1);
+ whisper_set_i32_nd(trace, i, 0, 0, 0, 1);
int bt_row_idx = BT_MAX_ROWS - 1;
int64_t i = N;
int64_t j = M;
while (i > 0 || j > 0) {
- ggml_set_i32_nd(bt, bt_row_idx, 0, 0, 0, i - 1);
- ggml_set_i32_nd(bt, bt_row_idx, 1, 0, 0, j - 1);
+ whisper_set_i32_nd(bt, bt_row_idx, 0, 0, 0, i - 1);
+ whisper_set_i32_nd(bt, bt_row_idx, 1, 0, 0, j - 1);
--bt_row_idx;
- int32_t t = ggml_get_i32_nd(trace, i, j, 0, 0);
+ int32_t t = whisper_get_i32_nd(trace, i, j, 0, 0);
if (t == 0) {
--i;
--j;
ggml_tensor * r = ggml_new_tensor_2d(ctx, GGML_TYPE_I32, 2, result_n_cols);
for (int64_t i = 0; i < 2; ++i) {
for (int64_t j = 0; j < result_n_cols; ++j) {
- int32_t v = ggml_get_i32_nd(bt, j+bt_row_idx+1, i, 0, 0);
- ggml_set_i32_nd(r, i, j, 0, 0, v);
+ int32_t v = whisper_get_i32_nd(bt, j+bt_row_idx+1, i, 0, 0);
+ whisper_set_i32_nd(r, i, j, 0, 0, v);
}
}
idx = 2*(a->ne[2] - 1) - idx;
}
- filter.push_back(ggml_get_f32_nd(a, i, j, idx, 0));
+ filter.push_back(whisper_get_f32_nd(a, i, j, idx, 0));
}
std::sort(filter.begin(), filter.end());
const float v = filter[filter.size()/2];
- ggml_set_f32_nd(dst, i, j, k, 0, v);
+ whisper_set_f32_nd(dst, i, j, k, 0, v);
filter.clear();
}
}
// Compute
struct ggml_cgraph * gf = ggml_new_graph(gctx);
ggml_build_forward_expand(gf, w);
- ggml_graph_compute_with_ctx(gctx, gf, n_threads);
+
+ ggml_backend_ptr backend { ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr) };
+ ggml_backend_graph_compute(backend.get(), gf);
ggml_tensor * alignment = dtw_and_backtrace(gctx, w);
auto seg_i = state->result_all.begin() + i_segment;
auto tok_i = seg_i->tokens.begin();
for (int i = 0; i < alignment->ne[1]; ++i) {
- int32_t v = ggml_get_i32_nd(alignment, 0, i, 0, 0);
+ int32_t v = whisper_get_i32_nd(alignment, 0, i, 0, 0);
if (v != last_v) {
- int32_t time_index = ggml_get_i32_nd(alignment, 1, i, 0, 0);
+ int32_t time_index = whisper_get_i32_nd(alignment, 1, i, 0, 0);
int64_t timestamp = (time_index * 2) + seek; // Each index on DTW result = 20mS audio
last_v = v;