commandset_list.push_back(cs);
return json{{"index",index}};
}
-json seek(struct whisper_context * ctx, audio_async &audio, json params) {
+json seek(struct whisper_context * /*ctx*/, audio_async & /*audio*/, json /*params*/) {
    // whisper_state has the pertinent offsets, but there also seems to be a large
    // number of scratch buffers that would prevent rewinding the context in a manner similar to llama.
    // I'll give this another pass once everything else is implemented,
    // but for now it's unsupported
- throw json{
+ throw json {
{"code", -32601},
{"message", "Seeking is not yet supported."}
};
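// Illustrative sketch of how a dispatcher could turn the json thrown above into a
// JSON-RPC 2.0 error reply; make_error_reply and the id parameter are assumptions
// for illustration, not names taken from this file.
static json make_error_reply(const json & id, const json & err) {
    return json{
        {"jsonrpc", "2.0"},
        {"id", id},
        {"error", err} // e.g. {"code": -32601, "message": "Seeking is not yet supported."}
    };
}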
jobqueue.pop_front();
// send response
std::string data = resp.dump(-1, ' ', false, json::error_handler_t::replace);
- fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", data.length()+1, data.c_str());
+ fprintf(stdout, "Content-Length: %d\r\n\r\n%s\n", (int)data.length()+1, data.c_str());
std::cout.flush();
}
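// Illustrative counterpart to the framing written above ("Content-Length: <n>\r\n\r\n<json>"):
// a client reads the header line, skips the blank line, then reads <n> bytes of body.
// read_framed_message is an assumed helper name; it needs <istream> and <string>.
static std::string read_framed_message(std::istream & in) {
    std::string header;
    std::getline(in, header);                                       // "Content-Length: <n>\r"
    const size_t n = std::stoul(header.substr(header.find(':') + 1));
    in.ignore(2);                                                   // consume the "\r\n" blank line
    std::string body(n, '\0');
    in.read(&body[0], (std::streamsize) n);
    return body;
}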
return speaker;
}
-void whisper_print_progress_callback(struct whisper_context * ctx, struct whisper_state * /*state*/, int progress, void * user_data) {
+void whisper_print_progress_callback(struct whisper_context * /*ctx*/, struct whisper_state * /*state*/, int progress, void * user_data) {
int progress_step = ((whisper_print_user_data *) user_data)->params->progress_step;
int * progress_prev = &(((whisper_print_user_data *) user_data)->progress_prev);
if (progress >= *progress_prev + progress_step) {
return true;
}
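// Sketch of the throttling used by the callback above: only report when progress has
// advanced by at least progress_step since the last report. print_progress_step is an
// illustrative helper, not a name from this file; the excerpt elides the original if-body.
static void print_progress_step(int progress, int progress_step, int * progress_prev) {
    if (progress >= *progress_prev + progress_step) {
        *progress_prev += progress_step;
        fprintf(stderr, "progress = %3d%%\n", progress);
    }
}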
-bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
+bool output_score(struct whisper_context * ctx, const char * fname, const whisper_params & /*params*/, std::vector<std::vector<float>> /*pcmf32s*/) {
std::ofstream fout(fname);
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
const llama_token * tokens,
const int n_tokens,
const int n_past,
- const int n_threads) {
+ int n_threads) {
// enforce that the first token is BOS
if (n_past == 0 && tokens[0] != llama_token_bos()) {
const int n_vocab = hparams.n_vocab;
const int n_rot = hparams.n_embd/hparams.n_head;
+ const float eps = 5e-6f; // TODO: take from hparams
+
auto & mem_per_token = lctx.mem_per_token;
auto & buf_compute = lctx.buf_compute;
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ggml_cgraph gf = {};
- gf.n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
+ n_threads = N >= 32 && ggml_cpu_has_blas() && !ggml_cpu_has_gpublas() ? 1 : n_threads;
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
ggml_set_name(embd, "embd");
// norm
{
- cur = ggml_rms_norm(ctx0, inpL);
+ cur = ggml_rms_norm(ctx0, inpL, eps);
// cur = cur*attention_norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.layers[il].attention_norm);
{
// norm
{
- cur = ggml_rms_norm(ctx0, inpFF);
+ cur = ggml_rms_norm(ctx0, inpFF, eps);
// cur = cur*ffn_norm(broadcasted)
cur = ggml_mul(ctx0, cur, model.layers[il].ffn_norm);
// norm
{
- inpL = ggml_rms_norm(ctx0, inpL);
+ inpL = ggml_rms_norm(ctx0, inpL, eps);
// inpL = inpL*norm(broadcasted)
inpL = ggml_mul(ctx0, inpL, model.norm);
//inpL = ggml_soft_max_inplace(ctx0, inpL);
// run the computation
- ggml_build_forward_expand(&gf, inpL);
- ggml_graph_compute (ctx0, &gf);
+ ggml_build_forward_expand (&gf, inpL);
+ ggml_graph_compute_with_ctx(ctx0, &gf, n_threads);
#ifdef GGML_PERF
// print timing information per ggml operation (for debugging purposes)
}
struct ggml_cgraph gf = ggml_build_forward(r);
- gf.n_threads = n_threads;
- ggml_graph_compute(lora_ctx, &gf);
+ ggml_graph_compute_with_ctx(lora_ctx, &gf, n_threads);
// we won't need these tensors again, reset the context to save memory
ggml_free(lora_ctx);
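// The pattern shared by these hunks: the thread count no longer lives on the graph
// (gf.n_threads was removed); it is passed when the graph is executed, and
// ggml_graph_compute_with_ctx() takes its work buffer from the given context.
// Minimal illustrative helper (the compute_graph name is an assumption):
static void compute_graph(struct ggml_context * ctx, struct ggml_tensor * result, int n_threads) {
    struct ggml_cgraph gf = ggml_build_forward(result);
    ggml_graph_compute_with_ctx(ctx, &gf, n_threads);
}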
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
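    // no_alloc == true: tensors created in cpy_ctx only describe shape/type and point at
    // caller-provided memory (kout3d->data below); executing the ggml_cpy graph then
    // performs the actual copy out of the kv cache views.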
ggml_cgraph gf{};
- gf.n_threads = 1;
ggml_tensor * kout3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
kout3d->data = out;
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, k3d, kout3d));
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, v3d, vout3d));
- ggml_graph_compute(cpy_ctx, &gf);
+ ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1);
ggml_free(cpy_ctx);
}
ggml_context * cpy_ctx = ggml_init({ sizeof(buffer), buffer, /* no_alloc */ true });
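    // mirror of the save path above: kin3d/vin3d wrap the serialized input buffer and the
    // ggml_cpy graph writes its contents back into the kv cache views when executed.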
ggml_cgraph gf{};
- gf.n_threads = 1;
ggml_tensor * kin3d = ggml_new_tensor_3d(cpy_ctx, kv_self.k->type, n_embd, kv_ntok, n_layer);
kin3d->data = (void *) inp;
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, kin3d, k3d));
ggml_build_forward_expand(&gf, ggml_cpy(cpy_ctx, vin3d, v3d));
- ggml_graph_compute(cpy_ctx, &gf);
+ ggml_graph_compute_with_ctx(cpy_ctx, &gf, 1);
ggml_free(cpy_ctx);
}