bool flash_attn = false;
bool suppress_nst = false;
bool no_context = false;
+ bool no_language_probabilities = false;
std::string language = "en";
std::string prompt = "";
fprintf(stderr, " -nc, --no-context [%-7s] do not use previous audio context\n", params.no_context ? "true" : "false");
fprintf(stderr, " -ng, --no-gpu [%-7s] do not use gpu\n", params.use_gpu ? "false" : "true");
fprintf(stderr, " -fa, --flash-attn [%-7s] flash attention\n", params.flash_attn ? "true" : "false");
+ fprintf(stderr, " -nlp, --no-language-probabilities [%-7s] exclude language probabilities from verbose_json output\n", params.no_language_probabilities ? "true" : "false");
// Voice Activity Detection (VAD) parameters
fprintf(stderr, "\nVoice Activity Detection (VAD) options:\n");
fprintf(stderr, " --vad [%-7s] enable Voice Activity Detection (VAD)\n", params.vad ? "true" : "false");
else if (arg == "-sns" || arg == "--suppress-nst") { params.suppress_nst = true; }
else if (arg == "-nth" || arg == "--no-speech-thold") { params.no_speech_thold = std::stof(argv[++i]); }
else if (arg == "-nc" || arg == "--no-context") { params.no_context = true; }
+ else if (arg == "-nlp" || arg == "--no-language-probabilities") { params.no_language_probabilities = true; }
// server params
else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); }
{
params.vad_samples_overlap = std::stof(req.get_file_value("vad_samples_overlap").content);
}
+ if (req.has_file("no_language_probabilities"))
+ {
+ params.no_language_probabilities = parse_str_to_bool(req.get_file_value("no_language_probabilities").content);
+ }
}
} // namespace
} else if (params.response_format == vjson_format) {
/* try to match openai/whisper's Python format */
std::string results = output_str(ctx, params, pcmf32s);
- // Get language probabilities
- std::vector<float> lang_probs(whisper_lang_max_id() + 1, 0.0f);
- const auto detected_lang_id = whisper_lang_auto_detect(ctx, 0, params.n_threads, lang_probs.data());
json jres = json{
{"task", params.translate ? "translate" : "transcribe"},
{"language", whisper_lang_str_full(whisper_full_lang_id(ctx))},
{"duration", float(pcmf32.size())/WHISPER_SAMPLE_RATE},
{"text", results},
- {"segments", json::array()},
- {"detected_language", whisper_lang_str_full(detected_lang_id)},
- {"detected_language_probability", lang_probs[detected_lang_id]},
- {"language_probabilities", json::object()}
+ {"segments", json::array()}
};
- // Add all language probabilities
- for (int i = 0; i <= whisper_lang_max_id(); ++i) {
- if (lang_probs[i] > 0.001f) { // Only include non-negligible probabilities
- jres["language_probabilities"][whisper_lang_str(i)] = lang_probs[i];
+ // Only compute language probabilities if requested (expensive operation)
+ if (!params.no_language_probabilities) {
+ std::vector<float> lang_probs(whisper_lang_max_id() + 1, 0.0f);
+ const auto detected_lang_id = whisper_lang_auto_detect(ctx, 0, params.n_threads, lang_probs.data());
+ jres["detected_language"] = whisper_lang_str_full(detected_lang_id);
+ jres["detected_language_probability"] = lang_probs[detected_lang_id];
+ jres["language_probabilities"] = json::object();
+ // Add all language probabilities
+ for (int i = 0; i <= whisper_lang_max_id(); ++i) {
+ if (lang_probs[i] > 0.001f) { // Only include non-negligible probabilities
+ jres["language_probabilities"][whisper_lang_str(i)] = lang_probs[i];
+ }
}
}
const int n_segments = whisper_full_n_segments(ctx);