}\r
\r
/** Flag to suppress non-speech tokens. */\r
- public CBool suppress_non_speech_tokens;\r
+ public CBool suppress_nst;\r
\r
/** Enables or disables suppression of non-speech tokens. */
public void suppressNonSpeechTokens(boolean enable) {\r
- suppress_non_speech_tokens = enable ? CBool.TRUE : CBool.FALSE;\r
+ suppress_nst = enable ? CBool.TRUE : CBool.FALSE;\r
}\r
\r
/** Initial decoding temperature. */\r
"print_special", "print_progress", "print_realtime", "print_timestamps", "token_timestamps",\r
"thold_pt", "thold_ptsum", "max_len", "split_on_word", "max_tokens", "audio_ctx",\r
"tdrz_enable", "suppress_regex", "initial_prompt", "prompt_tokens", "prompt_n_tokens", "language", "detect_language",\r
- "suppress_blank", "suppress_non_speech_tokens", "temperature", "max_initial_ts", "length_penalty",\r
+ "suppress_blank", "suppress_nst", "temperature", "max_initial_ts", "length_penalty",\r
"temperature_inc", "entropy_thold", "logprob_thold", "no_speech_thold", "greedy", "beam_search",\r
"new_segment_callback", "new_segment_callback_user_data",\r
"progress_callback", "progress_callback_user_data",\r
}
/*
* call-seq:
- * suppress_non_speech_tokens = force_suppress -> force_suppress
+ * suppress_nst = force_suppress -> force_suppress
*/
-static VALUE ruby_whisper_params_set_suppress_non_speech_tokens(VALUE self, VALUE value) {
- BOOL_PARAMS_SETTER(self, suppress_non_speech_tokens, value)
+static VALUE ruby_whisper_params_set_suppress_nst(VALUE self, VALUE value) {
+ BOOL_PARAMS_SETTER(self, suppress_nst, value)
}
/*
* If true, suppresses non-speech tokens.
*
* call-seq:
- * suppress_non_speech_tokens -> bool
+ * suppress_nst -> bool
*/
-static VALUE ruby_whisper_params_get_suppress_non_speech_tokens(VALUE self) {
- BOOL_PARAMS_GETTER(self, suppress_non_speech_tokens)
+static VALUE ruby_whisper_params_get_suppress_nst(VALUE self) {
+ BOOL_PARAMS_GETTER(self, suppress_nst)
}
/*
* If true, enables token-level timestamps.
rb_define_method(cParams, "print_timestamps=", ruby_whisper_params_set_print_timestamps, 1);
rb_define_method(cParams, "suppress_blank", ruby_whisper_params_get_suppress_blank, 0);
rb_define_method(cParams, "suppress_blank=", ruby_whisper_params_set_suppress_blank, 1);
- rb_define_method(cParams, "suppress_non_speech_tokens", ruby_whisper_params_get_suppress_non_speech_tokens, 0);
- rb_define_method(cParams, "suppress_non_speech_tokens=", ruby_whisper_params_set_suppress_non_speech_tokens, 1);
+ rb_define_method(cParams, "suppress_nst", ruby_whisper_params_get_suppress_nst, 0);
+ rb_define_method(cParams, "suppress_nst=", ruby_whisper_params_set_suppress_nst, 1);
rb_define_method(cParams, "token_timestamps", ruby_whisper_params_get_token_timestamps, 0);
rb_define_method(cParams, "token_timestamps=", ruby_whisper_params_set_token_timestamps, 1);
rb_define_method(cParams, "split_on_word", ruby_whisper_params_get_split_on_word, 0);
assert !@params.suppress_blank
end
- def test_suppress_non_speech_tokens
- @params.suppress_non_speech_tokens = true
- assert @params.suppress_non_speech_tokens
- @params.suppress_non_speech_tokens = false
- assert !@params.suppress_non_speech_tokens
+ def test_suppress_nst
+ @params.suppress_nst = true
+ assert @params.suppress_nst
+ @params.suppress_nst = false
+ assert !@params.suppress_nst
end
def test_token_timestamps
wparams.n_threads = params.n_threads;
wparams.audio_ctx = params.audio_ctx;
- wparams.suppress_non_speech_tokens = true;
+ wparams.suppress_nst = true;
// run the transformer and a single decoding pass
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
fprintf(stderr, "%s: ERROR: whisper_full() failed\n", __func__);
wparams.prompt_tokens = cs.prompt_tokens.data();
wparams.prompt_n_tokens = cs.prompt_tokens.size();
// TODO: properly expose as option
- wparams.suppress_non_speech_tokens = true;
+ wparams.suppress_nst = true;
// run the transformer and a single decoding pass
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
bool no_timestamps = false;
bool use_gpu = true;
bool flash_attn = false;
- bool suppress_non_speech_tokens = false;
+ bool suppress_nst = false;
std::string language = "en";
std::string prompt = "";
fprintf(stderr, " --request-path PATH, [%-7s] Request path for all requests\n", sparams.request_path.c_str());
fprintf(stderr, " --inference-path PATH, [%-7s] Inference path for all requests\n", sparams.inference_path.c_str());
fprintf(stderr, " --convert, [%-7s] Convert audio to WAV, requires ffmpeg on the server", sparams.ffmpeg_converter ? "true" : "false");
- fprintf(stderr, " -sns, --suppress-non-speech [%-7s] suppress non-speech tokens\n", params.suppress_non_speech_tokens ? "true" : "false");
+ fprintf(stderr, " -sns, --suppress-nst [%-7s] suppress non-speech tokens\n", params.suppress_nst ? "true" : "false");
fprintf(stderr, "\n");
}
else if (arg == "-dtw" || arg == "--dtw") { params.dtw = argv[++i]; }
else if (arg == "-ng" || arg == "--no-gpu") { params.use_gpu = false; }
else if (arg == "-fa" || arg == "--flash-attn") { params.flash_attn = true; }
- else if (arg == "-sns" || arg == "--suppress-non-speech") { params.suppress_non_speech_tokens = true; }
+ else if (arg == "-sns" || arg == "--suppress-nst") { params.suppress_nst = true; }
// server params
else if ( arg == "--port") { sparams.port = std::stoi(argv[++i]); }
else if ( arg == "--host") { sparams.hostname = argv[++i]; }
}
if (req.has_file("suppress_non_speech"))
{
- params.suppress_non_speech_tokens = parse_str_to_bool(req.get_file_value("suppress_non_speech").content);
+ params.suppress_nst = parse_str_to_bool(req.get_file_value("suppress_non_speech").content);
+ }
+ if (req.has_file("suppress_nst"))
+ {
+ params.suppress_nst = parse_str_to_bool(req.get_file_value("suppress_nst").content);
}
}
wparams.no_timestamps = params.no_timestamps;
wparams.token_timestamps = !params.no_timestamps && params.response_format == vjson_format;
- wparams.suppress_non_speech_tokens = params.suppress_non_speech_tokens;
+ wparams.suppress_nst = params.suppress_nst;
whisper_print_user_data user_data = { &params, &pcmf32s, 0 };
bool detect_language;
// common decoding parameters:
- bool suppress_blank; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
- bool suppress_non_speech_tokens; // ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
+ bool suppress_blank; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L89
+ bool suppress_nst; // non-speech tokens, ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
float temperature; // initial decoding temperature, ref: https://ai.stackexchange.com/a/32478
float max_initial_ts; // ref: https://github.com/openai/whisper/blob/f82bc59f5ea234d4b97fb2860842ed38519f7e65/whisper/decoding.py#L97
/*.detect_language =*/ false,
/*.suppress_blank =*/ true,
- /*.suppress_non_speech_tokens =*/ false,
+ /*.suppress_nst =*/ false,
/*.temperature =*/ 0.0f,
/*.max_initial_ts =*/ 1.0f,
// suppress non-speech tokens
// ref: https://github.com/openai/whisper/blob/7858aa9c08d98f75575035ecd6481f462d66ca27/whisper/tokenizer.py#L224-L253
- if (params.suppress_non_speech_tokens) {
+ if (params.suppress_nst) {
for (const std::string & token : non_speech_tokens) {
const std::string suppress_tokens[] = {token, " " + token};
for (const std::string & suppress_token : suppress_tokens) {