int32_t n_processors = 1;
int32_t offset_t_ms = 0;
int32_t offset_n = 0;
+ int32_t duration_ms = 0;
int32_t max_context = -1;
int32_t max_len = 0;
params.offset_t_ms = std::stoi(argv[++i]);
} else if (arg == "-on" || arg == "--offset-n") {
params.offset_n = std::stoi(argv[++i]);
+ } else if (arg == "-d" || arg == "--duration") {
+ params.duration_ms = std::stoi(argv[++i]);
} else if (arg == "-mc" || arg == "--max-context") {
params.max_context = std::stoi(argv[++i]);
} else if (arg == "-ml" || arg == "--max-len") {
fprintf(stderr, " -p N, --processors N number of processors to use during computation (default: %d)\n", params.n_processors);
fprintf(stderr, " -ot N, --offset-t N time offset in milliseconds (default: %d)\n", params.offset_t_ms);
fprintf(stderr, " -on N, --offset-n N segment index offset (default: %d)\n", params.offset_n);
+ fprintf(stderr, " -d N, --duration N duration of audio to process in milliseconds (default: %d)\n", params.duration_ms);
fprintf(stderr, " -mc N, --max-context N maximum number of text context tokens to store (default: max)\n");
fprintf(stderr, " -ml N, --max-len N maximum segment length in characters (default: %d)\n", params.max_len);
fprintf(stderr, " -wt N, --word-thold N word timestamp probability threshold (default: %f)\n", params.word_thold);
wparams.n_threads = params.n_threads;
wparams.n_max_text_ctx = params.max_context >= 0 ? params.max_context : wparams.n_max_text_ctx;
wparams.offset_ms = params.offset_t_ms;
+ wparams.duration_ms = params.duration_ms;
wparams.token_timestamps = params.output_wts || params.max_len > 0;
wparams.thold_pt = params.word_thold;
/*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
/*.n_max_text_ctx =*/ 16384,
/*.offset_ms =*/ 0,
+ /*.duration_ms =*/ 0,
/*.translate =*/ false,
/*.no_context =*/ false,
/*.n_threads =*/ std::min(4, (int32_t) std::thread::hardware_concurrency()),
/*.n_max_text_ctx =*/ 16384,
/*.offset_ms =*/ 0,
+ /*.duration_ms =*/ 0,
/*.translate =*/ false,
/*.no_context =*/ false,
}
const int seek_start = params.offset_ms/10;
+ // clamp to the spectrogram length: with a non-zero offset (or a duration longer than the
+ // remaining audio) the unclamped sum would point past the end of the available data
+ const int seek_end = std::min<int>(whisper_n_len(ctx), seek_start + (params.duration_ms == 0 ? whisper_n_len(ctx) : params.duration_ms/10));
// if length of spectrogram is less than 1s (100 samples), then return
// basically don't process anything that is less than 1s
// see issue #39: https://github.com/ggerganov/whisper.cpp/issues/39
- if (whisper_n_len(ctx) < 100 + seek_start) {
+ if (seek_end < 100 + seek_start) {
return 0;
}
// main loop
int seek = seek_start;
while (true) {
- int progress_cur = (100*seek)/whisper_n_len(ctx);
+ const int progress_cur = (100*(seek - seek_start))/(seek_end - seek_start);
while (progress_cur >= progress_prev + progress_step) {
progress_prev += progress_step;
if (params.print_progress) {
}
}
- if (seek + 100 >= whisper_n_len(ctx)) {
+ if (seek + 100 >= seek_end) {
break;
}
// end of text token
if (token.id == whisper_token_eot(ctx)) {
if (result_len == 0) {
- if (seek + seek_delta + 100 >= whisper_n_len(ctx)) {
+ if (seek + seek_delta + 100 >= seek_end) {
result_len = i + 1;
} else {
// TODO: figure out how to resolve this