std::vector<float> pcmf32(n_samples_30s, 0.0f);
std::vector<float> pcmf32_old;
+ std::vector<whisper_token> prompt_tokens;
const int n_new_line = params.length_ms / params.step_ms - 1;
// print some info about the processing
wparams.audio_ctx = params.audio_ctx;
wparams.speed_up = params.speed_up;
+ wparams.prompt_tokens = prompt_tokens.data();
+ wparams.prompt_n_tokens = prompt_tokens.size();
+
if (whisper_full(ctx, wparams, pcmf32.data(), pcmf32.size()) != 0) {
fprintf(stderr, "%s: failed to process audio\n", argv[0]);
return 6;
// keep part of the audio for next iteration to try to mitigate word boundary issues
pcmf32_old = std::vector<float>(pcmf32.end() - n_samples_keep, pcmf32.end());
+
+ // Rebuild the prompt from the token ids of all segments currently held in ctx (fed back through wparams.prompt_tokens)
+ prompt_tokens.clear();
+ const int n_segments = whisper_full_n_segments(ctx);
+ for (int i = 0; i < n_segments; ++i) {
+ const int token_count = whisper_full_n_tokens(ctx, i);
+ for (int j = 0; j < token_count; ++j) {
+ prompt_tokens.push_back(whisper_full_get_token_id(ctx, i, j));
+ }
+ }
}
}
}
/*.speed_up =*/ false,
/*.audio_ctx =*/ 0,
+ /*.prompt_tokens =*/ nullptr,
+ /*.prompt_n_tokens =*/ 0,
+
/*.language =*/ "en",
/*.greedy =*/ {
/*.speed_up =*/ false,
/*.audio_ctx =*/ 0,
+ /*.prompt_tokens =*/ nullptr,
+ /*.prompt_n_tokens =*/ 0,
+
/*.language =*/ "en",
/*.greedy =*/ {
prompt_past.clear();
}
+ // Prepend the prompt tokens to the prompt_past
+ if (params.prompt_tokens && params.prompt_n_tokens > 0) {
+ // prompt_tokens is a raw array of prompt_n_tokens entries: append them, then rotate those entries to the front
+ for (int i = 0; i < params.prompt_n_tokens; i++) {
+ prompt_past.push_back(params.prompt_tokens[i]);
+ }
+ std::rotate(prompt_past.begin(), prompt_past.end() - params.prompt_n_tokens, prompt_past.end());
+ }
+
// overwrite audio_ctx
ctx->exp_n_audio_ctx = params.audio_ctx;
bool speed_up; // speed-up the audio by 2x using Phase Vocoder
int audio_ctx; // overwrite the audio context size (0 = use default)
+ // tokens to provide the whisper model as initial prompt: pointer to an array of prompt_n_tokens entries (e.g. std::vector::data())
+ const whisper_token * prompt_tokens;
+ int prompt_n_tokens;
+
const char * language;
struct {