bool output_wts = false;
bool output_csv = false;
bool output_jsn = false;
+ bool output_jsn_full = false;
bool output_lrc = false;
bool print_special = false;
bool print_colors = false;
else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
else if (arg == "-oj" || arg == "--output-json") { params.output_jsn = true; }
+ else if (arg == "-ojf" || arg == "--output-json-full"){ params.output_jsn_full = params.output_jsn = true; }
else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
else if (arg == "-pc" || arg == "--print-colors") { params.print_colors = true; }
fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str());
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
fprintf(stderr, " -oj, --output-json [%-7s] output result in a JSON file\n", params.output_jsn ? "true" : "false");
+ fprintf(stderr, " -ojf, --output-json-full [%-7s] include more information in the JSON file\n", params.output_jsn_full ? "true" : "false");
fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
fprintf(stderr, " -pc, --print-colors [%-7s] print colors\n", params.print_colors ? "true" : "false");
return true;
}
-bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params, std::vector<std::vector<float>> pcmf32s) {
+bool output_json(
+ struct whisper_context * ctx,
+ const char * fname,
+ const whisper_params & params,
+ std::vector<std::vector<float>> pcmf32s,
+ bool full) {
std::ofstream fout(fname);
int indent = 0;
auto end_arr = [&](bool end) {
indent--;
doindent();
- fout << (end ? "]\n" : "},\n");
+ fout << (end ? "]\n" : "],\n");
};
auto start_obj = [&](const char *name) {
end_value(end);
};
+ auto value_f = [&](const char *name, const float val, bool end) { // write one float-valued JSON member called `name` to fout
+ start_value(name);
+ fout << val; // default ostream float formatting; NOTE(review): NaN/inf would not be valid JSON numbers - confirm val is always finite
+ end_value(end); // `end` is forwarded unchanged; presumably it marks the last member so no separator is written - confirm against end_value
+ };
+
auto value_b = [&](const char *name, const bool val, bool end) {
start_value(name);
fout << (val ? "true" : "false");
end_value(end);
};
+ auto times_o = [&](int64_t t0, int64_t t1, bool end) { // write the "timestamps" and "offsets" objects describing the span t0..t1
+ start_obj("timestamps");
+ value_s("from", to_timestamp(t0, true).c_str(), false); // string form of t0 as produced by to_timestamp
+ value_s("to", to_timestamp(t1, true).c_str(), true);
+ end_obj(false); // end=false here because the "offsets" object is always written next
+ start_obj("offsets");
+ value_i("from", t0 * 10, false); // integer form; presumably t0/t1 are in 10 ms units so this is milliseconds - TODO confirm
+ value_i("to", t1 * 10, true);
+ end_obj(end); // the caller's `end` flag is applied to the closing of "offsets"
+ };
+
if (!fout.is_open()) {
fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
return false;
const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
start_obj(nullptr);
- start_obj("timestamps");
- value_s("from", to_timestamp(t0, true).c_str(), false);
- value_s("to", to_timestamp(t1, true).c_str(), true);
- end_obj(false);
- start_obj("offsets");
- value_i("from", t0 * 10, false);
- value_i("to", t1 * 10, true);
- end_obj(false);
- value_s("text", text, !params.diarize && !params.tinydiarize);
+ times_o(t0, t1, false);
+ value_s("text", text, !params.diarize && !params.tinydiarize && !full);
+
+ if (full) { // full output requested: additionally write a "tokens" array for segment i
+ start_arr("tokens");
+ const int n = whisper_full_n_tokens(ctx, i); // token count for segment i
+ for (int j = 0; j < n; ++j) {
+ auto token = whisper_full_get_token_data(ctx, i, j);
+ start_obj(nullptr); // nullptr name; presumably opens an unnamed array element - confirm against start_obj
+ value_s("text", whisper_token_to_str(ctx, token.id), false);
+ if(token.t0 > -1 && token.t1 > -1) {
+ // If we have per-token timestamps, write them out
+ times_o(token.t0, token.t1, false);
+ }
+ value_i("id", token.id, false);
+ value_f("p", token.p, true); // "p" is written as the final member of each token object
+ end_obj(j == (n - 1)); // end flag is true only for the final token of the segment
+ }
+ end_arr(!params.diarize && !params.tinydiarize); // end flag is false when either diarize option is set, since a "speaker" member may be written afterwards
+ }
if (params.diarize && pcmf32s.size() == 2) {
value_s("speaker", estimate_diarization_speaker(pcmf32s, t0, t1, true).c_str(), true);
wparams.offset_ms = params.offset_t_ms;
wparams.duration_ms = params.duration_ms;
- wparams.token_timestamps = params.output_wts || params.max_len > 0;
+ wparams.token_timestamps = params.output_wts || params.output_jsn_full || params.max_len > 0;
wparams.thold_pt = params.word_thold;
wparams.max_len = params.output_wts && params.max_len == 0 ? 60 : params.max_len;
wparams.split_on_word = params.split_on_word;
// output to JSON file
if (params.output_jsn) {
const auto fname_jsn = fname_out + ".json";
- output_json(ctx, fname_jsn.c_str(), params, pcmf32s);
+ output_json(ctx, fname_jsn.c_str(), params, pcmf32s, params.output_jsn_full);
}
// output to LRC file