return probs_id[0].second;
}
// Format a timestamp given in 10-millisecond ticks as "HH:MM:SS.mmm".
//
//   500  -> 00:00:05.000
//   6000 -> 00:01:00.000
//
// t     - timestamp in units of 10 ms (msec = t * 10 below)
// comma - when true, use ',' between seconds and milliseconds
//         (SRT subtitle style) instead of '.'
//
// NOTE(review): assumes t >= 0 — a negative t would produce malformed
// output (negative fields); confirm callers never pass negatives.
std::string to_timestamp(int64_t t, bool comma = false) {
    int64_t msec = t * 10;
    int64_t hr = msec / (1000 * 60 * 60);
    msec = msec - hr * (1000 * 60 * 60);
    int64_t min = msec / (1000 * 60);
    msec = msec - min * (1000 * 60);
    int64_t sec = msec / 1000;
    msec = msec - sec * 1000;

    char buf[32];
    snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);

    return std::string(buf);
}
// combine results into ctx->result_all
for (int i = 0; i < n_processors - 1; ++i) {
- auto & result_all = ctxs[i].result_all;
+ auto & results_i = ctxs[i].result_all;
- for (int j = 0; j < (int) result_all.size(); ++j) {
- result_all[j].t0 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
- result_all[j].t1 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
+ for (int j = 0; j < (int) results_i.size(); ++j) {
+ // correct the segment timestamp taking into account the offset
+ results_i[j].t0 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
+ results_i[j].t1 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
+ // make sure that segments are not overlapping
if (ctx->result_all.size() > 0) {
- result_all[j].t0 = std::max(result_all[j].t0, ctx->result_all.back().t1);
+ results_i[j].t0 = std::max(results_i[j].t0, ctx->result_all.back().t1);
}
- ctx->result_all.push_back(std::move(result_all[j]));
+ ctx->result_all.push_back(std::move(results_i[j]));
// call the new_segment_callback for each segment
if (params.new_segment_callback) {
params.new_segment_callback(ctx, params.new_segment_callback_user_data);
}
}
+
+ ctx->t_mel_us += ctxs[i].t_mel_us;
+ ctx->t_sample_us += ctxs[i].t_sample_us;
+ ctx->t_encode_us += ctxs[i].t_encode_us;
+ ctx->t_decode_us += ctxs[i].t_decode_us;
+ }
+
+ // average the timings
+ ctx->t_mel_us /= n_processors;
+ ctx->t_sample_us /= n_processors;
+ ctx->t_encode_us /= n_processors;
+ ctx->t_decode_us /= n_processors;
+
+ // print information about the audio boundaries
+ fprintf(stderr, "\n");
+ fprintf(stderr, "%s: the audio has been split into %d chunks at the following times:\n", __func__, n_processors);
+ for (int i = 0; i < n_processors - 1; ++i) {
+ fprintf(stderr, "%s: split %d - %s\n", __func__, (i + 1), to_timestamp(100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t).c_str());
}
+ fprintf(stderr, "%s: the transcription quality may be degraded near these boundaries\n", __func__);
return ret;
}