parallel : print time of audio boundaries + fix timings

author Georgi Gerganov <redacted>

Sat, 29 Oct 2022 11:08:23 +0000 (14:08 +0300)

committer Georgi Gerganov <redacted>

Sat, 29 Oct 2022 16:37:19 +0000 (19:37 +0300)
author Georgi Gerganov <redacted>
Sat, 29 Oct 2022 11:08:23 +0000 (14:08 +0300)
committer Georgi Gerganov <redacted>
Sat, 29 Oct 2022 16:37:19 +0000 (19:37 +0300)
diff --git a/whisper.cpp b/whisper.cpp

index ff8e1b0e79c1c67c107527e441bbe4cdc10b6b57..235f5592fe40ae070f3a8b6ff4eb8a1033ff361a 100644 (file)
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -1910,14 +1910,19 @@ whisper_vocab::id whisper_sample_timestamp(
      return probs_id[0].second;
  }
  
-static std::string to_timestamp(int64_t t) {
-    int64_t sec = t/100;
-    int64_t msec = t - sec*100;
-    int64_t min = sec/60;
-    sec = sec - min*60;
+//  500 -> 00:00:05.000
+// 6000 -> 00:01:00.000
+std::string to_timestamp(int64_t t, bool comma = false) {
+    int64_t msec = t * 10;
+    int64_t hr = msec / (1000 * 60 * 60);
+    msec = msec - hr * (1000 * 60 * 60);
+    int64_t min = msec / (1000 * 60);
+    msec = msec - min * (1000 * 60);
+    int64_t sec = msec / 1000;
+    msec = msec - sec * 1000;
  
      char buf[32];
-    snprintf(buf, sizeof(buf), "%02d:%02d.%03d", (int) min, (int) sec, (int) msec);
+    snprintf(buf, sizeof(buf), "%02d:%02d:%02d%s%03d", (int) hr, (int) min, (int) sec, comma ? "," : ".", (int) msec);
  
      return std::string(buf);
  }
@@ -2727,24 +2732,45 @@ int whisper_full_parallel(
  
      // combine results into ctx->result_all
      for (int i = 0; i < n_processors - 1; ++i) {
-        auto & result_all = ctxs[i].result_all;
+        auto & results_i = ctxs[i].result_all;
  
-        for (int j = 0; j < (int) result_all.size(); ++j) {
-            result_all[j].t0 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
-            result_all[j].t1 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
+        for (int j = 0; j < (int) results_i.size(); ++j) {
+            // correct the segment timestamp taking into account the offset
+            results_i[j].t0 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
+            results_i[j].t1 += 100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t;
  
+            // make sure that segments are not overlapping
              if (ctx->result_all.size() > 0) {
-                result_all[j].t0 = std::max(result_all[j].t0, ctx->result_all.back().t1);
+                results_i[j].t0 = std::max(results_i[j].t0, ctx->result_all.back().t1);
              }
  
-            ctx->result_all.push_back(std::move(result_all[j]));
+            ctx->result_all.push_back(std::move(results_i[j]));
  
              // call the new_segment_callback for each segment
              if (params.new_segment_callback) {
                  params.new_segment_callback(ctx, params.new_segment_callback_user_data);
              }
          }
+
+        ctx->t_mel_us    += ctxs[i].t_mel_us;
+        ctx->t_sample_us += ctxs[i].t_sample_us;
+        ctx->t_encode_us += ctxs[i].t_encode_us;
+        ctx->t_decode_us += ctxs[i].t_decode_us;
+    }
+
+    // average the timings
+    ctx->t_mel_us    /= n_processors;
+    ctx->t_sample_us /= n_processors;
+    ctx->t_encode_us /= n_processors;
+    ctx->t_decode_us /= n_processors;
+
+    // print information about the audio boundaries
+    fprintf(stderr, "\n");
+    fprintf(stderr, "%s: the audio has been split into %d chunks at the following times:\n", __func__, n_processors);
+    for (int i = 0; i < n_processors - 1; ++i) {
+        fprintf(stderr, "%s: split %d - %s\n", __func__, (i + 1), to_timestamp(100*((i + 1)*n_samples_per_processor)/WHISPER_SAMPLE_RATE + offset_t).c_str());
      }
+    fprintf(stderr, "%s: the transcription quality may be degraded near these boundaries\n", __func__);
  
      return ret;
  }
diff --git a/whisper.h b/whisper.h

index 0016d0d0b664226fcaacf97aeda1f1d92f80c669..cb2d74705a813e3ab6af53c8f8ec439ff77581d2 100644 (file)
--- a/whisper.h
+++ b/whisper.h
@@ -213,6 +213,9 @@ extern "C" {
              const float * samples,
              int n_samples);
  
+    // Split the input audio in chunks and process each chunk separately using whisper_full()
+    // It seems this approach can offer some speedup in some cases.
+    // However, the transcription accuracy can be worse at the beginning and end of each chunk.
      WHISPER_API int whisper_full_parallel(
              struct whisper_context * ctx,
              struct whisper_full_params params,
author	Georgi Gerganov <redacted>
	Sat, 29 Oct 2022 11:08:23 +0000 (14:08 +0300)
committer	Georgi Gerganov <redacted>
	Sat, 29 Oct 2022 16:37:19 +0000 (19:37 +0300)
whisper.cpp		patch | blob | history
whisper.h		patch | blob | history