main : provide option for creating JSON output (#615)

author Leo Moll <redacted>

Wed, 22 Mar 2023 19:37:36 +0000 (20:37 +0100)

committer GitHub <redacted>

Wed, 22 Mar 2023 19:37:36 +0000 (21:37 +0200)
author Leo Moll <redacted>
Wed, 22 Mar 2023 19:37:36 +0000 (20:37 +0100)
committer GitHub <redacted>
Wed, 22 Mar 2023 19:37:36 +0000 (21:37 +0200)
diff --git a/examples/main/README.md b/examples/main/README.md

index 2af2002880ab90e6af45ea61fee64c712b8fcdde..68a3e3b546ce022f2be279d0cb196d051c0a7471 100644 (file)
--- a/examples/main/README.md
+++ b/examples/main/README.md
@@ -31,6 +31,7 @@ options:
    -osrt,     --output-srt        [false  ] output result in a srt file\r
    -owts,     --output-words      [false  ] output script for generating karaoke video\r
    -ocsv,     --output-csv        [false  ] output result in a CSV file\r
+  -oj,       --output-json       [false  ] output result in a JSON file\r
    -of FNAME, --output-file FNAME [       ] output file path (without file extension)\r
    -ps,       --print-special     [false  ] print special tokens\r
    -pc,       --print-colors      [false  ] print colors\r
diff --git a/examples/main/main.cpp b/examples/main/main.cpp

index 4118989b232bf1424d63c3b5ab22dbcea856e51f..dd30ba4c473766169e10594283c1e33a66483891 100644 (file)
--- a/examples/main/main.cpp
+++ b/examples/main/main.cpp
@@ -73,6 +73,7 @@ struct whisper_params {
      bool output_srt     = false;
      bool output_wts     = false;
      bool output_csv     = false;
+    bool output_jsn     = false;
      bool print_special  = false;
      bool print_colors   = false;
      bool print_progress = false;
@@ -130,6 +131,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params) {
          else if (arg == "-owts" || arg == "--output-words")   { params.output_wts     = true; }
          else if (arg == "-fp"   || arg == "--font-path")      { params.font_path      = argv[++i]; }
          else if (arg == "-ocsv" || arg == "--output-csv")     { params.output_csv     = true; }
+        else if (arg == "-oj"   || arg == "--output-json")    { params.output_jsn     = true; }
          else if (arg == "-of"   || arg == "--output-file")    { params.fname_out.emplace_back(argv[++i]); }
          else if (arg == "-ps"   || arg == "--print-special")  { params.print_special  = true; }
          else if (arg == "-pc"   || arg == "--print-colors")   { params.print_colors   = true; }
@@ -178,6 +180,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
      fprintf(stderr, "  -owts,     --output-words      [%-7s] output script for generating karaoke video\n",     params.output_wts ? "true" : "false");
      fprintf(stderr, "  -fp,       --font-path         [%-7s] path to a monospace font for karaoke video\n",     params.font_path.c_str());
      fprintf(stderr, "  -ocsv,     --output-csv        [%-7s] output result in a CSV file\n",                    params.output_csv ? "true" : "false");
+    fprintf(stderr, "  -oj,       --output-json       [%-7s] output result in a JSON file\n",                   params.output_jsn ? "true" : "false");
      fprintf(stderr, "  -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n",      "");
      fprintf(stderr, "  -ps,       --print-special     [%-7s] print special tokens\n",                           params.print_special ? "true" : "false");
      fprintf(stderr, "  -pc,       --print-colors      [%-7s] print colors\n",                                   params.print_colors ? "true" : "false");
@@ -368,6 +371,129 @@ bool output_csv(struct whisper_context * ctx, const char * fname) {
      return true;
  }
  
+// Writes model metadata, selected run parameters, the detected language and
+// every transcribed segment held by `ctx` to `fname` as an indented JSON file.
+//
+//   ctx    - whisper context queried through the whisper_* API
+//   fname  - path of the file to create
+//   params - supplies the "params" object (model, language, translate)
+//
+// Returns false if the file cannot be opened for writing, true otherwise.
+bool output_json(struct whisper_context * ctx, const char * fname, const whisper_params & params) {
+    std::ofstream fout(fname);
+    if (!fout.is_open()) {
+        fprintf(stderr, "%s: failed to open '%s' for writing\n", __func__, fname);
+        return false;
+    }
+
+    // current nesting depth; one tab is written per level
+    int indent = 0;
+
+    auto doindent = [&]() {
+        for (int i = 0; i < indent; i++) fout << "\t";
+    };
+
+    // Escapes a C string so it can be placed between double quotes in JSON:
+    // quotes, backslashes and control characters (< 0x20) are escaped.
+    // A null pointer is treated as an empty string, because streaming a null
+    // char pointer is undefined behaviour.
+    auto escape = [](const char *str) {
+        std::string out;
+        for (const char *p = (str ? str : ""); *p != '\0'; ++p) {
+            const unsigned char c = static_cast<unsigned char>(*p);
+            switch (c) {
+                case '"':  out += "\\\""; break;
+                case '\\': out += "\\\\"; break;
+                case '\n': out += "\\n";  break;
+                case '\r': out += "\\r";  break;
+                case '\t': out += "\\t";  break;
+                default:
+                    if (c < 0x20) {
+                        char buf[8];
+                        snprintf(buf, sizeof(buf), "\\u%04x", c);
+                        out += buf;
+                    } else {
+                        out += *p;
+                    }
+                    break;
+            }
+        }
+        return out;
+    };
+
+    // In all helpers below, `end == true` means the item is the last one in
+    // its enclosing container, so no trailing comma is written.
+
+    auto start_arr = [&](const char *name) {
+        doindent();
+        fout << "\"" << name << "\": [\n";
+        indent++;
+    };
+
+    auto end_arr = [&](bool end = false) {
+        indent--;
+        doindent();
+        // an array must be closed with ']' in both cases
+        fout << (end ? "]\n" : "],\n");
+    };
+
+    // name == nullptr opens an anonymous object (root or array element)
+    auto start_obj = [&](const char *name = nullptr) {
+        doindent();
+        if (name) {
+            fout << "\"" << name << "\": {\n";
+        } else {
+            fout << "{\n";
+        }
+        indent++;
+    };
+
+    auto end_obj = [&](bool end = false) {
+        indent--;
+        doindent();
+        fout << (end ? "}\n" : "},\n");
+    };
+
+    auto start_value = [&](const char *name) {
+        doindent();
+        fout << "\"" << name << "\": ";
+    };
+
+    auto end_value = [&](bool end = false) {
+        fout << (end ? "\n" : ",\n");
+    };
+
+    // string member; the value is escaped so the output stays valid JSON
+    auto value_s = [&](const char *name, const char *val, bool end = false) {
+        start_value(name);
+        fout << "\"" << escape(val) << (end ? "\"\n" : "\",\n");
+    };
+
+    // integer member
+    auto value_i = [&](const char *name, const int64_t val, bool end = false) {
+        start_value(name);
+        fout << val;
+        end_value(end);
+    };
+
+    // boolean member
+    auto value_b = [&](const char *name, const bool val, bool end = false) {
+        start_value(name);
+        fout << (val ? "true" : "false");
+        end_value(end);
+    };
+
+    fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
+
+    // the extra indentation of the calls below mirrors the JSON nesting
+    start_obj();
+        value_s("systeminfo", whisper_print_system_info());
+        start_obj("model");
+            value_s("type", whisper_model_type_readable(ctx));
+            value_b("multilingual", whisper_is_multilingual(ctx));
+            value_i("vocab", whisper_model_n_vocab(ctx));
+            start_obj("audio");
+                value_i("ctx", whisper_model_n_audio_ctx(ctx));
+                value_i("state", whisper_model_n_audio_state(ctx));
+                value_i("head", whisper_model_n_audio_head(ctx));
+                value_i("layer", whisper_model_n_audio_layer(ctx), true);
+            end_obj();
+            start_obj("text");
+                value_i("ctx", whisper_model_n_text_ctx(ctx));
+                value_i("state", whisper_model_n_text_state(ctx));
+                value_i("head", whisper_model_n_text_head(ctx));
+                value_i("layer", whisper_model_n_text_layer(ctx), true);
+            end_obj();
+            value_i("mels", whisper_model_n_mels(ctx));
+            value_i("f16", whisper_model_f16(ctx), true);
+        end_obj();
+        start_obj("params");
+            value_s("model", params.model.c_str());
+            value_s("language", params.language.c_str());
+            value_b("translate", params.translate, true);
+        end_obj();
+        start_obj("result");
+            value_s("language", whisper_lang_str(whisper_full_lang_id(ctx)), true);
+        end_obj();
+        start_arr("transcription");
+
+            const int n_segments = whisper_full_n_segments(ctx);
+            for (int i = 0; i < n_segments; ++i) {
+                const char * text = whisper_full_get_segment_text(ctx, i);
+                const int64_t t0 = whisper_full_get_segment_t0(ctx, i);
+                const int64_t t1 = whisper_full_get_segment_t1(ctx, i);
+
+                start_obj();
+                    start_obj("timestamps");
+                        value_s("from", to_timestamp(t0, true).c_str());
+                        value_s("to", to_timestamp(t1, true).c_str(), true);
+                    end_obj();
+                    start_obj("offsets");
+                        // NOTE(review): the factor 10 presumably converts the
+                        // segment time unit to milliseconds - confirm
+                        value_i("from", t0 * 10);
+                        value_i("to", t1 * 10, true);
+                    end_obj();
+                    value_s("text", text, true);
+                // the last segment must not be followed by a comma
+                end_obj(i == (n_segments - 1));
+            }
+
+        end_arr(true);
+    end_obj(true);
+    return true;
+}
+
  // karaoke video generation
  // outputs a bash script that uses ffmpeg to generate a video with the subtitles
  // TODO: font parameter adjustments
@@ -662,6 +788,12 @@ int main(int argc, char ** argv) {
                  const auto fname_csv = fname_out + ".csv";
                  output_csv(ctx, fname_csv.c_str());
              }
+
+            // output to JSON file
+            if (params.output_jsn) {
+                const auto fname_jsn = fname_out + ".json";
+                output_json(ctx, fname_jsn.c_str(), params);
+            }
          }
      }
  
diff --git a/whisper.cpp b/whisper.cpp

index bee1c258783b5a45cc7a62cd87f12ed4fa30c5dc..4d0245b6bbeb38ad80d8389f3951c8f4bdab02a3 100644 (file)
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -1408,7 +1408,7 @@ static bool whisper_encode_internal(
      //}
  
      static int iter = 0;
-    
+
      const size_t e_pe_stride = model.e_pe->ne[0]*ggml_element_size(model.e_pe);
      const size_t e_pe_offset = model.e_pe->ne[0]*ggml_element_size(model.e_pe)*n_ctx*iter;
  
@@ -2919,6 +2919,71 @@ int whisper_lang_auto_detect(
      return whisper_lang_auto_detect_with_state(ctx, ctx->state, offset_ms, n_threads, lang_probs);
  }
  
+// Returns the n_vocab hyperparameter of the model held by ctx.
+int whisper_model_n_vocab(struct whisper_context * ctx) {
+    const auto & hparams = ctx->model.hparams;
+    return hparams.n_vocab;
+}
+
+// Returns the n_audio_ctx hyperparameter of the model held by ctx.
+int whisper_model_n_audio_ctx(struct whisper_context * ctx) {
+    const auto & hparams = ctx->model.hparams;
+    return hparams.n_audio_ctx;
+}
+
+// Returns the n_audio_state hyperparameter of the model held by ctx.
+int whisper_model_n_audio_state(struct whisper_context * ctx) {
+    const auto & hparams = ctx->model.hparams;
+    return hparams.n_audio_state;
+}
+
+// Returns the n_audio_head hyperparameter of the model held by ctx.
+int whisper_model_n_audio_head(struct whisper_context * ctx) {
+    const auto & hparams = ctx->model.hparams;
+    return hparams.n_audio_head;
+}
+
+// Returns the n_audio_layer hyperparameter of the model held by ctx.
+int whisper_model_n_audio_layer(struct whisper_context * ctx) {
+    const auto & hparams = ctx->model.hparams;
+    return hparams.n_audio_layer;
+}
+
+// Returns the n_text_ctx hyperparameter of the model held by ctx.
+int whisper_model_n_text_ctx(struct whisper_context * ctx) {
+    const auto & hparams = ctx->model.hparams;
+    return hparams.n_text_ctx;
+}
+
+// Returns the n_text_state hyperparameter of the model held by ctx.
+int whisper_model_n_text_state(struct whisper_context * ctx) {
+    const auto & hparams = ctx->model.hparams;
+    return hparams.n_text_state;
+}
+
+// Returns the n_text_head hyperparameter of the model held by ctx.
+int whisper_model_n_text_head(struct whisper_context * ctx) {
+    const auto & hparams = ctx->model.hparams;
+    return hparams.n_text_head;
+}
+
+// Returns the n_text_layer hyperparameter of the model held by ctx.
+int whisper_model_n_text_layer(struct whisper_context * ctx) {
+    const auto & hparams = ctx->model.hparams;
+    return hparams.n_text_layer;
+}
+
+// Returns the n_mels hyperparameter of the model held by ctx.
+int whisper_model_n_mels(struct whisper_context * ctx) {
+    const auto & hparams = ctx->model.hparams;
+    return hparams.n_mels;
+}
+
+// Returns the f16 hyperparameter of the model held by ctx.
+int whisper_model_f16(struct whisper_context * ctx) {
+    const auto & hparams = ctx->model.hparams;
+    return hparams.f16;
+}
+
+// Returns the type field of the model held by ctx, converted to int.
+int whisper_model_type(struct whisper_context * ctx) {
+    const auto & model = ctx->model;
+    return model.type;
+}
+
+// Maps the type of the model held by ctx to a short lowercase name.
+// Returns "tiny", "base", "small", "medium" or "large"; any other type
+// yields "unknown". The returned pointer refers to a string literal.
+const char *whisper_model_type_readable(struct whisper_context * ctx) {
+    const auto type = ctx->model.type;
+
+    if (type == e_model::MODEL_TINY)   { return "tiny";   }
+    if (type == e_model::MODEL_BASE)   { return "base";   }
+    if (type == e_model::MODEL_SMALL)  { return "small";  }
+    if (type == e_model::MODEL_MEDIUM) { return "medium"; }
+    if (type == e_model::MODEL_LARGE)  { return "large";  }
+
+    return "unknown";
+}
+
  int whisper_n_len_from_state(struct whisper_state * state) {
      return state->mel.n_len;
  }
diff --git a/whisper.h b/whisper.h

index 0a8270db941d0871023050aaeab8ee2922e90f52..fc107108ad810da73e2fc4d8a0339e73bccf749e 100644 (file)
--- a/whisper.h
+++ b/whisper.h
@@ -248,6 +248,19 @@ extern "C" {
      WHISPER_API int whisper_n_audio_ctx     (struct whisper_context * ctx);
      WHISPER_API int whisper_is_multilingual (struct whisper_context * ctx);
  
+    // Model introspection. Each function returns one field of the model held by ctx:
+    // the whisper_model_n_* and whisper_model_f16 functions return the matching
+    // hyperparameter (model.hparams.*), whisper_model_type returns model.type as an int
+    WHISPER_API int whisper_model_n_vocab      (struct whisper_context * ctx);
+    WHISPER_API int whisper_model_n_audio_ctx  (struct whisper_context * ctx);
+    WHISPER_API int whisper_model_n_audio_state(struct whisper_context * ctx);
+    WHISPER_API int whisper_model_n_audio_head (struct whisper_context * ctx);
+    WHISPER_API int whisper_model_n_audio_layer(struct whisper_context * ctx);
+    WHISPER_API int whisper_model_n_text_ctx   (struct whisper_context * ctx);
+    WHISPER_API int whisper_model_n_text_state (struct whisper_context * ctx);
+    WHISPER_API int whisper_model_n_text_head  (struct whisper_context * ctx);
+    WHISPER_API int whisper_model_n_text_layer (struct whisper_context * ctx);
+    WHISPER_API int whisper_model_n_mels       (struct whisper_context * ctx);
+    WHISPER_API int whisper_model_f16          (struct whisper_context * ctx);
+    WHISPER_API int whisper_model_type         (struct whisper_context * ctx);
+
      // Token logits obtained from the last call to whisper_decode()
      // The logits for the last token are stored in the last row
      // Rows: n_tokens
@@ -257,6 +270,8 @@ extern "C" {
  
      // Token Id -> String. Uses the vocabulary in the provided context
      WHISPER_API const char * whisper_token_to_str(struct whisper_context * ctx, whisper_token token);
+    // Model type -> String: "tiny", "base", "small", "medium", "large" or "unknown"
+    WHISPER_API const char * whisper_model_type_readable(struct whisper_context * ctx);
+
  
      // Special tokens
      WHISPER_API whisper_token whisper_token_eot (struct whisper_context * ctx);
author	Leo Moll <redacted>
	Wed, 22 Mar 2023 19:37:36 +0000 (20:37 +0100)
committer	GitHub <redacted>
	Wed, 22 Mar 2023 19:37:36 +0000 (21:37 +0200)
examples/main/README.md		patch \| blob \| history
examples/main/main.cpp		patch \| blob \| history
whisper.cpp		patch \| blob \| history
whisper.h		patch \| blob \| history