std::string language = "en";
std::string prompt;
+ std::string font_path = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
std::string model = "models/ggml-base.en.bin";
std::vector<std::string> fname_inp = {};
else if (arg == "-ovtt" || arg == "--output-vtt") { params.output_vtt = true; }
else if (arg == "-osrt" || arg == "--output-srt") { params.output_srt = true; }
else if (arg == "-owts" || arg == "--output-words") { params.output_wts = true; }
+ else if (arg == "-fp" || arg == "--font-path") { params.font_path = argv[++i]; }
else if (arg == "-ocsv" || arg == "--output-csv") { params.output_csv = true; }
else if (arg == "-of" || arg == "--output-file") { params.fname_out.emplace_back(argv[++i]); }
else if (arg == "-ps" || arg == "--print-special") { params.print_special = true; }
fprintf(stderr, " -ovtt, --output-vtt [%-7s] output result in a vtt file\n", params.output_vtt ? "true" : "false");
fprintf(stderr, " -osrt, --output-srt [%-7s] output result in a srt file\n", params.output_srt ? "true" : "false");
fprintf(stderr, " -owts, --output-words [%-7s] output script for generating karaoke video\n", params.output_wts ? "true" : "false");
+ fprintf(stderr, " -fp, --font-path [%-7s] path to a monospace font for karaoke video\n", params.font_path.c_str());
fprintf(stderr, " -ocsv, --output-csv [%-7s] output result in a CSV file\n", params.output_csv ? "true" : "false");
fprintf(stderr, " -of FNAME, --output-file FNAME [%-7s] output file path (without file extension)\n", "");
fprintf(stderr, " -ps, --print-special [%-7s] print special tokens\n", params.print_special ? "true" : "false");
// karaoke video generation
// outputs a bash script that uses ffmpeg to generate a video with the subtitles
// TODO: font parameter adjustments
-bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & /*params*/, float t_sec) {
+bool output_wts(struct whisper_context * ctx, const char * fname, const char * fname_inp, const whisper_params & params, float t_sec) {
std::ofstream fout(fname);
fprintf(stderr, "%s: saving output to '%s'\n", __func__, fname);
- // TODO: become parameter
- static const char * font = "/System/Library/Fonts/Supplemental/Courier New Bold.ttf";
+ static const char * font = params.font_path.c_str();
+
+ std::ifstream fin(font);
+ if (!fin.is_open()) {
+ fprintf(stderr, "%s: font not found at '%s', please specify a monospace font with -fp\n", __func__, font);
+ return false;
+ }
fout << "#!/bin/bash" << "\n";
fout << "\n";
--- /dev/null
+#!/bin/bash
+# This script takes two arguments
+# - an audio file
+# - [optional] path to a font file
+#
+# For every model listed in $models it runs ./main with -owts on the audio
+# file, times the run, renders the generated karaoke script to a video and
+# creates a small caption video with the timing. Finally all videos are
+# stacked vertically into "<audio file>.all.mp4".
+#
+# Commands are kept in bash arrays (not strings) so that file names and font
+# paths containing spaces survive intact and no eval is needed.
+
+# I'm using "/usr/share/fonts/truetype/freefont/FreeMono.ttf" on Ubuntu
+
+if [ -z "$1" ]; then
+    echo "Usage: $0 <audio file> [font file]"
+    exit 1
+fi
+
+INPUT="$1"
+FONT="$2"
+
+#TODO: Make this a command line parameter
+models="base small large"
+#models="tiny.en tiny base.en base small.en small medium.en medium large-v1 large"
+
+DURATION=$(ffprobe -i "$INPUT" -show_entries format=duration -v quiet -of csv="p=0")
+# ffprobe runs with "-v quiet", so a failure only shows up as empty output
+if [ -z "$DURATION" ]; then
+    echo "Could not determine the duration of '$INPUT'"
+    exit 1
+fi
+DURATION=$(printf "%.2f" "$DURATION")
+echo "Input file duration: ${DURATION}s"
+
+for model in $models; do
+    echo "Running $model"
+    COMMAND=(./main -m "models/ggml-$model.bin" -owts -f "$INPUT" -of "$INPUT.$model")
+
+    if [ -n "$FONT" ]; then
+        COMMAND+=(-fp "$FONT")
+    fi
+    #TODO: Surface errors better
+    # TIMEFMT is for zsh, TIMEFORMAT is for bash
+    EXECTIME=$({ TIMEFMT="%E";TIMEFORMAT=%E; time "${COMMAND[@]}" >/dev/null 2>&1; } 2>&1)
+
+    # Slightly different formats between zsh and bash: drop a trailing "s" if
+    # there is one. ${var%s} also works on bash 3.2 (macOS), unlike ${var::-1}.
+    EXECTIME=${EXECTIME%s}
+
+    RATIO=$(echo "$DURATION / $EXECTIME" | bc -l)
+    RATIO=$(printf "%.2f" "$RATIO")
+
+    echo "Execution time: ${EXECTIME}s (${RATIO}x realtime)"
+
+    # Remove the output of a previous run so a stale video is never reused
+    rm -f "$INPUT.$model.mp4"
+
+    # The generated .wts script renders to "<audio file>.mp4"
+    bash "$INPUT.$model.wts" >/dev/null 2>&1
+    mv "$INPUT.mp4" "$INPUT.$model.mp4"
+
+    # Caption strip with the model name and timing; only pass fontfile when a
+    # font was actually given instead of emitting an empty "fontfile="
+    DRAWTEXT="drawtext="
+    if [ -n "$FONT" ]; then
+        DRAWTEXT="${DRAWTEXT}fontfile=$FONT:"
+    fi
+    DRAWTEXT="${DRAWTEXT}fontsize=36:x=10:y=(h-text_h)/2:text='ggml-$model - ${EXECTIME}s (${RATIO}x realtime)':fontcolor=lightgrey"
+
+    ffmpeg -y -f lavfi -i "color=c=black:s=1200x50:d=$DURATION" -vf "$DRAWTEXT" "$INPUT.$model.info.mp4" >/dev/null 2>&1
+done
+
+# Build one ffmpeg invocation that stacks, per model, the caption strip on top
+# of the karaoke video. Each model contributes two inputs, so COUNT ends up
+# being the total number of video streams handed to vstack.
+COMMAND=(ffmpeg -y)
+FILTER=""
+COUNT=0
+for model in $models; do
+    COMMAND+=(-i "$INPUT.$model.info.mp4" -i "$INPUT.$model.mp4")
+    FILTER="$FILTER[${COUNT}:v][$((COUNT+1)):v]"
+    COUNT=$((COUNT+2))
+done
+FILTER="$FILTER vstack=inputs=${COUNT}[v]"
+# Input 1 is the first model's karaoke video, which carries the audio track
+COMMAND+=(-filter_complex "$FILTER" -map "[v]" -map 1:a "$INPUT.all.mp4")
+
+# Print the command in a shell-quoted form that can be copy-pasted
+printf '%q ' "${COMMAND[@]}"
+echo
+
+# Run the command
+"${COMMAND[@]}" >/dev/null 2>&1
\ No newline at end of file