--- /dev/null
+# From
+# https://github.com/snikulov/cmake-modules/blob/master/FindFFmpeg.cmake
+#
+# vim: ts=2 sw=2
+# - Try to find the required ffmpeg components (default: AVFORMAT, AVCODEC, AVUTIL, SWRESAMPLE)
+#
+# Once done this will define
+# FFMPEG_FOUND - System has all the required components.
+# FFMPEG_INCLUDE_DIRS - Include directory necessary for using the required components headers.
+# FFMPEG_LIBRARIES - Link these to use the required ffmpeg components.
+# FFMPEG_DEFINITIONS - Compiler switches required for using the required ffmpeg components.
+#
+# For each of the components it will additionally set.
+# - AVCODEC
+# - AVDEVICE
+# - AVFORMAT
+# - AVFILTER
+# - AVUTIL
+# - POSTPROC
+# - SWSCALE
+# - SWRESAMPLE
+# the following variables will be defined
+# <component>_FOUND - System has <component>
+# <component>_INCLUDE_DIRS - Include directory necessary for using the <component> headers
+# <component>_LIBRARIES - Link these to use <component>
+# <component>_DEFINITIONS - Compiler switches required for using <component>
+# <component>_VERSION - The components version
+#
+# Copyright (c) 2006, Matthias Kretz, <kretz@kde.org>
+# Copyright (c) 2008, Alexander Neundorf, <neundorf@kde.org>
+# Copyright (c) 2011, Michael Jansen, <kde@michael-jansen.biz>
+#
+# Redistribution and use is allowed according to the terms of the BSD license.
+# For details see the accompanying COPYING-CMAKE-SCRIPTS file.
+
+include(FindPackageHandleStandardArgs)
+
+# The default components were taken from a survey over other FindFFMPEG.cmake files
+if (NOT FFmpeg_FIND_COMPONENTS)
+  # The caller did not request specific components: fall back to the default set.
+  set(FFmpeg_FIND_COMPONENTS
+    AVFORMAT
+    AVCODEC
+    AVUTIL
+    SWRESAMPLE)
+endif ()
+
+#
+### Macro: set_component_found
+#
+# Marks the given component as found if both *_LIBRARIES AND *_INCLUDE_DIRS is present.
+#
+macro(set_component_found _component)
+  # A component counts as found only when both its library and its include
+  # directory variables hold a true value (an empty or *-NOTFOUND value is false).
+  if (NOT ${_component}_LIBRARIES OR NOT ${_component}_INCLUDE_DIRS)
+    message(DEBUG " - ${_component} not found.")
+  else ()
+    message(DEBUG " - ${_component} found.")
+    # Written in the caller's scope because this is a macro, not a function.
+    set(${_component}_FOUND TRUE)
+  endif ()
+endmacro()
+
+#
+### Macro: find_component
+#
+# Checks for the given component by invoking pkgconfig and then looking up the libraries and
+# include directories.
+#
+macro(find_component _component _pkgconfig _library _header)
+  # Arguments:
+  #   _component - prefix for the result variables (e.g. AVCODEC)
+  #   _pkgconfig - pkg-config module name (e.g. libavcodec)
+  #   _library   - library base name passed to find_library() (e.g. avcodec)
+  #   _header    - header passed to find_path() (e.g. libavcodec/avcodec.h)
+  #
+  # Results (cache variables): <component>_INCLUDE_DIRS, <component>_LIBRARIES,
+  # <component>_DEFINITIONS, <component>_VERSION; plus <component>_FOUND via
+  # set_component_found().
+
+  if (NOT WIN32)
+    # Use pkg-config to get the directories and then use these values
+    # as hints in the find_path() and find_library() calls below.
+    # Both lookups are QUIET: the overall result is reported once by
+    # find_package_handle_standard_args() at the end of this module.
+    find_package(PkgConfig QUIET)
+    if (PKG_CONFIG_FOUND)
+      pkg_check_modules(PC_${_component} QUIET ${_pkgconfig})
+      if (PC_${_component}_FOUND)
+        # Only report values that pkg-config actually produced.
+        message(DEBUG "pkg-config ${_pkgconfig}: includedir=${PC_${_component}_INCLUDEDIR}")
+        message(DEBUG "pkg-config ${_pkgconfig}: include_dirs=${PC_${_component}_INCLUDE_DIRS}")
+        message(DEBUG "pkg-config ${_pkgconfig}: cflags=${PC_${_component}_CFLAGS}")
+      endif ()
+    endif ()
+  endif ()
+
+  find_path(${_component}_INCLUDE_DIRS ${_header}
+    HINTS
+      ${PC_${_component}_INCLUDEDIR}
+      ${PC_${_component}_INCLUDE_DIRS}
+    PATH_SUFFIXES
+      ffmpeg
+  )
+
+  # CMake's default is to search first for shared libraries and then for static libraries.
+  # Todo later: add option to prefer static libs over dynamic:
+  find_library(${_component}_LIBRARIES NAMES ${_library} lib${_library}.a
+    HINTS
+      ${PC_${_component}_LIBDIR}
+      ${PC_${_component}_LIBRARY_DIRS}
+  )
+
+  # The PC_* values are empty when pkg-config was skipped or found nothing.
+  set(${_component}_DEFINITIONS ${PC_${_component}_CFLAGS_OTHER} CACHE STRING "The ${_component} CFLAGS.")
+  set(${_component}_VERSION ${PC_${_component}_VERSION} CACHE STRING "The ${_component} version number.")
+
+  set_component_found(${_component})
+
+  mark_as_advanced(
+    ${_component}_INCLUDE_DIRS
+    ${_component}_LIBRARIES
+    ${_component}_DEFINITIONS
+    ${_component}_VERSION)
+
+endmacro()
+
+
+# Check for cached results. If there are any, skip the costly part.
+if (NOT FFMPEG_LIBRARIES)
+
+  # Probe every component this module knows about, requested or not.
+  find_component(AVCODEC    libavcodec    avcodec    libavcodec/avcodec.h)
+  find_component(AVFORMAT   libavformat   avformat   libavformat/avformat.h)
+  find_component(AVDEVICE   libavdevice   avdevice   libavdevice/avdevice.h)
+  # AVRESAMPLE (libavresample), the old name for swresample, is intentionally not probed.
+  find_component(AVUTIL     libavutil     avutil     libavutil/avutil.h)
+  find_component(AVFILTER   libavfilter   avfilter   libavfilter/avfilter.h)
+  find_component(SWSCALE    libswscale    swscale    libswscale/swscale.h)
+  find_component(POSTPROC   libpostproc   postproc   libpostproc/postprocess.h)
+  find_component(SWRESAMPLE libswresample swresample libswresample/swresample.h)
+
+  # Fold the results of each requested component that was found into the
+  # aggregate FFMPEG_* variables; requested components that are missing are skipped.
+  foreach (_component ${FFmpeg_FIND_COMPONENTS})
+    if (NOT ${_component}_FOUND)
+      continue ()
+    endif ()
+    list(APPEND FFMPEG_LIBRARIES ${${_component}_LIBRARIES})
+    list(APPEND FFMPEG_DEFINITIONS ${${_component}_DEFINITIONS})
+    list(APPEND FFMPEG_INCLUDE_DIRS ${${_component}_INCLUDE_DIRS})
+  endforeach ()
+
+  # Several components may share one include directory; keep each path once.
+  if (FFMPEG_INCLUDE_DIRS)
+    list(REMOVE_DUPLICATES FFMPEG_INCLUDE_DIRS)
+  endif ()
+
+  # Store the aggregate values in the cache so the next configure run can skip this block.
+  set(FFMPEG_INCLUDE_DIRS ${FFMPEG_INCLUDE_DIRS} CACHE STRING "The FFmpeg include directories." FORCE)
+  set(FFMPEG_LIBRARIES ${FFMPEG_LIBRARIES} CACHE STRING "The FFmpeg libraries." FORCE)
+  set(FFMPEG_DEFINITIONS ${FFMPEG_DEFINITIONS} CACHE STRING "The FFmpeg cflags." FORCE)
+
+  mark_as_advanced(
+    FFMPEG_INCLUDE_DIRS
+    FFMPEG_LIBRARIES
+    FFMPEG_DEFINITIONS)
+
+endif ()
+
+# Now set the non-cached <component>_FOUND vars for the components.
+# This runs on every configure, including runs where the search above was
+# skipped because FFMPEG_LIBRARIES was already cached, so the list must name
+# exactly the components that find_component() probes.
+foreach (_component AVCODEC AVDEVICE AVFORMAT AVFILTER AVUTIL POSTPROC SWSCALE SWRESAMPLE)
+  set_component_found(${_component})
+endforeach ()
+
+# Collect the variables that must all hold a value for the package to be
+# reported as found: the two aggregates plus the library and include
+# variables of every requested component.
+set(_FFmpeg_REQUIRED_VARS FFMPEG_LIBRARIES FFMPEG_INCLUDE_DIRS)
+foreach (_required_component ${FFmpeg_FIND_COMPONENTS})
+  list(APPEND _FFmpeg_REQUIRED_VARS
+    ${_required_component}_LIBRARIES
+    ${_required_component}_INCLUDE_DIRS)
+endforeach ()
+
+# Let the standard helper set the found state and print the default message
+# when any of the required variables is missing.
+find_package_handle_standard_args(FFmpeg DEFAULT_MSG ${_FFmpeg_REQUIRED_VARS})
+
--- /dev/null
+/* SPDX-License-Identifier: GPL-2.0 */
+
+/*
+ * transcode.c - convert audio file to WAVE
+ *
+ * Copyright (C) 2019 Andrew Clayton <andrew@digital-domain.net>
+ * Copyright (C) 2024 William Tambellini <william.tambellini@gmail.com>
+ */
+
+// Just for a convenient C++ API
+#include <vector>
+#include <string>
+
+// C
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <stdbool.h>
+#include <stdint.h>
+#include <sys/types.h>
+#include <sys/stat.h>
+#include <fcntl.h>
+#include <unistd.h>
+#include <sys/mman.h>
+
+extern "C" {
+#include <libavutil/opt.h>
+#include <libavcodec/avcodec.h>
+#include <libavformat/avformat.h>
+#include <libswresample/swresample.h>
+}
+
+/* Short aliases for the fixed-width integer types used in this file. */
+typedef uint64_t u64;
+typedef int64_t s64;
+typedef uint32_t u32;
+typedef int32_t s32;
+typedef uint16_t u16;
+typedef int16_t s16;
+typedef uint8_t u8;
+typedef int8_t s8;
+
+/* Output sample rate in Hz: written into the WAVE header and used as the resampler's output rate. */
+#define WAVE_SAMPLE_RATE 16000
+/* Size in bytes of the buffer handed to avio_alloc_context(). */
+#define AVIO_CTX_BUF_SZ 4096
+
+/* Non-NULL when the FFMPEG_LOG environment variable is set; read once during static initialisation. */
+static const char* ffmpegLog = getenv("FFMPEG_LOG");
+// Todo: add __FILE__ __LINE__
+/* printf-style diagnostic to stderr; expands to a no-op at runtime when ffmpegLog is NULL. */
+#define LOG(...) \
+ do { if (ffmpegLog) fprintf(stderr, __VA_ARGS__); } while(0) // C99
+
+/*
+ * WAVE file header based on definition from
+ * https://gist.github.com/Jon-Schneider/8b7c53d27a7a13346a643dac9c19d34f
+ *
+ * We must ensure this structure doesn't have any holes or
+ * padding so we can just map it straight to the WAVE data.
+ */
+/*
+ * On-disk layout of a canonical 44-byte PCM WAVE header.
+ *
+ * Every multi-byte field uses an explicit fixed-width type so the layout
+ * does not depend on the platform's width of `int`; the static_assert below
+ * rejects any build where padding or a type-size change would break it.
+ */
+struct wave_hdr {
+    /* RIFF Header: "RIFF" */
+    char riff_header[4];
+    /* size of audio data + sizeof(struct wave_hdr) - 8 */
+    s32 wav_size;
+    /* "WAVE" */
+    char wav_header[4];
+
+    /* Format Header */
+    /* "fmt " (includes trailing space) */
+    char fmt_header[4];
+    /* Should be 16 for PCM */
+    s32 fmt_chunk_size;
+    /* Should be 1 for PCM. 3 for IEEE Float */
+    s16 audio_format;
+    s16 num_channels;
+    s32 sample_rate;
+    /*
+     * Number of bytes per second
+     * sample_rate * num_channels * bit_depth/8
+     */
+    s32 byte_rate;
+    /* num_channels * bytes per sample */
+    s16 sample_alignment;
+    /* bits per sample */
+    s16 bit_depth;
+
+    /* Data Header */
+    /* "data" */
+    char data_header[4];
+    /*
+     * size of audio
+     * number of samples * num_channels * bit_depth/8
+     */
+    s32 data_bytes;
+} __attribute__((__packed__));
+
+static_assert(sizeof(struct wave_hdr) == 44,
+              "wave_hdr must be exactly 44 bytes with no padding");
+
+/*
+ * Read cursor over an in-memory input: read_packet() copies from ptr, then
+ * advances ptr and decreases size by the number of bytes handed out.
+ */
+struct audio_buffer {
+ u8 *ptr; /* next unread byte */
+ int size; /* size left in the buffer */
+};
+
+/*
+ * Fill `wh` with a header describing `size` bytes of mono, 16-bit PCM
+ * audio sampled at WAVE_SAMPLE_RATE.
+ */
+static void set_wave_hdr(wave_hdr& wh, size_t size) {
+    const s16 channels = 1;
+    const s16 bits_per_sample = 16;
+    const s16 block_align = 2; /* bytes per sample frame: 1 channel * 16 bits */
+
+    /* The four chunk tags. */
+    memcpy(wh.riff_header, "RIFF", 4);
+    memcpy(wh.wav_header, "WAVE", 4);
+    memcpy(wh.fmt_header, "fmt ", 4);
+    memcpy(wh.data_header, "data", 4);
+
+    /* Sizes: the RIFF size excludes the 8 bytes of tag + size field itself. */
+    wh.wav_size = size + sizeof(struct wave_hdr) - 8;
+    wh.data_bytes = size;
+
+    /* Format description (audio_format 1 == PCM). */
+    wh.fmt_chunk_size = 16;
+    wh.audio_format = 1;
+    wh.num_channels = channels;
+    wh.sample_rate = WAVE_SAMPLE_RATE;
+    wh.sample_alignment = block_align;
+    wh.bit_depth = bits_per_sample;
+    wh.byte_rate = wh.sample_rate * wh.sample_alignment;
+}
+
+/*
+ * Write a WAVE header describing `size` bytes of audio data to `fd`.
+ *
+ * The function returns void, so a failed or short write is reported on
+ * stderr instead of being silently ignored.
+ */
+static void write_wave_hdr(int fd, size_t size) {
+    struct wave_hdr wh;
+    set_wave_hdr(wh, size);
+
+    const ssize_t written = write(fd, &wh, sizeof(struct wave_hdr));
+    if (written < 0) {
+        perror("write");
+    } else if ((size_t)written != sizeof(struct wave_hdr)) {
+        /* A short write does not set errno, so report it explicitly. */
+        fprintf(stderr, "write_wave_hdr: short write (%zd of %zu bytes)\n",
+                written, sizeof(struct wave_hdr));
+    }
+}
+
+/*
+ * Map the whole file behind `fd` into memory as a private mapping.
+ *
+ * fd:   open file descriptor
+ * ptr:  receives the mapping address on success
+ * size: receives the file size in bytes on success
+ *
+ * Returns 0 on success, -1 on failure (the reason is printed with perror()).
+ */
+static int map_file(int fd, u8 **ptr, size_t *size)
+{
+    struct stat sb;
+
+    /* Without this check a failed fstat() would leave sb uninitialised. */
+    if (fstat(fd, &sb) == -1) {
+        perror("fstat");
+        return -1;
+    }
+    *size = sb.st_size;
+
+    *ptr = (u8*)mmap(NULL, *size, PROT_READ|PROT_WRITE, MAP_PRIVATE, fd, 0);
+    if (*ptr == MAP_FAILED) {
+        perror("mmap");
+        return -1;
+    }
+
+    return 0;
+}
+
+/*
+ * AVIO read callback: copy up to `buf_size` bytes from the in-memory
+ * audio_buffer passed as `opaque` into `buf`.
+ *
+ * Returns the number of bytes copied, or AVERROR_EOF once the buffer is
+ * exhausted. The AVIO callback contract requires an error code rather than
+ * 0 to signal end of input.
+ */
+static int read_packet(void *opaque, u8 *buf, int buf_size)
+{
+    struct audio_buffer *audio_buf = (audio_buffer*)opaque;
+
+    buf_size = FFMIN(buf_size, audio_buf->size);
+    if (buf_size <= 0)
+        return AVERROR_EOF;
+
+    /* copy internal buffer data to buf */
+    memcpy(buf, audio_buf->ptr, buf_size);
+    audio_buf->ptr += buf_size;
+    audio_buf->size -= buf_size;
+
+    return buf_size;
+}
+
+/*
+ * Resample one decoded frame to mono S16 at WAVE_SAMPLE_RATE and append the
+ * result to the growing sample array.
+ *
+ * swr:   initialised resampler
+ * codec: decoder context (supplies the input sample rate)
+ * frame: decoded frame; only nb_samples is read when flushing
+ * data:  in/out, realloc-managed array of samples
+ * size:  in/out, number of samples currently in *data
+ * flush: true to drain what the resampler still buffers (no new input)
+ *
+ * On any failure the call logs and returns with *data and *size unchanged.
+ */
+static void convert_frame(struct SwrContext *swr, AVCodecContext *codec,
+                          AVFrame *frame, s16 **data, int *size, bool flush)
+{
+    u8 *buffer = NULL;
+    s16 *grown;
+    int capacity;
+    int converted;
+    s64 delay;
+
+    /* Upper bound on output samples: buffered input plus this frame, rescaled. */
+    delay = swr_get_delay(swr, codec->sample_rate);
+    capacity = av_rescale_rnd(delay + frame->nb_samples,
+                              WAVE_SAMPLE_RATE, codec->sample_rate,
+                              AV_ROUND_UP);
+    if (capacity <= 0)
+        return; /* nothing buffered and nothing to convert */
+
+    if (av_samples_alloc(&buffer, NULL, 1, capacity, AV_SAMPLE_FMT_S16, 0) < 0) {
+        LOG("convert_frame: could not allocate %d output samples\n", capacity);
+        return;
+    }
+
+    /*
+     * !flush is used to check if we are flushing any remaining
+     * conversion buffers...
+     */
+    converted = swr_convert(swr, &buffer, capacity,
+                            !flush ? (const u8 **)frame->data : NULL,
+                            !flush ? frame->nb_samples : 0);
+    if (converted < 0) {
+        LOG("convert_frame: swr_convert failed: %d\n", converted);
+        av_freep(&buffer);
+        return;
+    }
+
+    if (converted > 0) {
+        /* Grow through a temporary so a failed realloc keeps *data valid. */
+        grown = (s16*)realloc(*data, ((size_t)*size + converted) * sizeof(s16));
+        if (!grown) {
+            LOG("convert_frame: out of memory while growing output buffer\n");
+            av_freep(&buffer);
+            return;
+        }
+        memcpy(grown + *size, buffer, converted * sizeof(s16));
+        *data = grown;
+        *size += converted;
+    }
+
+    av_freep(&buffer);
+}
+
+/* True when the stream's codec parameters describe an audio stream. */
+static bool is_audio_stream(const AVStream *stream)
+{
+    return stream->codecpar->codec_type == AVMEDIA_TYPE_AUDIO;
+}
+
+/*
+ * Pull every frame the decoder currently has ready and append its resampled
+ * samples to *data. Stops at the first non-success return of
+ * avcodec_receive_frame() (EAGAIN, EOF or a decode error).
+ */
+static void drain_decoder(AVCodecContext *codec, struct SwrContext *swr,
+                          AVFrame *frame, s16 **data, int *size)
+{
+    while (avcodec_receive_frame(codec, frame) >= 0) {
+        if (frame->nb_samples > 0)
+            convert_frame(swr, codec, frame, data, size, false);
+    }
+}
+
+// Decode the first audio stream found in an in-memory media file and resample
+// it to mono, signed 16-bit, WAVE_SAMPLE_RATE.
+//
+// Return non zero on error, 0 on success
+// audio_buf: input memory (its ptr/size cursor is consumed while reading)
+// data: receives a malloc-family array of decoded samples; the caller frees
+//       it with free(). Set to NULL on error.
+// size: receives the number of samples in *data. Set to 0 on error.
+//
+// Every exit goes through the single cleanup section at the end, so no
+// FFmpeg object is leaked on an error path.
+static int decode_audio(struct audio_buffer *audio_buf, s16 **data, int *size)
+{
+    AVFormatContext *fmt_ctx = NULL;
+    AVIOContext *avio_ctx = NULL;
+    AVStream *stream = NULL;
+    AVCodecContext *codec = NULL;
+    const AVCodec *decoder = NULL;
+    AVPacket packet;
+    AVFrame *frame = NULL;
+    struct SwrContext *swr = NULL;
+    u8 *avio_ctx_buffer = NULL;
+    unsigned int i;
+    int stream_index = -1;
+    int err;
+    int ret = -1;
+    bool input_opened = false;
+    const size_t errbuffsize = 1024;
+    char errbuff[errbuffsize];
+
+    LOG("decode_audio: input size: %d\n", audio_buf->size);
+
+    *data = NULL;
+    *size = 0;
+
+    av_register_all(); // from avformat. Still a must-have call for ffmpeg v3! (can be skipped for later versions)
+
+    fmt_ctx = avformat_alloc_context();
+    avio_ctx_buffer = (u8*)av_malloc(AVIO_CTX_BUF_SZ);
+    if (!fmt_ctx || !avio_ctx_buffer) {
+        LOG("Could not allocate the format context or the avio buffer\n");
+        goto out;
+    }
+
+    LOG("Creating an avio context: AVIO_CTX_BUF_SZ=%d\n", AVIO_CTX_BUF_SZ);
+    avio_ctx = avio_alloc_context(avio_ctx_buffer, AVIO_CTX_BUF_SZ, 0, audio_buf, &read_packet, NULL, NULL);
+    if (!avio_ctx) {
+        LOG("Could not allocate the avio context\n");
+        goto out;
+    }
+    /* The buffer now belongs to avio_ctx and is released through it. */
+    avio_ctx_buffer = NULL;
+    fmt_ctx->pb = avio_ctx;
+
+    // open the input stream and read header
+    err = avformat_open_input(&fmt_ctx, NULL, NULL, NULL);
+    if (err) {
+        /* On failure avformat_open_input() has already freed fmt_ctx. */
+        LOG("Could not read audio buffer: %d: %s\n", err, av_make_error_string(errbuff, errbuffsize, err));
+        ret = err;
+        goto out;
+    }
+    input_opened = true;
+
+    err = avformat_find_stream_info(fmt_ctx, NULL);
+    if (err < 0) {
+        LOG("Could not retrieve stream info from audio buffer: %d\n", err);
+        ret = err;
+        goto out;
+    }
+
+    for (i = 0; i < fmt_ctx->nb_streams; i++) {
+        if (is_audio_stream(fmt_ctx->streams[i])) {
+            stream_index = i;
+            break;
+        }
+    }
+
+    if (stream_index == -1) {
+        LOG("Could not retrieve audio stream from buffer\n");
+        goto out;
+    }
+
+    stream = fmt_ctx->streams[stream_index];
+    decoder = avcodec_find_decoder(stream->codecpar->codec_id);
+    if (!decoder) {
+        LOG("No decoder available for stream #%d in audio buffer\n", stream_index);
+        goto out;
+    }
+
+    codec = avcodec_alloc_context3(decoder);
+    if (!codec) {
+        LOG("Could not allocate the decoder context\n");
+        goto out;
+    }
+
+    err = avcodec_parameters_to_context(codec, stream->codecpar);
+    if (err < 0) {
+        LOG("Could not copy codec parameters for stream #%d: %d\n", stream_index, err);
+        ret = err;
+        goto out;
+    }
+
+    err = avcodec_open2(codec, decoder, NULL);
+    if (err) {
+        LOG("Failed to open decoder for stream #%d in audio buffer\n", stream_index);
+        ret = err;
+        goto out;
+    }
+
+    /* prepare resampler */
+    swr = swr_alloc();
+    if (!swr) {
+        LOG("Could not allocate the resampler\n");
+        goto out;
+    }
+
+    av_opt_set_int(swr, "in_channel_count", codec->channels, 0);
+    av_opt_set_int(swr, "out_channel_count", 1, 0);
+    av_opt_set_int(swr, "in_channel_layout", codec->channel_layout, 0);
+    av_opt_set_int(swr, "out_channel_layout", AV_CH_LAYOUT_MONO, 0);
+    av_opt_set_int(swr, "in_sample_rate", codec->sample_rate, 0);
+    av_opt_set_int(swr, "out_sample_rate", WAVE_SAMPLE_RATE, 0);
+    av_opt_set_sample_fmt(swr, "in_sample_fmt", codec->sample_fmt, 0);
+    av_opt_set_sample_fmt(swr, "out_sample_fmt", AV_SAMPLE_FMT_S16, 0);
+
+    swr_init(swr);
+    if (!swr_is_initialized(swr)) {
+        LOG("Resampler has not been properly initialized\n");
+        goto out;
+    }
+
+    av_init_packet(&packet);
+    packet.data = NULL;
+    packet.size = 0;
+
+    frame = av_frame_alloc();
+    if (!frame) {
+        LOG("Error allocating the frame\n");
+        goto out;
+    }
+
+    /* iterate through packets */
+    while (av_read_frame(fmt_ctx, &packet) >= 0) {
+        /* The container may interleave other streams (video, subtitles...). */
+        if (packet.stream_index == stream_index) {
+            err = avcodec_send_packet(codec, &packet);
+            if (err < 0)
+                LOG("Skipping packet the decoder rejected: %d\n", err);
+            else
+                drain_decoder(codec, swr, frame, data, size);
+        }
+        /* av_read_frame() hands us a reference that we must drop. */
+        av_packet_unref(&packet);
+    }
+
+    /* A NULL packet puts the decoder in draining mode: fetch what it still holds. */
+    avcodec_send_packet(codec, NULL);
+    drain_decoder(codec, swr, frame, data, size);
+
+    /* Flush any remaining conversion buffers... */
+    if (swr_get_delay(swr, codec->sample_rate) > 0)
+        convert_frame(swr, codec, frame, data, size, true);
+
+    ret = 0;
+
+out:
+    if (ret != 0) {
+        free(*data);
+        *data = NULL;
+        *size = 0;
+    }
+
+    av_frame_free(&frame);
+    swr_free(&swr);
+    avcodec_free_context(&codec);
+
+    if (fmt_ctx) {
+        if (input_opened)
+            avformat_close_input(&fmt_ctx);
+        else
+            avformat_free_context(fmt_ctx);
+    }
+
+    /* With custom I/O the format context never frees pb; do it here. */
+    if (avio_ctx) {
+        av_freep(&avio_ctx->buffer);
+        av_freep(&avio_ctx);
+    }
+    av_free(avio_ctx_buffer); /* only non-NULL if avio_alloc_context() never took it */
+
+    return ret;
+}
+
+// In-memory decoding/conversion/resampling of an audio file.
+// ifname: input file path
+// owav_data: receives a complete in-memory wav file (header + 16-bit mono
+//            samples at WAVE_SAMPLE_RATE). Can be forwarded as is to whisper/drwav
+// return 0 on success, non zero on error
+//
+// The file descriptor, the file mapping and the decoded sample buffer are all
+// released before returning, on success and on every error path.
+int ffmpeg_decode_audio(const std::string &ifname, std::vector<uint8_t>& owav_data) {
+    LOG("ffmpeg_decode_audio: %s\n", ifname.c_str());
+    int ifd = open(ifname.c_str(), O_RDONLY);
+    if (ifd == -1) {
+        fprintf(stderr, "Couldn't open input file %s\n", ifname.c_str());
+        return -1;
+    }
+
+    u8 *ibuf = NULL;
+    size_t ibuf_size = 0;
+    int err = map_file(ifd, &ibuf, &ibuf_size);
+    /* A mapping stays valid after its descriptor is closed. */
+    close(ifd);
+    if (err) {
+        LOG("Couldn't map input file %s\n", ifname.c_str());
+        return err;
+    }
+    LOG("Mapped input file: %p size: %zu\n", (void*)ibuf, ibuf_size);
+
+    /* audio_buffer tracks the remaining size in an int: reject what does not fit. */
+    const int in_size = (int)ibuf_size;
+    if (in_size < 0 || (size_t)in_size != ibuf_size) {
+        LOG("Input file %s is too large to decode in memory\n", ifname.c_str());
+        munmap(ibuf, ibuf_size);
+        return -1;
+    }
+
+    struct audio_buffer inaudio_buf;
+    inaudio_buf.ptr = ibuf;
+    inaudio_buf.size = in_size;
+
+    s16 *odata = NULL;
+    int osize = 0;
+
+    err = decode_audio(&inaudio_buf, &odata, &osize);
+    /* The input is no longer needed once decoding has finished. */
+    munmap(ibuf, ibuf_size);
+    LOG("decode_audio returned %d \n", err);
+    if (err != 0) {
+        LOG("decode_audio failed\n");
+        free(odata);
+        return err;
+    }
+    LOG("decode_audio output size: %d\n", osize);
+
+    wave_hdr wh;
+    const size_t outdatasize = osize * sizeof(s16);
+    set_wave_hdr(wh, outdatasize);
+    owav_data.resize(sizeof(wave_hdr) + outdatasize);
+    // header:
+    memcpy(owav_data.data(), &wh, sizeof(wave_hdr));
+    // the data (odata may be NULL when nothing was decoded):
+    if (outdatasize > 0)
+        memcpy(owav_data.data() + sizeof(wave_hdr), odata, outdatasize);
+
+    free(odata);
+    return 0;
+}