server : automatically convert audio on the server (#1539)

author Ismatulla Mansurov <redacted>

Mon, 27 Nov 2023 09:28:34 +0000 (02:28 -0700)

committer GitHub <redacted>

Mon, 27 Nov 2023 09:28:34 +0000 (11:28 +0200)
author Ismatulla Mansurov <redacted>
Mon, 27 Nov 2023 09:28:34 +0000 (02:28 -0700)
committer GitHub <redacted>
Mon, 27 Nov 2023 09:28:34 +0000 (11:28 +0200)
diff --git a/examples/server/README.md b/examples/server/README.md

index 2a43541f32f914db39dc7c30687e78d609930388..84b85e87e5f797c1487c83e0a1a34b723f8838b6 100644 (file)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -43,8 +43,12 @@ options:
    -oved D,   --ov-e-device DNAME [CPU    ] the OpenVINO device used for encode inference
    --host HOST,                   [127.0.0.1] Hostname/ip-adress for the server
    --port PORT,                   [8080   ] Port number for the server
+  --convert,                     [false  ] Convert audio to WAV, requires ffmpeg on the server
  ```
  
+> [!WARNING]  
+> **Do not run the server example with administrative privileges, and make sure it runs in a sandboxed environment: it performs risky operations such as accepting user file uploads and invoking ffmpeg for format conversions. Always validate and sanitize inputs to guard against potential security threats.**
+
  ## request examples
  
  **/inference**
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index ae0b7a26be9270afff5878efd1887afe10f55c5a..dc5317a0e1c6e39f49e9ed9a7800627c6a8a785b 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -43,6 +43,8 @@ struct server_params
      int32_t port          = 8080;
      int32_t read_timeout  = 600;
      int32_t write_timeout = 600;
+    
+    bool ffmpeg_converter = false;
  };
  
  struct whisper_params {
@@ -157,6 +159,7 @@ void whisper_print_usage(int /*argc*/, char ** argv, const whisper_params & para
      fprintf(stderr, "  --host HOST,                   [%-7s] Hostname/ip-adress for the server\n", sparams.hostname.c_str());
      fprintf(stderr, "  --port PORT,                   [%-7d] Port number for the server\n", sparams.port);
      fprintf(stderr, "  --public PATH,                 [%-7s] Path to the public folder\n", sparams.public_path.c_str());
+    fprintf(stderr, "  --convert,                     [%-7s] Convert audio to WAV, requires ffmpeg on the server", sparams.ffmpeg_converter ? "true" : "false");
      fprintf(stderr, "\n");
  }
  
@@ -203,6 +206,7 @@ bool whisper_params_parse(int argc, char ** argv, whisper_params & params, serve
          else if (                  arg == "--port")            { sparams.port        = std::stoi(argv[++i]); }
          else if (                  arg == "--host")            { sparams.hostname    = argv[++i]; }
          else if (                  arg == "--public")          { sparams.public_path = argv[++i]; }
+        else if (                  arg == "--convert")         { sparams.ffmpeg_converter     = true; }
          else {
              fprintf(stderr, "error: unknown argument: %s\n", arg.c_str());
              whisper_print_usage(argc, argv, params, sparams);
@@ -220,6 +224,45 @@ struct whisper_print_user_data {
      int progress_prev;
  };
  
+// Verifies that an `ffmpeg` executable can be launched through the shell.
+//
+// Runs `ffmpeg -version` via system(). A zero status is treated as "available"
+// and reported on stdout. Any other status prints an installation hint to
+// stderr and terminates the process with a non-zero exit code, so that scripts
+// and supervisors can tell that start-up failed.
+void check_ffmpeg_availibility() {
+    const int result = system("ffmpeg -version");
+
+    if (result != 0) {
+        // ffmpeg is not available
+        std::cerr << "ffmpeg is not found. Please ensure that ffmpeg is installed "
+                  << "and that its executable is included in your system's PATH." << std::endl;
+        exit(1);
+    }
+
+    std::cout << "ffmpeg is available." << std::endl;
+}
+
+bool convert_to_wav(const std::string & temp_filename, std::string & error_resp) {
+    std::ostringstream cmd_stream;
+    std::string converted_filename_temp = temp_filename + "_temp.wav";
+    cmd_stream << "ffmpeg -i \"" << temp_filename << "\" -ar 16000 -ac 1 -c:a pcm_s16le \"" << converted_filename_temp << "\" 2>&1";
+    std::string cmd = cmd_stream.str();
+
+    int status = std::system(cmd.c_str());
+    if (status != 0) {
+        error_resp = "{\"error\":\"FFmpeg conversion failed.\"}";
+        return false;
+    }
+
+    // Remove the original file
+    if (remove(temp_filename.c_str()) != 0) {
+        error_resp = "{\"error\":\"Failed to remove the original file.\"}";
+        return false;
+    }
+
+    // Rename the temporary file to match the original filename
+    if (rename(converted_filename_temp.c_str(), temp_filename.c_str()) != 0) {
+        error_resp = "{\"error\":\"Failed to rename the temporary file.\"}";
+        return false;
+    }
+    return true;
+}
+
  std::string estimate_diarization_speaker(std::vector<std::vector<float>> pcmf32s, int64_t t0, int64_t t1, bool id_only = false) {
      std::string speaker = "";
      const int64_t n_samples = pcmf32s[0].size();
@@ -407,6 +450,9 @@ int main(int argc, char ** argv) {
          exit(0);
      }
  
+    if (sparams.ffmpeg_converter) {
+        check_ffmpeg_availibility();
+    }
      // whisper init
      struct whisper_context_params cparams;
      cparams.use_gpu = params.use_gpu;
@@ -462,6 +508,18 @@ int main(int argc, char ** argv) {
          temp_file << audio_file.content;
          temp_file.close();
  
+        // if file is not wav, convert to wav
+        
+        if (sparams.ffmpeg_converter) {
+            std::string error_resp = "{\"error\":\"Failed to execute ffmpeg command.\"}";
+            const bool is_converted = convert_to_wav(temp_filename, error_resp);
+            if (!is_converted) {
+                res.set_content(error_resp, "application/json");
+                whisper_mutex.unlock();
+                return;
+            }
+        }
+
          // read wav content into pcmf32
          if (!::read_wav(temp_filename, pcmf32, pcmf32s, params.diarize)) {
              fprintf(stderr, "error: failed to read WAV file '%s'\n", temp_filename.c_str());
@@ -509,7 +567,6 @@ int main(int argc, char ** argv) {
  
          // run the inference
          {
-
              printf("Running whisper.cpp inference on %s\n", filename.c_str());
              whisper_full_params wparams = whisper_full_default_params(WHISPER_SAMPLING_GREEDY);
author	Ismatulla Mansurov <redacted>
	Mon, 27 Nov 2023 09:28:34 +0000 (02:28 -0700)
committer	GitHub <redacted>
	Mon, 27 Nov 2023 09:28:34 +0000 (11:28 +0200)
examples/server/README.md		patch \| blob \| history
examples/server/server.cpp		patch \| blob \| history