whisper : add GPU support via cuBLAS (#834)

author Georgi Gerganov <redacted>

Sun, 30 Apr 2023 09:14:33 +0000 (12:14 +0300)

committer GitHub <redacted>

Sun, 30 Apr 2023 09:14:33 +0000 (12:14 +0300)
author Georgi Gerganov <redacted>
Sun, 30 Apr 2023 09:14:33 +0000 (12:14 +0300)
committer GitHub <redacted>
Sun, 30 Apr 2023 09:14:33 +0000 (12:14 +0300)
diff --git a/.gitignore b/.gitignore

index 67ec7c32408eaa08b4c8f55d682262653eae544b..4889306c901aff2698e399d82af0c38d6ea00628 100644 (file)
--- a/.gitignore
+++ b/.gitignore
@@ -12,6 +12,7 @@ build-em/
  build-debug/
  build-release/
  build-static/
+build-cublas/
  build-no-accel/
  build-sanitize-addr/
  build-sanitize-thread/
diff --git a/CMakeLists.txt b/CMakeLists.txt

index 27013703c206b1927ed23d025fceb0aa6c592a14..6710ff27852f447bdae94b0763781f9b64d37d54 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -51,7 +51,7 @@ option(WHISPER_SANITIZE_UNDEFINED     "whisper: enable undefined sanitizer" OFF)
  option(WHISPER_BUILD_TESTS            "whisper: build tests"    ${WHISPER_STANDALONE})
  option(WHISPER_BUILD_EXAMPLES         "whisper: build examples" ${WHISPER_STANDALONE})
  
-option(WHISPER_SUPPORT_SDL2           "whisper: support for libSDL2" OFF)
+option(WHISPER_SDL2                   "whisper: support for libSDL2" OFF)
  
  if (APPLE)
      option(WHISPER_NO_ACCELERATE         "whisper: disable Accelerate framework" OFF)
@@ -62,7 +62,8 @@ if (APPLE)
      option(WHISPER_COREML                "whisper: enable Core ML framework" OFF)
      option(WHISPER_COREML_ALLOW_FALLBACK "whisper: allow non-CoreML fallback" OFF)
  else()
-    option(WHISPER_SUPPORT_OPENBLAS      "whisper: support for OpenBLAS" OFF)
+    option(WHISPER_OPENBLAS              "whisper: support for OpenBLAS" OFF)
+    option(WHISPER_CUBLAS                "whisper: support for cuBLAS" OFF)
  endif()
  
  option(WHISPER_PERF "whisper: enable perf timings" OFF)
@@ -127,7 +128,7 @@ if (APPLE)
      endif()
  endif()
  
-if (WHISPER_SUPPORT_OPENBLAS)
+if (WHISPER_OPENBLAS)
      find_library(OPENBLAS_LIB
          NAMES openblas libopenblas
          )
@@ -141,6 +142,31 @@ if (WHISPER_SUPPORT_OPENBLAS)
      endif()
  endif()
  
+if (WHISPER_CUBLAS)
+    cmake_minimum_required(VERSION 3.17)
+
+    find_package(CUDAToolkit)
+
+    if (CUDAToolkit_FOUND)
+        message(STATUS "cuBLAS found")
+
+        enable_language(CUDA)
+
+        set(GGML_CUDA_SOURCES ggml-cuda.cu ggml-cuda.h)
+
+        add_compile_definitions(GGML_USE_CUBLAS)
+
+        if (WHISPER_STATIC)
+            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart_static CUDA::cublas_static CUDA::cublasLt_static)
+        else()
+            set(WHISPER_EXTRA_LIBS ${WHISPER_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
+        endif()
+
+    else()
+        message(WARNING "cuBLAS not found")
+    endif()
+endif()
+
  # compiler flags
  
  if (NOT CMAKE_BUILD_TYPE AND NOT CMAKE_CONFIGURATION_TYPES)
@@ -247,6 +273,7 @@ set(TARGET whisper)
  add_library(${TARGET}
      ggml.h
      ggml.c
+    ${GGML_CUDA_SOURCES}
      whisper.h
      whisper.cpp
      )
@@ -279,6 +306,12 @@ if (BUILD_SHARED_LIBS)
          )
  endif()
  
+if (GGML_CUDA_SOURCES)
+    message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
+    set_property(TARGET whisper PROPERTY CUDA_ARCHITECTURES OFF)
+    set_property(TARGET whisper PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
+endif()
+
  if (EMSCRIPTEN)
      set_target_properties(${TARGET} PROPERTIES COMPILE_FLAGS "-msimd128")
  endif()
diff --git a/Makefile b/Makefile

index 4282df0ac3131ac27a5ad91dcd5877ee02c20c95..413a681a7ec70bdc19d2cce5db1e353b8cc44fa2 100644 (file)
--- a/Makefile
+++ b/Makefile
@@ -1,3 +1,5 @@
+default: main bench
+
  ifndef UNAME_S
  UNAME_S := $(shell uname -s)
  endif
@@ -157,6 +159,18 @@ ifdef WHISPER_OPENBLAS
         LDFLAGS += -lopenblas
  endif
  
+ifdef WHISPER_CUBLAS
+       CFLAGS      += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+       CXXFLAGS    += -DGGML_USE_CUBLAS -I/usr/local/cuda/include -I/opt/cuda/include -I$(CUDA_PATH)/targets/x86_64-linux/include
+       LDFLAGS     += -lcublas -lculibos -lcudart -lcublasLt -lpthread -ldl -lrt -L/usr/local/cuda/lib64 -L/opt/cuda/lib64 -L$(CUDA_PATH)/targets/x86_64-linux/lib
+       WHISPER_OBJ += ggml-cuda.o
+       NVCC        = nvcc
+       NVCCFLAGS   = --forward-unknown-to-host-compiler -arch=native
+
+ggml-cuda.o: ggml-cuda.cu ggml-cuda.h
+       $(NVCC) $(NVCCFLAGS) $(CXXFLAGS) -Wno-pedantic -c $< -o $@
+endif
+
  ifdef WHISPER_GPROF
         CFLAGS   += -pg
         CXXFLAGS += -pg
@@ -200,20 +214,18 @@ $(info I CC:       $(CCV))
  $(info I CXX:      $(CXXV))
  $(info )
  
-default: main bench
-
  #
  # Build library
  #
  
-ggml.o: ggml.c ggml.h
-       $(CC)  $(CFLAGS)   -c ggml.c -o ggml.o
+ggml.o: ggml.c ggml.h ggml-cuda.h
+       $(CC)  $(CFLAGS)   -c $< -o $@
  
-whisper.o: whisper.cpp whisper.h ggml.h
-       $(CXX) $(CXXFLAGS) -c whisper.cpp -o whisper.o
+whisper.o: whisper.cpp whisper.h ggml.h ggml-cuda.h
+       $(CXX) $(CXXFLAGS) -c $< -o $@
  
  ifndef WHISPER_COREML
-WHISPER_OBJ = whisper.o
+WHISPER_OBJ += whisper.o
  else
  whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
         $(CXX) -O3 -I . -c coreml/whisper-encoder.mm -o whisper-encoder.o
@@ -221,7 +233,7 @@ whisper-encoder.o: coreml/whisper-encoder.mm coreml/whisper-encoder.h
  whisper-encoder-impl.o: coreml/whisper-encoder-impl.m coreml/whisper-encoder-impl.h
         $(CXX) -O3 -I . -fobjc-arc -c coreml/whisper-encoder-impl.m -o whisper-encoder-impl.o
  
-WHISPER_OBJ = whisper.o whisper-encoder.o whisper-encoder-impl.o
+WHISPER_OBJ += whisper.o whisper-encoder.o whisper-encoder-impl.o
  endif
  
  libwhisper.a: ggml.o $(WHISPER_OBJ)
diff --git a/README.md b/README.md

index c68025883e619f0ab3cb474db91867ee1bffb0dd..dd54d04abb7632a492033b360cbdfa8ccd42f848 100644 (file)
--- a/README.md
+++ b/README.md
@@ -18,6 +18,7 @@ High-performance inference of [OpenAI's Whisper](https://github.com/openai/whisp
  - Low memory usage (Flash Attention)
  - Zero memory allocations at runtime
  - Runs on the CPU
+- [Partial GPU support for NVIDIA via cuBLAS](https://github.com/ggerganov/whisper.cpp#nvidia-gpu-support-via-cublas)
  - [C-style API](https://github.com/ggerganov/whisper.cpp/blob/master/whisper.h)
  
  Supported platforms:
@@ -254,7 +255,7 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
    # using Makefile
    make clean
    WHISPER_COREML=1 make -j
-  
+
    # using CMake
    cd build
    cmake -DWHISPER_COREML=1 ..
@@ -271,20 +272,33 @@ speed-up - more than x3 faster compared with CPU-only execution. Here are the in
    whisper_init_state: first run on a device may take a while ...
    whisper_init_state: Core ML model loaded
  
-  system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | COREML = 1 | 
+  system_info: n_threads = 4 / 10 | AVX = 0 | AVX2 = 0 | AVX512 = 0 | FMA = 0 | NEON = 1 | ARM_FMA = 1 | F16C = 0 | FP16_VA = 1 | WASM_SIMD = 0 | BLAS = 1 | SSE3 = 0 | VSX = 0 | COREML = 1 |
  
    ...
    ```
  
    The first run on a device is slow, since the ANE service compiles the Core ML model to some device-specific format.
    Next runs are faster.
-  
+
  For more information about the Core ML implementation please refer to PR [#566](https://github.com/ggerganov/whisper.cpp/pull/566).
-  
+
+## NVIDIA GPU support via cuBLAS
+
+With NVIDIA cards, the Encoder processing can be offloaded to the GPU to a large extent through cuBLAS.
+First, make sure you have installed `cuda`: https://developer.nvidia.com/cuda-downloads
+
+Now build `whisper.cpp` with cuBLAS support:
+
+```
+make clean
+WHISPER_CUBLAS=1 make -j
+```
+
+Run all the examples as usual.
+
  ## Limitations
  
  - Inference only
-- No GPU support (yet)
  
  ## Another example
  
@@ -429,7 +443,7 @@ system_info: n_threads = 4 / 10 | AVX2 = 0 | AVX512 = 0 | NEON = 1 | FP16_VA = 1
  
  main: processing './samples/jfk.wav' (176000 samples, 11.0 sec), 4 threads, 1 processors, lang = en, task = transcribe, timestamps = 1 ...
  
-[00:00:00.000 --> 00:00:00.320]  
+[00:00:00.000 --> 00:00:00.320]
  [00:00:00.320 --> 00:00:00.370]   And
  [00:00:00.370 --> 00:00:00.690]   so
  [00:00:00.690 --> 00:00:00.850]   my
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt

index 6687824f7269a74dd5559bc658459f82b0d1e1f5..a2ef07c9c0ff5ed4a5015304582c641a04c9cf19 100644 (file)
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -4,7 +4,7 @@ find_package(Threads REQUIRED)
  
  # third-party
  
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
      # SDL2
      find_package(SDL2 REQUIRED)
  
@@ -27,7 +27,7 @@ include(DefaultTargetOptions)
  
  set_target_properties(${TARGET} PROPERTIES POSITION_INDEPENDENT_CODE ON)
  
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
      # common-sdl
  
      set(TARGET common-sdl)
diff --git a/examples/command/CMakeLists.txt b/examples/command/CMakeLists.txt

index 5a97f3c271b6b9065861aef9535c85dd78212488..40f278c1813326849c827f76e7b885d895ebdeb6 100644 (file)
--- a/examples/command/CMakeLists.txt
+++ b/examples/command/CMakeLists.txt
@@ -1,4 +1,4 @@
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
      # command
      set(TARGET command)
      add_executable(${TARGET} command.cpp)
diff --git a/examples/stream/CMakeLists.txt b/examples/stream/CMakeLists.txt

index 49dea54cf2193b6d3244ecc99cd6174aa4cf501a..312d52c6b0dad445c0be2d059bd4ed8073b8905f 100644 (file)
--- a/examples/stream/CMakeLists.txt
+++ b/examples/stream/CMakeLists.txt
@@ -1,4 +1,4 @@
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
      # stream
      set(TARGET stream)
      add_executable(${TARGET} stream.cpp)
diff --git a/examples/talk-llama/CMakeLists.txt b/examples/talk-llama/CMakeLists.txt

index f25d06888f9302b3284be9c4f3a8f5e5f8a91e9c..cbdfb4177ec3c43367dfdd6c9c5b97914d7fc1ef 100644 (file)
--- a/examples/talk-llama/CMakeLists.txt
+++ b/examples/talk-llama/CMakeLists.txt
@@ -1,4 +1,4 @@
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
      # talk-llama
      set(TARGET talk-llama)
      #add_executable(${TARGET} talk-llama.cpp llama.cpp)
diff --git a/examples/talk/CMakeLists.txt b/examples/talk/CMakeLists.txt

index 31166f62f16bb545d5e61c82dd9dec0c2848a643..c829ec5e017e13ed992db1955b657bf2c844ddf4 100644 (file)
--- a/examples/talk/CMakeLists.txt
+++ b/examples/talk/CMakeLists.txt
@@ -1,4 +1,4 @@
-if (WHISPER_SUPPORT_SDL2)
+if (WHISPER_SDL2)
      # talk
      set(TARGET talk)
      #add_executable(${TARGET} talk.cpp gpt-2.cpp)
diff --git a/whisper.cpp b/whisper.cpp

index 2c489b92466f3712df02b2b2480fafb2e9b1ee9e..ebeaa4b45564e6e8e2d1615484089a72447d6eaf 100644 (file)
--- a/whisper.cpp
+++ b/whisper.cpp
@@ -102,7 +102,7 @@ static void byteswap_tensor(ggml_tensor * tensor) {
  #define WHISPER_PRINT_DEBUG(...)
  #endif
  
-#define WHISPER_USE_FLASH_ATTN
+//#define WHISPER_USE_FLASH_ATTN
  //#define WHISPER_USE_FLASH_FF
  #define WHISPER_MAX_DECODERS 16
  
@@ -224,11 +224,11 @@ static const std::map<std::string, std::pair<int, std::string>> g_lang = {
  static const size_t MB = 1ull*1024*1024;
  
  static const std::map<e_model, size_t> MEM_REQ_SCRATCH0 = {
-    { MODEL_TINY,     14ull*MB },
-    { MODEL_BASE,     18ull*MB },
-    { MODEL_SMALL,    28ull*MB },
-    { MODEL_MEDIUM,   36ull*MB },
-    { MODEL_LARGE,    44ull*MB },
+    { MODEL_TINY,     62ull*MB },
+    { MODEL_BASE,     80ull*MB },
+    { MODEL_SMALL,   120ull*MB },
+    { MODEL_MEDIUM,  158ull*MB },
+    { MODEL_LARGE,   198ull*MB },
  };
  
  static const std::map<e_model, size_t> MEM_REQ_SCRATCH1 = {
@@ -280,11 +280,11 @@ static const std::map<e_model, size_t> MEM_REQ_KV_CROSS = {
  };
  
  static const std::map<e_model, size_t> MEM_REQ_ENCODE = {
-    { MODEL_TINY,      6ull*MB },
-    { MODEL_BASE,      8ull*MB },
-    { MODEL_SMALL,    13ull*MB },
-    { MODEL_MEDIUM,   22ull*MB },
-    { MODEL_LARGE,    33ull*MB },
+    { MODEL_TINY,     30ull*MB },
+    { MODEL_BASE,     38ull*MB },
+    { MODEL_SMALL,    56ull*MB },
+    { MODEL_MEDIUM,   74ull*MB },
+    { MODEL_LARGE,    94ull*MB },
  };
  
  static const std::map<e_model, size_t> MEM_REQ_DECODE = {
@@ -1554,26 +1554,17 @@ static bool whisper_encode_internal(
  
                  struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_scaled);
  
-                //struct ggml_tensor * V_trans =
-                //    ggml_permute(ctx0,
-                //            ggml_cpy(ctx0,
-                //                Vcur,
-                //                ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_head, n_ctx)),
-                //            1, 2, 0, 3);
-
-                //struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
-
                  struct ggml_tensor * V =
                      ggml_cpy(ctx0,
                              ggml_permute(ctx0,
                                  ggml_reshape_3d(ctx0,
                                      Vcur,
                                      n_state/n_head, n_head, n_ctx),
-                                0, 2, 1, 3),
-                            ggml_new_tensor_3d(ctx0, wctx.wtype, n_state/n_head, n_ctx, n_head)
+                                1, 2, 0, 3),
+                            ggml_new_tensor_3d(ctx0, wctx.wtype, n_ctx, n_state/n_head, n_head)
                              );
  
-                struct ggml_tensor * KQV = ggml_mul_mat(ctx0, ggml_transpose(ctx0, V), KQ_soft_max);
+                struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V, KQ_soft_max);
  #endif
                  struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
author	Georgi Gerganov <redacted>
	Sun, 30 Apr 2023 09:14:33 +0000 (12:14 +0300)
committer	GitHub <redacted>
	Sun, 30 Apr 2023 09:14:33 +0000 (12:14 +0300)
.gitignore		patch \| blob \| history
CMakeLists.txt		patch \| blob \| history
Makefile		patch \| blob \| history
README.md		patch \| blob \| history
examples/CMakeLists.txt		patch \| blob \| history
examples/command/CMakeLists.txt		patch \| blob \| history
examples/stream/CMakeLists.txt		patch \| blob \| history
examples/talk-llama/CMakeLists.txt		patch \| blob \| history
examples/talk/CMakeLists.txt		patch \| blob \| history
whisper.cpp		patch \| blob \| history