Add an option to build without CUDA VMM (#7067)

author William Tambellini <redacted>

Mon, 6 May 2024 18:12:14 +0000 (11:12 -0700)

committer GitHub <redacted>

Mon, 6 May 2024 18:12:14 +0000 (20:12 +0200)
author William Tambellini <redacted>
Mon, 6 May 2024 18:12:14 +0000 (11:12 -0700)
committer GitHub <redacted>
Mon, 6 May 2024 18:12:14 +0000 (20:12 +0200)
diff --git a/CMakeLists.txt b/CMakeLists.txt

index 477c5b57c20e7deb7fdfebd277c5ad05d1c7b397..0e22ee23018788134dc19e60486237a9de3ffe7f 100644 (file)
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -103,6 +103,8 @@ set(LLAMA_CUDA_KQUANTS_ITER "2" CACHE STRING "llama: iters./thread per block for
  set(LLAMA_CUDA_PEER_MAX_BATCH_SIZE "128" CACHE STRING
                                               "llama: max. batch size for using peer access")
  option(LLAMA_CUDA_NO_PEER_COPY               "llama: do not use peer to peer copies"            OFF)
+option(LLAMA_CUDA_NO_VMM                     "llama: do not try to use CUDA VMM"                OFF)
+
  option(LLAMA_CURL                            "llama: use libcurl to download model from an URL" OFF)
  option(LLAMA_HIPBLAS                         "llama: use hipBLAS"                               OFF)
  option(LLAMA_HIP_UMA                         "llama: use HIP unified memory architecture"       OFF)
@@ -409,6 +411,9 @@ if (LLAMA_CUDA)
          if (LLAMA_CUDA_FORCE_MMQ)
              add_compile_definitions(GGML_CUDA_FORCE_MMQ)
          endif()
+        if (LLAMA_CUDA_NO_VMM)
+            add_compile_definitions(GGML_CUDA_NO_VMM)
+        endif()
          add_compile_definitions(GGML_CUDA_DMMV_X=${LLAMA_CUDA_DMMV_X})
          add_compile_definitions(GGML_CUDA_MMV_Y=${LLAMA_CUDA_MMV_Y})
          if (DEFINED LLAMA_CUDA_DMMV_Y)
@@ -434,7 +439,11 @@ if (LLAMA_CUDA)
              set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
          endif()
  
-        set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver)
+        if (LLAMA_CUDA_NO_VMM)
+            # No VMM requested, no need to link directly with the cuda driver lib (libcuda.so)
+        else()
+            set(LLAMA_EXTRA_LIBS ${LLAMA_EXTRA_LIBS} CUDA::cuda_driver) # required by cuDeviceGetAttribute(), cuMemGetAllocationGranularity(...), ...
+        endif()
  
      if (NOT DEFINED CMAKE_CUDA_ARCHITECTURES)
          # 52 == lowest CUDA 12 standard
diff --git a/ggml-cuda.cu b/ggml-cuda.cu

index c30554f0cbe378e9b1102a56c9f697504395b955..2d1742c82a265ae07b0bb4a713e8d9a2c4de64f3 100644 (file)
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -113,7 +113,7 @@ static ggml_cuda_device_info ggml_cuda_init() {
      for (int id = 0; id < info.device_count; ++id) {
          int device_vmm = 0;
  
-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
          CUdevice device;
          CU_CHECK(cuDeviceGet(&device, id));
          CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
@@ -259,7 +259,7 @@ struct ggml_cuda_pool_leg : public ggml_cuda_pool {
  };
  
  // pool with virtual memory
-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
  struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
      static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
  
@@ -356,7 +356,7 @@ struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
  #endif // !defined(GGML_USE_HIPBLAS)
  
  std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if !defined(GGML_USE_HIPBLAS)
+#if !defined(GGML_USE_HIPBLAS) && !defined(GGML_CUDA_NO_VMM)
      if (ggml_cuda_info().devices[device].vmm) {
          return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
      }
author	William Tambellini <redacted>
	Mon, 6 May 2024 18:12:14 +0000 (11:12 -0700)
committer	GitHub <redacted>
	Mon, 6 May 2024 18:12:14 +0000 (20:12 +0200)
CMakeLists.txt		patch | blob | history
ggml-cuda.cu		patch | blob | history