Replace the compile-time `GGML_HIP_UMA` option with the environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY`. This unifies usage across NVIDIA and AMD GPUs and allows a single binary to be shared between integrated and dedicated GPUs.
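As a usage sketch of the change, the toggle moves from build time to run time; the `llama-cli` binary path and `model.gguf` below are placeholders:

```sh
# before: unified memory had to be baked in at build time
cmake -S . -B build -DGGML_HIP=ON -DGGML_HIP_UMA=ON

# after: one build, opted in at run time via the environment
GGML_CUDA_ENABLE_UNIFIED_MEMORY=1 ./build/bin/llama-cli -m model.gguf
```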
MK_CPPFLAGS += -DGGML_USE_HIP -DGGML_USE_CUDA
-ifdef GGML_HIP_UMA
- MK_CPPFLAGS += -DGGML_HIP_UMA
-endif # GGML_HIP_UMA
-
MK_LDFLAGS += -L$(ROCM_PATH)/lib -Wl,-rpath=$(ROCM_PATH)/lib
MK_LDFLAGS += -L$(ROCM_PATH)/lib64 -Wl,-rpath=$(ROCM_PATH)/lib64
MK_LDFLAGS += -lhipblas -lamdhip64 -lrocblas
```sh
cmake -S . -B build -DGGML_HIP=ON -DAMDGPU_TARGETS=gfx1030 -DCMAKE_BUILD_TYPE=Release \
    && cmake --build build --config Release -- -j 16
```
- On Linux it is also possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting `-DGGML_HIP_UMA=ON`.
- However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
To enhance flash attention performance on RDNA3+ or CDNA architectures, you can utilize the rocWMMA library by enabling the `-DGGML_HIP_ROCWMMA_FATTN=ON` option. This requires rocWMMA headers to be installed on the build system.
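For example, the build command above could be extended as follows; `gfx1100` is just a representative RDNA3 target and is not prescribed by this change:

```sh
cmake -S . -B build -DGGML_HIP=ON -DGGML_HIP_ROCWMMA_FATTN=ON \
    -DAMDGPU_TARGETS=gfx1100 -DCMAKE_BUILD_TYPE=Release \
    && cmake --build build --config Release -- -j 16
```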
The environment variable [`HIP_VISIBLE_DEVICES`](https://rocm.docs.amd.com/en/latest/understand/gpu_isolation.html#hip-visible-devices) can be used to specify which GPU(s) will be used.
If your GPU is not officially supported you can use the environment variable [`HSA_OVERRIDE_GFX_VERSION`] set to a similar GPU, for example 10.3.0 on RDNA2 (e.g. gfx1030, gfx1031, or gfx1035) or 11.0.0 on RDNA3.
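A sketch of how these variables can be combined, assuming a single unsupported RDNA2 card; the device index, binary path, and model path are placeholders:

```sh
# restrict ggml to the first ROCm device (index 0 is an example)
export HIP_VISIBLE_DEVICES=0
# present an unsupported RDNA2 GPU (e.g. gfx1031) as gfx1030
export HSA_OVERRIDE_GFX_VERSION=10.3.0
./build/bin/llama-cli -m model.gguf
```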
+### Unified Memory
+
+On Linux it is possible to use unified memory architecture (UMA) to share main memory between the CPU and integrated GPU by setting the environment variable `GGML_CUDA_ENABLE_UNIFIED_MEMORY=1`. However, this hurts performance for non-integrated GPUs (but enables working with integrated GPUs).
+
## Vulkan
**Windows**
option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
-option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF)
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
ggml_cuda_set_device(device);
-#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
- auto res = hipMallocManaged(ptr, size);
- if (res == hipSuccess) {
- // if error we "need" to know why...
- CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
- }
- return res;
-#else
-
-#if !defined(GGML_USE_HIP)
cudaError_t err;
if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
{
err = cudaMallocManaged(ptr, size);
+#if defined(GGML_USE_HIP)
+ if (err == hipSuccess) {
+ CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
+ }
+
+ // fall back to cudaMalloc if not supported (e.g. on Windows)
+ if (err == hipErrorNotSupported) {
+ static bool warned_unsupported = false;
+ if (!warned_unsupported) {
+ GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n");
+ warned_unsupported = true;
+ }
+
+ err = cudaMalloc(ptr, size);
+ }
+#endif // defined(GGML_USE_HIP)
}
else
{
err = cudaMalloc(ptr, size);
}
return err;
-#else
- return cudaMalloc(ptr, size);
-#endif // !defined(GGML_USE_HIP)
-
-#endif
}
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#define cudaLaunchHostFunc hipLaunchHostFunc
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMallocManaged hipMallocManaged
+#define cudaMemAdvise hipMemAdvise
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
add_compile_definitions(GGML_USE_HIP)
-if (GGML_HIP_UMA)
- add_compile_definitions(GGML_HIP_UMA)
-endif()
-
if (GGML_CUDA_FORCE_MMQ)
add_compile_definitions(GGML_CUDA_FORCE_MMQ)
endif()