option(GGML_HIP_GRAPHS "ggml: use HIP graph, experimental, slow" OFF)
option(GGML_HIP_NO_VMM "ggml: do not try to use HIP VMM" ON)
option(GGML_HIP_ROCWMMA_FATTN "ggml: enable rocWMMA for FlashAttention" OFF)
-option(GGML_HIP_UMA "ggml: use HIP unified memory architecture" OFF)
option(GGML_VULKAN "ggml: use Vulkan" OFF)
option(GGML_VULKAN_CHECK_RESULTS "ggml: run Vulkan op checks" OFF)
option(GGML_VULKAN_DEBUG "ggml: enable Vulkan debug output" OFF)
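The compile-time GGML_HIP_UMA switch above is removed; unified memory is now opted into at runtime via the same GGML_CUDA_ENABLE_UNIFIED_MEMORY environment variable the CUDA backend already honors, so a single HIP build can serve both discrete GPUs and APUs. A minimal sketch of the runtime check (illustrative only, not code from this PR; the helper name is hypothetical):

```cpp
#include <cstdlib>

// Returns true when the user requested managed (unified memory) allocations.
// Mirrors the diff below: mere presence of the variable enables the path,
// e.g. GGML_CUDA_ENABLE_UNIFIED_MEMORY=1.
static bool unified_memory_requested() {
    return std::getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr;
}
```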
static cudaError_t ggml_cuda_device_malloc(void ** ptr, size_t size, int device) {
    ggml_cuda_set_device(device);
-#if defined(GGML_USE_HIP) && defined(GGML_HIP_UMA)
-    auto res = hipMallocManaged(ptr, size);
-    if (res == hipSuccess) {
-        // if error we "need" to know why...
-        CUDA_CHECK(hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
-    }
-    return res;
-#else
-
-#if !defined(GGML_USE_HIP)
    cudaError_t err;
    if (getenv("GGML_CUDA_ENABLE_UNIFIED_MEMORY") != nullptr)
    {
        err = cudaMallocManaged(ptr, size);
+#if defined(GGML_USE_HIP)
+        if (err == hipSuccess) {
+            CUDA_CHECK(cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device));
+        }
+
+        // fall back to cudaMalloc if not supported (e.g. on Windows)
+        if (err == hipErrorNotSupported) {
+            static bool warned_unsupported = false;
+            if (!warned_unsupported) {
+                GGML_LOG_WARN("hipMallocManaged unsupported, falling back to hipMalloc.\n");
+                warned_unsupported = true;
+            }
+
+            err = cudaMalloc(ptr, size);
+        }
+#endif // defined(GGML_USE_HIP)
    }
    else
    {
        err = cudaMalloc(ptr, size);
    }
    return err;
-#else
-    return cudaMalloc(ptr, size);
-#endif // !defined(GGML_USE_HIP)
-
-#endif
}
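For HIP builds, the managed path now attempts hipMallocManaged, applies coarse-grain advice on success, and falls back to hipMalloc when managed memory is unsupported (e.g. on Windows), warning once. A standalone sketch of that pattern, assuming a ROCm toolchain; the helper name and the stderr warning are illustrative, not part of the PR:

```cpp
#include <hip/hip_runtime.h>
#include <cstdio>

// Try a managed (unified memory) allocation, advise coarse-grain on success,
// and fall back to a plain device allocation when managed memory is
// unsupported, emitting a one-time warning.
static hipError_t device_malloc_uma(void ** ptr, size_t size, int device) {
    hipError_t err = hipMallocManaged(ptr, size);
    if (err == hipSuccess) {
        // Coarse-grain pages avoid the coherency overhead of the default
        // fine-grain managed memory on AMD GPUs.
        return hipMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device);
    }
    if (err == hipErrorNotSupported) {
        static bool warned = false;
        if (!warned) {
            fprintf(stderr, "hipMallocManaged unsupported, falling back to hipMalloc.\n");
            warned = true;
        }
        err = hipMalloc(ptr, size);
    }
    return err;
}
```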
#if defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)
#define cudaLaunchHostFunc hipLaunchHostFunc
#define cudaMalloc hipMalloc
#define cudaMallocHost(ptr, size) hipHostMalloc(ptr, size, hipHostMallocDefault)
+#define cudaMallocManaged hipMallocManaged
+#define cudaMemAdvise hipMemAdvise
#define cudaMemcpy hipMemcpy
#define cudaMemcpyAsync hipMemcpyAsync
#define cudaMemcpyPeerAsync hipMemcpyPeerAsync
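The two new aliases (cudaMallocManaged and cudaMemAdvise) extend the hip.h compatibility shim so the shared CUDA source above compiles for HIP without per-call #ifdefs. An abridged illustration of how the shim works; the alias lines are in the spirit of the same header, the helper function is hypothetical:

```cpp
#include <hip/hip_runtime.h>

// Abridged aliases in the spirit of vendors/hip.h:
#define cudaError_t       hipError_t
#define cudaSuccess       hipSuccess
#define cudaMallocManaged hipMallocManaged
#define cudaMemAdvise     hipMemAdvise

// "CUDA" code now compiles against HIP unchanged (hypothetical helper);
// note that hipMemAdviseSetCoarseGrain is HIP-only and used unaliased,
// just as in the diff above.
static cudaError_t alloc_managed_coarse(void ** ptr, size_t size, int device) {
    cudaError_t err = cudaMallocManaged(ptr, size);
    if (err == cudaSuccess) {
        err = cudaMemAdvise(*ptr, size, hipMemAdviseSetCoarseGrain, device);
    }
    return err;
}
```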