[[noreturn]]
void ggml_cuda_error(const char * stmt, const char * func, const char * file, int line, const char * msg) {
int id = -1; // in case cudaGetDevice fails
- cudaGetDevice(&id);
+ (void)cudaGetDevice(&id);
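+ // the return value is deliberately discarded: if cudaGetDevice fails, id stays at -1
+ // and the original error is still reported below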
GGML_LOG_ERROR(GGML_CUDA_NAME " error: %s\n", msg);
GGML_LOG_ERROR(" current device: %d, in function %s at %s:%d\n", id, func, file, line);
for (int id = 0; id < info.device_count; ++id) {
int device_vmm = 0;
-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_CUDA_NO_VMM)
CUdevice device;
CU_CHECK(cuDeviceGet(&device, id));
CU_CHECK(cuDeviceGetAttribute(&device_vmm, CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED, device));
alloc_prop.location.id = id;
CU_CHECK(cuMemGetAllocationGranularity(&info.devices[id].vmm_granularity, &alloc_prop, CU_MEM_ALLOC_GRANULARITY_RECOMMENDED));
}
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_CUDA_NO_VMM)
info.devices[id].vmm = !!device_vmm;
cudaDeviceProp prop;
};
// pool with virtual memory
-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_CUDA_NO_VMM)
struct ggml_cuda_pool_vmm : public ggml_cuda_pool {
static const size_t CUDA_POOL_VMM_MAX_SIZE = 1ull << 35; // 32 GB
size_t pool_used = 0;
size_t pool_size = 0;
size_t granularity;
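+ // (HIP only) record every cuMemMap'd range so the destructor can unmap the
+ // mappings one by one (see the ROCR-Runtime workaround below)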
+#if defined(GGML_USE_HIP)
+ std::vector<std::pair<CUdeviceptr, size_t>> mappings;
+#endif
explicit ggml_cuda_pool_vmm(int device) :
device(device),
~ggml_cuda_pool_vmm() {
if (pool_addr != 0) {
+#if defined(GGML_USE_HIP)
+ // Workaround for https://github.com/ROCm/ROCR-Runtime/issues/285
+ for (std::pair<CUdeviceptr, size_t> & mapping : mappings) {
+ CU_CHECK(cuMemUnmap(mapping.first, mapping.second));
+ }
+#else
CU_CHECK(cuMemUnmap(pool_addr, pool_size));
+#endif
CU_CHECK(cuMemAddressFree(pool_addr, CUDA_POOL_VMM_MAX_SIZE));
}
}
}
// map at the end of the pool
- CU_CHECK(cuMemMap(pool_addr + pool_size, reserve_size, 0, handle, 0));
+ CUdeviceptr start_ptr = (CUdeviceptr)((char *)(pool_addr) + pool_size);
+ CU_CHECK(cuMemMap(start_ptr, reserve_size, 0, handle, 0));
+#if defined(GGML_USE_HIP)
+ mappings.push_back({start_ptr, reserve_size});
+#endif
// the memory allocation handle is no longer needed after mapping
CU_CHECK(cuMemRelease(handle));
access.location.type = CU_MEM_LOCATION_TYPE_DEVICE;
access.location.id = device;
access.flags = CU_MEM_ACCESS_FLAGS_PROT_READWRITE;
- CU_CHECK(cuMemSetAccess(pool_addr + pool_size, reserve_size, &access, 1));
+ CU_CHECK(cuMemSetAccess(start_ptr, reserve_size, &access, 1));
// add to the pool
pool_size += reserve_size;
GGML_ASSERT(pool_addr != 0);
- void * ptr = (void *) (pool_addr + pool_used);
+ void * ptr = (void *) ((CUdeviceptr)((char *)(pool_addr) + pool_used));
*actual_size = size;
pool_used += size;
pool_used -= size;
// all deallocations must be in reverse order of the allocations
- GGML_ASSERT(ptr == (void *) (pool_addr + pool_used));
+ GGML_ASSERT(ptr == (void *) ((char *)(pool_addr) + pool_used));
}
};
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_CUDA_NO_VMM)
std::unique_ptr<ggml_cuda_pool> ggml_backend_cuda_context::new_pool_for_device(int device) {
-#if !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#if !defined(GGML_CUDA_NO_VMM)
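+ // devices that report virtual memory management support (queried at init time above)
+ // get the VMM pool; everything else falls back to the legacy pool below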
if (ggml_cuda_info().devices[device].vmm) {
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_vmm(device));
}
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_CUDA_NO_VMM)
+#endif // !defined(GGML_CUDA_NO_VMM)
return std::unique_ptr<ggml_cuda_pool>(new ggml_cuda_pool_leg(device));
}
cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
if (err != cudaSuccess) {
// clear the error
- cudaGetLastError();
+ (void)cudaGetLastError();
GGML_LOG_ERROR("%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size / 1024.0 / 1024.0, buft_ctx->device, cudaGetErrorString(err));
return nullptr;
}
cudaError_t err = cudaMallocHost((void **) &ptr, size);
if (err != cudaSuccess) {
// clear the error
- cudaGetLastError();
+ (void)cudaGetLastError();
GGML_LOG_DEBUG("%s: failed to allocate %.2f MiB of pinned memory: %s\n", __func__,
size / 1024.0 / 1024.0, cudaGetErrorString(err));
return nullptr;
CUDA_CHECK(err);
} else {
// reset the error
- cudaGetLastError();
+ (void)cudaGetLastError();
}
} else {
cudaError_t err = cudaDeviceDisablePeerAccess(id_other);
CUDA_CHECK(err);
} else {
// reset the error
- cudaGetLastError();
+ (void)cudaGetLastError();
}
}
}
if (stat == cudaErrorInvalidDeviceFunction) {
// Fails due to incorrect handling of CUDA BLAS nodes by the CUDA runtime.
// We don't need to update BLAS nodes, so clear the error and move on.
- cudaGetLastError();
+ (void)cudaGetLastError();
} else {
GGML_ASSERT(stat == cudaSuccess);
}
static void update_cuda_graph_executable(ggml_backend_cuda_context * cuda_ctx) {
cudaGraphExecUpdateResultInfo result_info;
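+ // hipGraphExecUpdate keeps the older four-argument signature that returns the failing
+ // node through a separate out-parameter, hence the HIP-specific call below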
+#ifdef __HIP_PLATFORM_AMD__
+ hipGraphNode_t errorNode;
+ hipError_t stat = hipGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &errorNode, &result_info);
+#else
cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
+#endif
if (stat == cudaErrorGraphExecUpdateFailure) {
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
#endif
+
// The pre-existing graph exec cannot be updated due to violated constraints
// so instead clear error and re-instantiate
- cudaGetLastError();
+ (void)cudaGetLastError();
CUDA_CHECK(cudaGraphExecDestroy(cuda_ctx->cuda_graph->instance));
cuda_ctx->cuda_graph->instance = nullptr;
CUDA_CHECK(cudaGraphInstantiate(&cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, NULL, NULL, 0));
cudaError_t err = cudaHostRegister(buffer, size, cudaHostRegisterPortable | cudaHostRegisterReadOnly);
if (err != cudaSuccess) {
// clear the error
- cudaGetLastError();
+ (void)cudaGetLastError();
GGML_LOG_DEBUG("%s: failed to register %.2f MiB of pinned memory: %s\n", __func__,
size / 1024.0 / 1024.0, cudaGetErrorString(err));
cudaError_t err = cudaHostUnregister(buffer);
if (err != cudaSuccess) {
// clear the error
- cudaGetLastError();
+ (void)cudaGetLastError();
}
}
#define CUBLAS_TF32_TENSOR_OP_MATH 0
#define CUDA_R_16F HIPBLAS_R_16F
#define CUDA_R_32F HIPBLAS_R_32F
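+// driver API (CU*/cu*) symbols used by the VMM pool, mapped to their HIP equivalents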
+#define CU_DEVICE_ATTRIBUTE_VIRTUAL_MEMORY_MANAGEMENT_SUPPORTED hipDeviceAttributeVirtualMemoryManagementSupported
+#define CU_MEM_ALLOC_GRANULARITY_RECOMMENDED hipMemAllocationGranularityRecommended
+#define CU_MEM_ALLOCATION_TYPE_PINNED hipMemAllocationTypePinned
+#define CU_MEM_LOCATION_TYPE_DEVICE hipMemLocationTypeDevice
+#define CU_MEM_ACCESS_FLAGS_PROT_READWRITE hipMemAccessFlagsProtReadWrite
+#define CU_CHECK(fn) {hipError_t err = fn; if(err != hipSuccess) { GGML_ABORT("HipVMM Failure: %s\n", hipGetErrorString(err)); }}
#define __shfl_xor_sync(mask, var, laneMask, width) __shfl_xor(var, laneMask, width)
#define cublasComputeType_t hipblasDatatype_t // deprecated: the newer hipblasComputeType_t is not available in ROCm 5.6
#define cublasCreate hipblasCreate
#define cudaMemGetInfo hipMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice
+#define cuDeviceGet hipDeviceGet
+#define CUdevice hipDevice_t
+#define CUdeviceptr hipDeviceptr_t
+#define cuMemUnmap hipMemUnmap
+#define CUmemAccessDesc hipMemAccessDesc
+#define cuMemAddressFree hipMemAddressFree
+#define cuMemRelease hipMemRelease
+#define CUmemGenericAllocationHandle hipMemGenericAllocationHandle_t
+#define cuMemCreate hipMemCreate
+#define cuMemAddressReserve hipMemAddressReserve
+#define cuMemMap hipMemMap
+#define cuMemSetAccess hipMemSetAccess
+#define cuMemGetAllocationGranularity hipMemGetAllocationGranularity
+#define CUmemAllocationProp hipMemAllocationProp
+#define cuDeviceGetAttribute hipDeviceGetAttribute
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define cudaStreamDestroy hipStreamDestroy
#define cudaStreamFireAndForget hipStreamFireAndForget
#define cudaStreamPerThread hipStreamPerThread
#define cudaStreamSynchronize hipStreamSynchronize
#define cudaStreamWaitEvent(stream, event, flags) hipStreamWaitEvent(stream, event, flags)
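+// CUDA graph symbols used for graph capture and update, mapped to their HIP equivalents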
+#define cudaGraphExec_t hipGraphExec_t
+#define cudaGraphNode_t hipGraphNode_t
+#define cudaKernelNodeParams hipKernelNodeParams
+#define cudaGraphExecDestroy hipGraphExecDestroy
+#define cudaGraphLaunch hipGraphLaunch
+#define cudaErrorGraphExecUpdateFailure hipErrorGraphExecUpdateFailure
+#define cudaGraphExecUpdateResultInfo hipGraphExecUpdateResult
+#define cudaGraphNodeType hipGraphNodeType
+#define cudaGraphNodeTypeKernel hipGraphNodeTypeKernel
+#define cudaGraphInstantiate hipGraphInstantiate
+#define cudaStreamEndCapture hipStreamEndCapture
+#define cudaGraphDestroy hipGraphDestroy
+#define cudaGraphKernelNodeSetParams hipGraphKernelNodeSetParams
+#define cudaErrorInvalidDeviceFunction hipErrorInvalidDeviceFunction
+#define cudaGraphKernelNodeGetParams hipGraphKernelNodeGetParams
+#define cudaGraphNodeGetType hipGraphNodeGetType
+#define cudaGraphGetNodes hipGraphGetNodes
+#define cudaGraphExecUpdate hipGraphExecUpdate
+#define cudaStreamCaptureModeRelaxed hipStreamCaptureModeRelaxed
+#define cudaStreamBeginCapture hipStreamBeginCapture
+#define cudaGraph_t hipGraph_t
#define cudaStream_t hipStream_t
#define cudaSuccess hipSuccess
#define __trap() do { abort(); __builtin_unreachable(); } while(0)