static __global__ void mul_mat_vec(
const T * __restrict__ x, const float * __restrict__ y, float * __restrict__ dst, const int64_t ncols2, const int64_t stride_row,
const int64_t channel_ratio, const int64_t stride_channel_x, const int64_t stride_channel_y, const int64_t stride_channel_dst) {
- const int64_t row     = blockIdx.x;
- const int64_t channel = blockIdx.z;
- const int     tid     = threadIdx.x;
+ const int64_t row       = blockIdx.x;
+ const int64_t channel   = blockIdx.z;
+ const int     tid       = threadIdx.x;
+ constexpr int warp_size = ggml_cuda_get_physical_warp_size();
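// warp_size is the physical warp/wavefront width of the target architecture
// (32 on NVIDIA, 64 on AMD GCN/CDNA), resolved at compile time instead of the fixed WARP_SIZE macro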
x += (channel/channel_ratio)*stride_channel_x + row*stride_row;
y += channel*stride_channel_y;
extern __shared__ char data_mmv[];
float * buf_iw = (float *) data_mmv;
- if (block_size > WARP_SIZE) {
- if (tid < WARP_SIZE) {
+ if (block_size > warp_size) {
+ if (tid < warp_size) {
buf_iw[tid] = 0.0f;
}
__syncthreads();
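// ... (closing brace and the per-type dot-product loops over ncols2 omitted;
// unsupported element types land in the static_assert below)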
static_assert(std::is_same<T, void>::value, "unsupported type");
}
- sumf = warp_reduce_sum(sumf);
+ sumf = warp_reduce_sum<warp_size>(sumf);
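// when block_size spans more than one warp, the per-warp partial sums are combined
// through shared memory and reduced once more by the first warp: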
- if (block_size > WARP_SIZE) {
- buf_iw[tid/WARP_SIZE] = sumf;
+ if (block_size > warp_size) {
+ buf_iw[tid/warp_size] = sumf;
__syncthreads();
- if (tid >= WARP_SIZE) {
+ if (tid >= warp_size) {
return;
}
sumf = buf_iw[tid];
- sumf = warp_reduce_sum(sumf);
+ sumf = warp_reduce_sum<warp_size>(sumf);
}
if (tid != 0) {
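// ... (remainder of the kernel omitted; below, the host-side launcher picks the launch configuration)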
GGML_ASSERT(stride_row % 2 == 0);
GGML_ASSERT(nchannels_y % nchannels_x == 0);
const int64_t channel_ratio = nchannels_y / nchannels_x;
+ int device;
+ int warp_size;
- int64_t block_size_best = WARP_SIZE;
- int64_t niter_best = (ncols + 2*WARP_SIZE - 1) / (2*WARP_SIZE);
- for (int64_t block_size = 2*WARP_SIZE; block_size <= 256; block_size += WARP_SIZE) {
+ CUDA_CHECK(cudaGetDevice(&device));
+ warp_size = ggml_cuda_info().devices[device].warp_size;
+
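// search block sizes in steps of warp_size and keep the smallest one that minimizes
// the number of iterations each thread runs over the columns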
+ int64_t block_size_best = warp_size;
+ int64_t niter_best = (ncols + 2*warp_size - 1) / (2*warp_size);
+ int64_t max_block_size = 256;
+ if (ggml_cuda_info().devices[device].cc > GGML_CUDA_CC_OFFSET_AMD && ggml_cuda_info().devices[device].cc < GGML_CUDA_CC_RDNA1) {
+ max_block_size = 128;
+ }
+ for (int64_t block_size = 2*warp_size; block_size <= max_block_size; block_size += warp_size) {
const int64_t niter = (ncols + 2*block_size - 1) / (2*block_size);
if (niter < niter_best) {
niter_best      = niter;
block_size_best = block_size;
}
}
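// e.g. ncols = 256 with warp_size = 32 settles on block_size_best = 128 (one iteration
// per thread); warp_size floats of shared memory are enough for one reduction slot per warp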
- const int smem = WARP_SIZE*sizeof(float);
+ const int smem = warp_size*sizeof(float);
const dim3 block_nums(nrows, 1, nchannels_y);
const dim3 block_dims(block_size_best, 1, 1);
switch (block_size_best) {