git.djapps.eu Git - pkg/ggml/sources/whisper.cpp/commitdiff
CUDA/HIP: optimize mmv paths taken for HIP devices (llama/14324)
author uvos <redacted>
Mon, 23 Jun 2025 23:12:56 +0000 (01:12 +0200)
committer Georgi Gerganov <redacted>
Tue, 1 Jul 2025 14:54:53 +0000 (17:54 +0300)
Co-authored-by: Johannes Gäßler <redacted>
ggml/src/ggml-cuda/common.cuh
ggml/src/ggml-cuda/mmv.cu

index 1369bc2d9e5e3bd04e054501b47718918765781f..f6127aeee425a1eb4c3e2f3da65ae3c9111bdc42 100644 (file)
--- a/ggml/src/ggml-cuda/common.cuh
+++ b/ggml/src/ggml-cuda/common.cuh
@@ -263,7 +263,11 @@ static bool fp16_mma_hardware_available(const int cc) {
 }
 
 static bool bf16_mma_hardware_available(const int cc) {
-    return GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE;
+    return (GGML_CUDA_CC_IS_NVIDIA(cc) && cc >= GGML_CUDA_CC_AMPERE) || GGML_CUDA_CC_IS_CDNA(cc) || cc >= GGML_CUDA_CC_RDNA3;
+}
+
+static bool fp32_mma_hardware_available(const int cc) {
+    return GGML_CUDA_CC_IS_CDNA(cc);
 }
 
 // Volta technically had FP16 tensor cores but they work very differently compared to Turing and later.
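
As an aside, here is a minimal host-side sketch of how the predicates added above resolve for a few compute-capability values. The GGML_CUDA_CC_* constants and vendor checks below are simplified, hypothetical stand-ins for the real macros in common.cuh (which encode AMD families at an offset, with CDNA values below RDNA3), so this only illustrates the logic, not the actual encoding.

// Minimal sketch (not the real common.cuh): hypothetical stand-ins for the
// GGML_CUDA_CC_* macros, to show how the new predicates resolve.
#include <cstdio>

enum {
    CC_AMPERE   = 800,   // stand-in for GGML_CUDA_CC_AMPERE (NVIDIA)
    CC_AMD_BASE = 1000,  // values >= this are treated as AMD in this sketch
    CC_CDNA     = 1100,  // stand-in for a CDNA device
    CC_RDNA3    = 1200,  // stand-in for an RDNA3 device
};

static bool is_nvidia(int cc) { return cc <  CC_AMD_BASE; }
static bool is_cdna  (int cc) { return cc >= CC_CDNA && cc < CC_RDNA3; }

// Mirrors the new bf16 predicate: Ampere+ on NVIDIA, or CDNA / RDNA3+ on AMD.
static bool bf16_mma_hardware_available(int cc) {
    return (is_nvidia(cc) && cc >= CC_AMPERE) || is_cdna(cc) || cc >= CC_RDNA3;
}

// Mirrors the new fp32 predicate: only CDNA exposes FP32 MMA.
static bool fp32_mma_hardware_available(int cc) {
    return is_cdna(cc);
}

int main() {
    const int ccs[] = { 750 /* Turing-like */, 860 /* Ampere-like */, CC_CDNA, CC_RDNA3 };
    for (int cc : ccs) {
        std::printf("cc=%d bf16_mma=%d fp32_mma=%d\n",
                    cc, bf16_mma_hardware_available(cc), fp32_mma_hardware_available(cc));
    }
    return 0;
}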
index 1502e9d942fbcc700843d338f93deadd8e8a7d8b..e14c93516bddf3206f5823698f85f4399ffc3171 100644 (file)
--- a/ggml/src/ggml-cuda/mmv.cu
+++ b/ggml/src/ggml-cuda/mmv.cu
@@ -456,6 +456,11 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_
                     return ne11 <= 4;
                 }
                 return ne11 <= 3;
+            } else if (GGML_CUDA_CC_IS_AMD(cc)) {
+                if (fp32_mma_hardware_available(cc)) {
+                    return ne11 <= 3;
+                }
+                return ne11 <= 8;
             }
             return ne11 <= 8;
         case GGML_TYPE_F16:
@@ -468,6 +473,14 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_
                     return src0_small && ne11 <= 3;
                 }
                 return ne11 <= 8;
+            } else if (GGML_CUDA_CC_IS_AMD(cc)) {
+                if (fp16_mma_hardware_available(cc)) {
+                    if (GGML_CUDA_CC_IS_RDNA3(cc) || GGML_CUDA_CC_IS_RDNA4(cc)) {
+                        return ne11 <= 5;
+                    }
+                    return ne11 <= 2;
+                }
+                return ne11 <= 8;
             }
             return ne11 <= 8;
         case GGML_TYPE_BF16:
@@ -480,6 +493,11 @@ bool ggml_cuda_should_use_mmv(enum ggml_type type, int cc, const int64_t * src0_
                     return src0_small && ne11 <= 3;
                 }
                 return ne11 <= 8;
+            } else if (GGML_CUDA_CC_IS_AMD(cc)) {
+                if (bf16_mma_hardware_available(cc)) {
+                    return ne11 <= 3;
+                }
+                return ne11 <= 8;
             }
             return ne11 <= 8;
         default:
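
Taken together, the mmv.cu hunks give AMD devices their own ne11 cutoffs (ne11 being the number of src1 columns, i.e. the batch size) instead of the generic ne11 <= 8 fallthrough: FP32 drops to ne11 <= 3 when FP32 MMA hardware (CDNA) is available, FP16 to ne11 <= 5 on RDNA3/RDNA4 and ne11 <= 2 on other FP16 MMA hardware, and BF16 to ne11 <= 3 when BF16 MMA hardware is available. Below is a hedged, self-contained sketch of that decision table; the helper name amd_mmv_ne11_limit is hypothetical and the boolean flags stand in for the common.cuh predicates.

// Hypothetical helper (not part of ggml): condenses the AMD-specific ne11
// cutoffs added to ggml_cuda_should_use_mmv into one table-like function.
#include <cstdint>
#include <cstdio>

enum class src_type { f32, f16, bf16 };

// Largest ne11 for which the MMV kernel is still preferred on an AMD device;
// the flags mirror the *_mma_hardware_available / RDNA3-RDNA4 checks in the patch.
static int64_t amd_mmv_ne11_limit(src_type type, bool fp32_mma, bool fp16_mma,
                                  bool bf16_mma, bool rdna3_or_4) {
    switch (type) {
        case src_type::f32:
            return fp32_mma ? 3 : 8;        // CDNA has FP32 MMA: leave MMV sooner
        case src_type::f16:
            if (fp16_mma) {
                return rdna3_or_4 ? 5 : 2;  // RDNA3/4 keep MMV profitable longer
            }
            return 8;
        case src_type::bf16:
            return bf16_mma ? 3 : 8;
    }
    return 8;
}

int main() {
    // Example: an RDNA3-like device (FP16 and BF16 MMA, no FP32 MMA).
    std::printf("f16 ne11 limit on an RDNA3-like device: %lld\n",
                (long long) amd_mmv_ne11_limit(src_type::f16,
                                               /*fp32_mma=*/false, /*fp16_mma=*/true,
                                               /*bf16_mma=*/true,  /*rdna3_or_4=*/true));
    return 0;
}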