CUDA: fixed tensor cores not being used on RDNA3 (llama/4697)

author Johannes Gäßler <redacted>

Sat, 30 Dec 2023 12:52:01 +0000 (13:52 +0100)

committer Georgi Gerganov <redacted>

Wed, 3 Jan 2024 12:43:51 +0000 (14:43 +0200)
author Johannes Gäßler <redacted>
Sat, 30 Dec 2023 12:52:01 +0000 (13:52 +0100)
committer Georgi Gerganov <redacted>
Wed, 3 Jan 2024 12:43:51 +0000 (14:43 +0200)
diff --git a/ggml-cuda.cu b/ggml-cuda.cu

index 71a64ca09ec8becd6b8d21b3d2caa679b79da76d..8c2712308a45d9255e7456d0127fd3b633dcddbf 100644 (file)
--- a/ggml-cuda.cu
+++ b/ggml-cuda.cu
@@ -119,10 +119,29 @@
  #define MIN_CC_DP4A   610 // minimum compute capability for __dp4a, an intrinsic for byte-wise dot products
  #define CC_VOLTA      700
  #define CC_OFFSET_AMD 1000000
+#define CC_RDNA1      (CC_OFFSET_AMD + 1010)
  #define CC_RDNA2      (CC_OFFSET_AMD + 1030)
+#define CC_RDNA3      (CC_OFFSET_AMD + 1100)
  
  #define GGML_CUDA_MAX_NODES 8192
  
+// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
+// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
+// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
+// -  7B quantum model: +100-200 MB
+// - 13B quantum model: +200-400 MB
+//
+//#define GGML_CUDA_FORCE_MMQ
+
+// TODO: improve this to be correct for more hardware
+//       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
+#if !defined(GGML_CUDA_FORCE_MMQ)
+#define CUDA_USE_TENSOR_CORES
+#endif
+
+// max batch size to use MMQ kernels when tensor cores are available
+#define MMQ_MAX_BATCH_SIZE 32
+
  #if defined(GGML_USE_HIPBLAS)
  #define __CUDA_ARCH__ 1300
  
@@ -189,23 +208,6 @@ static __device__ __forceinline__ int __dp4a(const int a, const int b, int c) {
  }
  #endif // defined(GGML_USE_HIPBLAS)
  
-// define this if you want to always fallback to MMQ kernels and not use cuBLAS for matrix multiplication
-// on modern hardware, using cuBLAS is recommended as it utilizes F16 tensor cores which are very performant
-// for large computational tasks. the drawback is that this requires some extra amount of VRAM:
-// -  7B quantum model: +100-200 MB
-// - 13B quantum model: +200-400 MB
-//
-//#define GGML_CUDA_FORCE_MMQ
-
-// TODO: improve this to be correct for more hardware
-//       for example, currently fails for GeForce GTX 1660 which is TURING arch (> VOLTA) but does not have tensor cores
-#if !defined(GGML_CUDA_FORCE_MMQ) && (!defined(GGML_USE_HIPBLAS) || defined(RDNA3))
-#define CUDA_USE_TENSOR_CORES
-#endif
-
-// max batch size to use MMQ kernels when tensor cores are available
-#define MMQ_MAX_BATCH_SIZE 32
-
  #if defined(_MSC_VER)
  #pragma warning(disable: 4244 4267) // possible loss of data
  #endif
@@ -8661,13 +8663,12 @@ static void ggml_cuda_mul_mat(const ggml_tensor * src0, const ggml_tensor * src1
      }
  
  #if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
-    const bool fp16_performance_good = true;
  
-#ifdef RDNA3
-    const bool use_mul_mat_q = false;
-#else
-    const bool use_mul_mat_q = true;
-#endif // RDNA3
+    const bool fp16_performance_good = min_compute_capability >= CC_RDNA1;
+    bool               use_mul_mat_q = ggml_is_quantized(src0->type);
+#ifdef CUDA_USE_TENSOR_CORES
+    use_mul_mat_q = use_mul_mat_q && min_compute_capability < CC_RDNA3;
+#endif // CUDA_USE_TENSOR_CORES
  
  #else
author	Johannes Gäßler <redacted>
	Sat, 30 Dec 2023 12:52:01 +0000 (13:52 +0100)
committer	Georgi Gerganov <redacted>
	Wed, 3 Jan 2024 12:43:51 +0000 (14:43 +0200)