// MIRRORED == Each data value is held exactly once per thread subgroup.
DATA_LAYOUT_I_MAJOR = 0, // Always used for Turing, Ampere, Ada Lovelace, consumer Blackwell, matrix A&B for RDNA4 and CDNA.
DATA_LAYOUT_J_MAJOR = 10, // Matrix C for CDNA and RDNA4, int and float matrix C for RDNA3.
DATA_LAYOUT_I_MAJOR_MIRRORED = 20, // Volta, matrix A&B for RDNA3.
DATA_LAYOUT_J_MAJOR_MIRRORED = 30,
};
// Implemented mma combinations are:
// - (I_MAJOR, I_MAJOR) -> I_MAJOR
// - (I_MAJOR, I_MAJOR_MIRRORED) -> I_MAJOR
// - (I_MAJOR, J_MAJOR_MIRRORED) -> I_MAJOR
// Returns true for layouts whose major (contiguous) index is i,
// i.e. I_MAJOR and I_MAJOR_MIRRORED; the J-major layouts yield false.
static constexpr bool is_i_major(const data_layout dl) {
    return dl == DATA_LAYOUT_I_MAJOR ||
        dl == DATA_LAYOUT_I_MAJOR_MIRRORED;
}
// Selects the data layout used for the mma input matrices (A/B) on the current
// compilation target: the mirrored I-major layout on RDNA3 and Volta, plain
// I-major everywhere else. (__CUDA_ARCH__ expands to 0 when undefined in #if,
// so the HIP/RDNA3 and CUDA branches can share one condition.)
static constexpr __device__ data_layout get_input_data_layout() {
#if defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
    return DATA_LAYOUT_I_MAJOR_MIRRORED;
#else
    return DATA_LAYOUT_I_MAJOR;
#endif // defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
}
template <int I_, int J_, typename T, data_layout ds_=DATA_LAYOUT_I_MAJOR>
}
};
// Generic I-major mirrored tile (RDNA3): each value is held once per 16-lane
// subgroup, so with a 32-wide wavefront every value appears in both halves.
template <int I_, int J_, typename T>
struct tile<I_, J_, T, DATA_LAYOUT_I_MAJOR_MIRRORED> {
    static constexpr int I = I_;
    static constexpr int J = J_;
    static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED;
    // RDNA3: I*J values over 32 lanes, duplicated across the two 16-lane halves
    // (get_i only depends on threadIdx.x % 16), hence the factor 2.
    static constexpr int ne = I * J / 32 * 2;

    T x[ne] = {0};

    // Only 16-row tiles are implemented for this layout.
    static constexpr __device__ bool supported() {
        if (I == 16 && J == 16) return true;
        if (I == 16 && J ==  8) return true;
        if (I == 16 && J ==  4) return true;
        return false;
    }

    // Row index for element l: identical in both 16-lane halves (mirrored).
    static __device__ __forceinline__ int get_i(const int /*l*/) {
        if constexpr (supported()) {
            return threadIdx.x % 16;
        } else {
            NO_DEVICE_CODE;
            return -1;
        }
    }

    // Column index for element l: one column per per-thread element.
    static __device__ __forceinline__ int get_j(const int l) {
        if constexpr (supported()) {
            return l;
        } else {
            NO_DEVICE_CODE;
            return -1;
        }
    }
};
// half2 specialization of the I-major mirrored tile.
// On RDNA3 it simply reuses the float specialization's shape and index
// mapping (the data is just reinterpreted as half2); on Volta it implements
// the 8x4 half2 fragment layout directly.
template <int I_, int J_>
struct tile<I_, J_, half2, DATA_LAYOUT_I_MAJOR_MIRRORED> {
    static constexpr int I = I_;
    static constexpr int J = J_;
    static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED;
#if defined(RDNA3)
    static constexpr int ne = tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::ne;

    half2 x[ne] = {{0.0f, 0.0f}};

    static constexpr __device__ bool supported() {
        return tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::supported();
    }

    static __device__ __forceinline__ int get_i(const int l) {
        return tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::get_i(l);
    }

    static __device__ __forceinline__ int get_j(const int l) {
        return tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::get_j(l);
    }
#else // Volta
    // Each value is held once per 4-lane subgroup: ne = I*J / (WARP_SIZE/4).
    static constexpr int ne = I * J / (WARP_SIZE/4);

    half2 x[ne] = {{0.0f, 0.0f}};

    // Only the 8x4 half2 fragment is implemented on Volta.
    static constexpr __device__ bool supported() {
        if (I == 8 && J == 4) return true;
        return false;
    }

    // Row index: 4 rows per 16-lane quadrant plus the lane's position in its
    // 4-lane group; independent of l (values mirrored across element slots).
    static __device__ __forceinline__ int get_i(const int /*l*/) {
        if constexpr (I == 8 && J == 4) {
            return ((threadIdx.x / 16) * 4) + (threadIdx.x % 4);
        } else {
            NO_DEVICE_CODE;
            return -1;
        }
    }

    // Column index: one column per per-thread element.
    static __device__ __forceinline__ int get_j(const int l) {
        if constexpr (I == 8 && J == 4) {
            return l;
        } else {
            NO_DEVICE_CODE;
            return -1;
        }
    }
#endif // defined(RDNA3)
};
// nv_bfloat162 specialization of the I-major mirrored tile: same shape and
// index mapping as the float specialization, only the storage type differs.
template <int I_, int J_>
struct tile<I_, J_, nv_bfloat162, DATA_LAYOUT_I_MAJOR_MIRRORED> {
    static constexpr int I = I_;
    static constexpr int J = J_;
    static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED;
    static constexpr int ne = tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::ne;

    nv_bfloat162 x[ne] = {{0.0f, 0.0f}};

    static constexpr __device__ bool supported() {
        return tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::supported();
    }

    static __device__ __forceinline__ int get_i(const int l) {
        return tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::get_i(l);
    }

    static __device__ __forceinline__ int get_j(const int l) {
        return tile<I_, J_, float, DATA_LAYOUT_I_MAJOR_MIRRORED>::get_j(l);
    }
};
+
+ template <int I_, int J_>
+ struct tile<I_, J_, half2, DATA_LAYOUT_J_MAJOR_MIRRORED> {
+ static constexpr int I = I_;
+ static constexpr int J = J_;
+ static constexpr data_layout dl = DATA_LAYOUT_J_MAJOR_MIRRORED;
+ static constexpr int ne = I * J / (WARP_SIZE/4);
+
+ half2 x[ne] = {{0.0f, 0.0f}};
static constexpr __device__ bool supported() {
- if (I == 16 && J == 16) return true;
- if (I == 16 && J == 8) return true;
- if (I == 16 && J == 4) return true;
+ if (I == 8 && J == 4) return true;
return false;
}
static __device__ __forceinline__ int get_i(const int l) {
- if constexpr (supported()) {
- return threadIdx.x % 16;
+ if constexpr (I == 8 && J == 4) {
+ return ((l / 2) * 4) + (threadIdx.x % 4);
} else {
NO_DEVICE_CODE;
return -1;
}
static __device__ __forceinline__ int get_j(const int l) {
- if constexpr (supported()) {
- return l;
+ if constexpr (I == 8 && J == 4) {
+ return ((threadIdx.x / 16) * 2) + (l % 2);
} else {
NO_DEVICE_CODE;
return -1;