From: yulo
Date: Thu, 18 Dec 2025 11:50:56 +0000 (+0800)
Subject: remove i_major_dual (llama/18157)
X-Git-Tag: v0.9.5~58
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=0c4d8f6580cb041aa73af04396ac7097e7f5aeda;p=pkg%2Fggml%2Fsources%2Fggml

remove i_major_dual (llama/18157)

Co-authored-by: zhang hui
---

diff --git a/src/ggml-cuda/mma.cuh b/src/ggml-cuda/mma.cuh
index 794d90bd..3268dadf 100644
--- a/src/ggml-cuda/mma.cuh
+++ b/src/ggml-cuda/mma.cuh
@@ -78,27 +78,25 @@ namespace ggml_cuda_mma {
         // MIRRORED == Each data value is held exactly once per thread subgroup.
         DATA_LAYOUT_I_MAJOR          = 0,  // Always used for Turing, Ampere, Ada Lovelace, consumer Blackwell, matrix A&B for RDNA4 and CDNA.
         DATA_LAYOUT_J_MAJOR          = 10, // Matrix C for CDNA and RDNA4, int and float matrix C for RDNA3.
-        DATA_LAYOUT_I_MAJOR_MIRRORED = 20,
+        DATA_LAYOUT_I_MAJOR_MIRRORED = 20, // Volta, matrix A&B for RDNA3.
         DATA_LAYOUT_J_MAJOR_MIRRORED = 30,
-        DATA_LAYOUT_I_MAJOR_DUAL     = 40, // Matrix A&B for RDNA3.
     };
 
     // Implemented mma combinations are:
     // - (I_MAJOR, I_MAJOR)          -> I_MAJOR
     // - (I_MAJOR, I_MAJOR_MIRRORED) -> I_MAJOR
     // - (I_MAJOR, J_MAJOR_MIRRORED) -> I_MAJOR
-    constexpr bool is_i_major(const data_layout dl) {
+    static constexpr bool is_i_major(const data_layout dl) {
         return dl == DATA_LAYOUT_I_MAJOR ||
-            dl == DATA_LAYOUT_I_MAJOR_MIRRORED ||
-            dl == DATA_LAYOUT_I_MAJOR_DUAL;
+            dl == DATA_LAYOUT_I_MAJOR_MIRRORED;
     }
 
-    constexpr data_layout get_input_data_layout() {
-#if defined(RDNA3)
-        return DATA_LAYOUT_I_MAJOR_DUAL;
+    static constexpr __device__ data_layout get_input_data_layout() {
+#if defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
+        return DATA_LAYOUT_I_MAJOR_MIRRORED;
 #else
         return DATA_LAYOUT_I_MAJOR;
-#endif // defined(RDNA3)
+#endif // defined(RDNA3) || __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
     }
 
     template
@@ -462,23 +460,27 @@ namespace ggml_cuda_mma {
         }
     };
 
-    template
-    struct tile {
+    template
+    struct tile {
         static constexpr int I = I_;
         static constexpr int J = J_;
         static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED;
-        static constexpr int ne = I * J / (WARP_SIZE/4);
-        half2 x[ne] = {{0.0f, 0.0f}};
+        // RDNA3
+        static constexpr int ne = I * J / 32 * 2;
+
+        T x[ne] = {0};
 
         static constexpr __device__ bool supported() {
-            if (I == 8 && J == 4) return true;
+            if (I == 16 && J == 16) return true;
+            if (I == 16 && J ==  8) return true;
+            if (I == 16 && J ==  4) return true;
             return false;
         }
 
         static __device__ __forceinline__ int get_i(const int /*l*/) {
-            if constexpr (I == 8 && J == 4) {
-                return ((threadIdx.x / 16) * 4) + (threadIdx.x % 4);
+            if constexpr (supported()) {
+                return threadIdx.x % 16;
             } else {
                 NO_DEVICE_CODE;
                 return -1;
@@ -486,7 +488,7 @@ namespace ggml_cuda_mma {
         }
 
         static __device__ __forceinline__ int get_j(const int l) {
-            if constexpr (I == 8 && J == 4) {
+            if constexpr (supported()) {
                 return l;
             } else {
                 NO_DEVICE_CODE;
@@ -496,10 +498,27 @@ namespace ggml_cuda_mma {
     };
 
     template
-    struct tile {
+    struct tile {
         static constexpr int I = I_;
         static constexpr int J = J_;
-        static constexpr data_layout dl = DATA_LAYOUT_J_MAJOR_MIRRORED;
+        static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED;
+#if defined(RDNA3)
+        static constexpr int ne = tile::ne;
+
+        half2 x[ne] = {{0.0f, 0.0f}};
+
+        static constexpr __device__ bool supported() {
+            return tile::supported();
+        }
+
+        static __device__ __forceinline__ int get_i(const int l) {
+            return tile::get_i(l);
+        }
+
+        static __device__ __forceinline__ int get_j(const int l) {
+            return tile::get_j(l);
+        }
+#else // Volta
         static constexpr int ne = I * J / (WARP_SIZE/4);
 
         half2 x[ne] = {{0.0f, 0.0f}};
@@ -509,9 +528,9 @@ namespace ggml_cuda_mma {
             return false;
         }
 
-        static __device__ __forceinline__ int get_i(const int l) {
+        static __device__ __forceinline__ int get_i(const int /*l*/) {
             if constexpr (I == 8 && J == 4) {
-                return ((l / 2) * 4) + (threadIdx.x % 4);
+                return ((threadIdx.x / 16) * 4) + (threadIdx.x % 4);
             } else {
                 NO_DEVICE_CODE;
                 return -1;
@@ -520,34 +539,54 @@ namespace ggml_cuda_mma {
 
         static __device__ __forceinline__ int get_j(const int l) {
             if constexpr (I == 8 && J == 4) {
-                return ((threadIdx.x / 16) * 2) + (l % 2);
+                return l;
             } else {
                 NO_DEVICE_CODE;
                 return -1;
             }
         }
+#endif // defined(RDNA3)
     };
 
-    template
-    struct tile {
+    template
+    struct tile {
         static constexpr int I = I_;
         static constexpr int J = J_;
-        static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_DUAL;
+        static constexpr data_layout dl = DATA_LAYOUT_I_MAJOR_MIRRORED;
+        static constexpr int ne = tile::ne;
 
-        static constexpr int ne = I * J / 32 * 2;
+        nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
 
-        T x[ne] = {0};
+        static constexpr __device__ bool supported() {
+            return tile::supported();
+        }
+
+        static __device__ __forceinline__ int get_i(const int l) {
+            return tile::get_i(l);
+        }
+
+        static __device__ __forceinline__ int get_j(const int l) {
+            return tile::get_j(l);
+        }
+    };
+
+    template
+    struct tile {
+        static constexpr int I = I_;
+        static constexpr int J = J_;
+        static constexpr data_layout dl = DATA_LAYOUT_J_MAJOR_MIRRORED;
+        static constexpr int ne = I * J / (WARP_SIZE/4);
+
+        half2 x[ne] = {{0.0f, 0.0f}};
 
         static constexpr __device__ bool supported() {
-            if (I == 16 && J == 16) return true;
-            if (I == 16 && J ==  8) return true;
-            if (I == 16 && J ==  4) return true;
+            if (I == 8 && J == 4) return true;
             return false;
        }
 
         static __device__ __forceinline__ int get_i(const int l) {
-            if constexpr (supported()) {
-                return threadIdx.x % 16;
+            if constexpr (I == 8 && J == 4) {
+                return ((l / 2) * 4) + (threadIdx.x % 4);
             } else {
                 NO_DEVICE_CODE;
                 return -1;
@@ -555,8 +594,8 @@ namespace ggml_cuda_mma {
         }
 
         static __device__ __forceinline__ int get_j(const int l) {
-            if constexpr (supported()) {
-                return l;
+            if constexpr (I == 8 && J == 4) {
+                return ((threadIdx.x / 16) * 2) + (l % 2);
             } else {
                 NO_DEVICE_CODE;
                 return -1;