static constexpr int J = J_;
#if defined(GGML_USE_HIP)
+#if defined(RDNA4)
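+    // RDNA4 runs in wave32, so each lane stores ne = I*J/32 elements.
+    // For the 16x16 tile, lane t holds column t%16, rows 8*(t/16) .. 8*(t/16)+7 (see get_i/get_j below).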
+ static constexpr int ne = I * J / 32;
+ T x[ne] = {0};
+
+ static constexpr __device__ bool supported() {
+ if (I == 16 && J == 16) return true;
+ return false;
+ }
+
+ static __device__ __forceinline__ int get_i(const int l) {
+ if constexpr (I == 16 && J == 16) {
+ return 8 * (threadIdx.x / 16) + l;
+ } else {
+ NO_DEVICE_CODE;
+ return -1;
+ }
+ }
+
+ static __device__ __forceinline__ int get_j(const int l) {
+ if constexpr (I == 16 && J == 16) {
+ return threadIdx.x % 16;
+ } else {
+ NO_DEVICE_CODE;
+ return -1;
+ }
+ }
+#else
static constexpr int ne = I * J / 64;
T x[ne] = {0};
return -1;
}
}
+#endif // defined(RDNA4)
#elif __CUDA_ARCH__ == GGML_CUDA_CC_VOLTA
static constexpr int ne = I * J / 32;
T x[ne] = {0};
return -1;
}
}
+#elif defined(AMD_WMMA_AVAILABLE)
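+    // AMD WMMA operand tile (wave32): each lane stores ne = I*J/32 half2 values.
+    // For the 16x8 tile, lane t holds row t%16, columns 4*(t/16) .. 4*(t/16)+3 (see get_i/get_j below).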
+ static constexpr int ne = I * J / 32;
+ half2 x[ne] = {{0.0f, 0.0f}};
+
+ static constexpr __device__ bool supported() {
+ if (I == 16 && J == 8) return true;
+ return false;
+ }
+
+ static __device__ __forceinline__ int get_i(const int l) {
+ if constexpr (I == 16 && J == 8) {
+ return threadIdx.x % 16;
+ } else {
+ NO_DEVICE_CODE;
+ return -1;
+ }
+ }
+
+ static __device__ __forceinline__ int get_j(const int l) {
+ if constexpr (I == 16 && J == 8) {
+ return 4 * (threadIdx.x / 16) + l;
+ } else {
+ NO_DEVICE_CODE;
+ return -1;
+ }
+ }
#else
static constexpr int ne = I * J / WARP_SIZE;
half2 x[ne] = {{0.0f, 0.0f}};
struct tile<I_, J_, nv_bfloat162> {
static constexpr int I = I_;
static constexpr int J = J_;
+
+#if defined(AMD_WMMA_AVAILABLE)
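+    // Same per-lane layout as the half2 WMMA tile above, with bf16 operands.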
+ static constexpr int ne = I * J / 32;
+ nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
+
+ static constexpr __device__ bool supported() {
+ if (I == 16 && J == 8) return true;
+ return false;
+ }
+
+ static __device__ __forceinline__ int get_i(const int l) {
+ if constexpr (I == 16 && J == 8) {
+ return threadIdx.x % 16;
+ } else {
+ NO_DEVICE_CODE;
+ return -1;
+ }
+ }
+
+ static __device__ __forceinline__ int get_j(const int l) {
+ if constexpr (I == 16 && J == 8) {
+ return 4 * (threadIdx.x / 16) + l;
+ } else {
+ NO_DEVICE_CODE;
+ return -1;
+ }
+ }
+#else
static constexpr int ne = I * J / WARP_SIZE;
nv_bfloat162 x[ne] = {{0.0f, 0.0f}};
return -1;
}
}
+#endif // defined(AMD_WMMA_AVAILABLE)
};
template <int I, int J>
const int64_t * xs = (int64_t *) ((const int *) xs0 + (threadIdx.x % t.I) * stride + 2 * (threadIdx.x / t.I));
xi[0] = xs[0];
}
+#elif defined(AMD_WMMA_AVAILABLE)
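+    // For the AMD WMMA tiles each lane's ne elements are contiguous within a single row,
+    // so the whole fragment can be loaded with one sizeof(t.x)-byte copy starting at (get_i(0), get_j(0)).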
+ ggml_cuda_memcpy_1<sizeof(t.x)>(t.x, xs0 + t.get_i(0) * stride + t.get_j(0));
#else
#pragma unroll
for (int l = 0; l < t.ne; ++l) {
: "+r"(Dxi[4]), "+r"(Dxi[5]), "+r"(Dxi[6]), "+r"(Dxi[7])
: "r"(Axi[2]), "r"(Axi[3]), "r"(Bxi[3]));
#endif // __CUDA_ARCH__ >= GGML_CUDA_CC_AMPERE
+#elif defined(AMD_WMMA_AVAILABLE)
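+        // RDNA4 (gfx12) WMMA in wave32: A and B are 8 packed fp16 values per lane, the accumulator is 8 floats per lane.
+        // Reinterpret the tile storage as the vector types expected by the builtin; the result accumulates into D in place.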
+ using halfx8_t = __attribute__((ext_vector_type(8))) _Float16;
+ using floatx8_t = __attribute__((ext_vector_type(8))) float;
+ floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
+ const halfx8_t& a_frag = reinterpret_cast<const halfx8_t&>(A.x[0]);
+ const halfx8_t& b_frag = reinterpret_cast<const halfx8_t&>(B.x[0]);
+ acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_f16_w32_gfx12(a_frag, b_frag, acc_frag);
#else
GGML_UNUSED_VARS(D, A, B);
NO_DEVICE_CODE;
#endif // TURING_MMA_AVAILABLE
}
+ static __device__ __forceinline__ void mma(
+ tile<16, 16, float> & D, const tile<16, 8, nv_bfloat162> & A, const tile<16, 8, nv_bfloat162> & B) {
+#if defined(AMD_WMMA_AVAILABLE)
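+        // Same as the fp16 WMMA path above, but with 8 bf16 values per lane for A and B.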
+ using bf16x8_t = __attribute__((ext_vector_type(8))) __bf16;
+ using floatx8_t = __attribute__((ext_vector_type(8))) float;
+ floatx8_t& acc_frag = reinterpret_cast<floatx8_t&>(D.x[0]);
+ const bf16x8_t& a_frag = reinterpret_cast<const bf16x8_t&>(A.x[0]);
+ const bf16x8_t& b_frag = reinterpret_cast<const bf16x8_t&>(B.x[0]);
+ acc_frag = __builtin_amdgcn_wmma_f32_16x16x16_bf16_w32_gfx12(a_frag, b_frag, acc_frag);
+#else
+ GGML_UNUSED_VARS(D, A, B);
+ NO_DEVICE_CODE;
+#endif // defined(AMD_WMMA_AVAILABLE)
+ }
+
static __device__ __forceinline__ void mma(
tile<16, 16, int> & D, const tile<16, 8, int> & A, const tile<16, 8, int> & B) {
#if defined(AMD_MFMA_AVAILABLE)
#include "mma.cuh"
#include "common.cuh"
+#include "convert.cuh"
using namespace ggml_cuda_mma;
const int stride_col_id, const int stride_row_id,
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst) {
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+// TODO: handle this in a consistent and simpler way after AMD MFMA support has been added
+#if (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE)
+#if defined(AMD_WMMA_AVAILABLE)
+    // Special case for tf32: use a dummy mma tile layout, since WMMA does not support tf32.
+ constexpr int tile_B_I = std::is_same_v<T, float> ? 8 : 16;
+ constexpr int tile_C_J = std::is_same_v<T, float> ? 8 : 16;
+ typedef tile<16, 8, T> tile_A;
+ typedef tile<tile_B_I, 8, T> tile_B;
+ typedef tile<16, tile_C_J, float> tile_C;
+
+ constexpr bool a_supported = tile_A::supported();
+ constexpr bool b_supported = tile_B::supported();
+ constexpr bool c_supported = tile_C::supported();
+ constexpr bool supported = a_supported && b_supported && c_supported;
+#else
constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported();
constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported();
-
- if (!I_16_supported && !I_32_supported) {
- NO_DEVICE_CODE;
- return;
- }
+ constexpr bool supported = I_16_supported || I_32_supported;
constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work but 16 is ~1% faster.
typedef tile<I_preferred, 8, T> tile_A;
typedef tile<8, 8, T> tile_B;
typedef tile<I_preferred, 8, float> tile_C;
+#endif // defined(AMD_WMMA_AVAILABLE)
+ if constexpr (!supported) {
+ NO_DEVICE_CODE;
+ return;
+ }
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
constexpr int tile_k_padded = warp_size + 4;
if constexpr (!has_ids) {
const float2 tmp = j < cols_per_block ? y2[j*stride_col_y + col] : make_float2(0.0f, 0.0f);
- tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
+ tile_xy[j0*tile_k_padded + threadIdx.x] = ggml_cuda_cast<T>(tmp);
} else {
const bool valid = j < cols_per_block && (col_base + j) < ncols_dst_total && slot_map[j] >= 0;
float2 tmp = valid ? *(const float2*) &y[slot_map[j]*stride_channel_y + 2*(j*stride_col_y + col)] : make_float2(0.0f, 0.0f);
- tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
+ tile_xy[j0*tile_k_padded + threadIdx.x] = ggml_cuda_cast<T>(tmp);
}
}
} else {
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst);
NO_DEVICE_CODE;
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+#endif // (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE)
}
//This kernel is for larger batch sizes of mul_mat_id
const int channel_ratio, const int stride_channel_x, const int stride_channel_y, const int stride_channel_dst,
const int sample_ratio, const int stride_sample_x, const int stride_sample_y, const int stride_sample_dst,
const uint3 sis1_fd, const uint3 nch_fd) {
-#if !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+// TODO: handle this in a consistent and simpler way after AMD MFMA support has been added
+#if (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE)
+#if defined(AMD_WMMA_AVAILABLE)
+    // Special case for tf32: use a dummy mma tile layout, since WMMA does not support tf32.
+ constexpr int tile_B_I = std::is_same_v<T, float> ? 8 : 16;
+ constexpr int tile_C_J = std::is_same_v<T, float> ? 8 : 16;
+ typedef tile<16, 8, T> tile_A;
+ typedef tile<tile_B_I, 8, T> tile_B;
+ typedef tile<16, tile_C_J, float> tile_C;
+
+ constexpr bool a_supported = tile_A::supported();
+ constexpr bool b_supported = tile_B::supported();
+ constexpr bool c_supported = tile_C::supported();
+ constexpr bool supported = a_supported && b_supported && c_supported;
+#else
constexpr bool I_16_supported = tile<16, 8, T>::supported() && tile<16, 8, float>::supported();
constexpr bool I_32_supported = tile<32, 8, T>::supported() && tile<32, 8, float>::supported();
+ constexpr bool supported = I_16_supported || I_32_supported;
- if (!I_16_supported && !I_32_supported) {
- NO_DEVICE_CODE;
- return;
- }
-
- constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work butr 16 is ~1% faster.
+ constexpr int I_preferred = I_16_supported ? 16 : 32; // For Turing MMA both work but 16 is ~1% faster.
typedef tile<I_preferred, 8, T> tile_A;
typedef tile<8, 8, T> tile_B;
typedef tile<I_preferred, 8, float> tile_C;
+#endif // defined(AMD_WMMA_AVAILABLE)
+ if constexpr (!supported) {
+ NO_DEVICE_CODE;
+ return;
+ }
constexpr int warp_size = ggml_cuda_get_physical_warp_size();
constexpr int tile_k_padded = warp_size + 4;
#pragma unroll
for (int j0 = 0; j0 < tile_B::I; ++j0) {
const float2 tmp = vals_buf[curr_buf][j0];
- tile_xy[j0*tile_k_padded + threadIdx.x] = {tmp.x, tmp.y};
+ tile_xy[j0*tile_k_padded + threadIdx.x] = ggml_cuda_cast<T>(tmp);
}
if (itB + 1 < ntB) {
channel_ratio, stride_channel_x, stride_channel_y, stride_channel_dst,
sample_ratio, stride_sample_x, stride_sample_y, stride_sample_dst, sis1_fd, nch_fd);
NO_DEVICE_CODE;
-#endif // !defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)
+#endif // (!defined(GGML_USE_HIP) && !defined(GGML_USE_MUSA)) || defined(AMD_WMMA_AVAILABLE)
}
template<typename T, int cols_per_block, int nwarps>
cudaStream_t stream, const mmf_ids_data * ids_data) {
typedef tile<16, 8, T> tile_A_16;
typedef tile<32, 8, T> tile_A_32;
- typedef tile< 8, 8, T> tile_B;
+ typedef tile<16, 8, T> tile_B_16;
+ typedef tile< 8, 8, T> tile_B_8;
GGML_ASSERT(ncols_x % 2 == 0);
GGML_ASSERT(stride_row % 2 == 0);
constexpr int rows_per_block = MMF_ROWS_PER_BLOCK;
const int nbytes_shared_iter = nwarps_best * (volta_mma_available(cc) ? tile_A_32::I : tile_A_16::I) * (warp_size + 4) * 4;
- const int nbytes_shared_combine = GGML_PAD(cols_per_block, tile_B::I) * (nwarps_best*rows_per_block + 4) * 4;
+    const int cols_per_block_pad    = amd_wmma_available(cc) ? tile_B_16::I : tile_B_8::I; // tile_B::I is 16 with AMD WMMA, 8 otherwise
+    const int nbytes_shared_combine = GGML_PAD(cols_per_block, cols_per_block_pad) * (nwarps_best*rows_per_block + 4) * 4;
const int nbytes_shared = std::max(nbytes_shared_iter, nbytes_shared_combine);
const int nbytes_slotmap = ids ? GGML_PAD(cols_per_block, 16) * sizeof(int) : 0;
const int nbytes_shared_total = nbytes_shared + nbytes_slotmap;