&& !device->coopmat_bf16_support
#endif
) {
+ const uint32_t s_warptile_wm = device->subgroup_size == 8 ? 8 : 32;
+
// use scalar tile sizes
l_warptile = { 128, 128, 128, 16, subgroup_size_8 * 2, 64, 2, 4, 4, 1, subgroup_size_8 };
m_warptile = { 128, 64, 64, 16, subgroup_size_8, 32, 2, 4, 2, 1, subgroup_size_8 };
- s_warptile = { subgroup_size_16, 32, 32, 16, 32, 32, 2, 2, 2, 1, subgroup_size_8 };
+ s_warptile = { subgroup_size_32, 32, 32, 16, s_warptile_wm, 32, 2, 2, 2, 1, subgroup_size_8 };
l_wg_denoms = {128, 128, 1 };
m_wg_denoms = { 64, 64, 1 };