const uint32_t D_lsb = D ^ (D & (D-1));
uint32_t D_split = std::min(std::min(device->subgroup_size, 8u), D_lsb / 4);
- // mask dim1 is padded to 64, we rely on this to avoid clamping mask loads
- GGML_ASSERT((GGML_KQ_MASK_PAD % rows_cols[0]) == 0);
return {wg_size, rows_cols[0], rows_cols[1], hsk, hsv, clamp, D_split};
};
if (((HSK | HSV) % 16) != 0 && path == FA_COOPMAT2) {
aligned = false;
}
- // mask dim1 is padded to 64, we rely on this to avoid clamping mask loads
- GGML_ASSERT((nem1 % GGML_KQ_MASK_PAD) == 0);
bool f32acc = path == FA_SCALAR || dst->op_params[3] == GGML_PREC_F32;
}
if ((p.mask_n_head_log2 & MASK_ENABLE_BIT) != 0) {
- tensorLayoutNV<2, Clamp> tensorLayoutM = createTensorLayoutNV(2, Clamp);
- tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV);
- tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1);
+ bool nem1_bounds_check = !(p.gqa_ratio > 1) && (p.nem1 % Br) != 0;
- coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv;
+ if (nem1_bounds_check) {
+ tensorLayoutNV<2, gl_CooperativeMatrixClampModeConstantNV> tensorLayoutM = createTensorLayoutNV(2, gl_CooperativeMatrixClampModeConstantNV);
+ tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, p.nem1, KV);
+ tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1);
- coopMatLoadTensorNV(mv, data_m, m_offset, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc));
+ coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv;
- S += slopeMat*coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(mv);
+ coopMatLoadTensorNV(mv, data_m, m_offset, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc));
+
+ S += slopeMat*coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(mv);
+ } else {
+ tensorLayoutNV<2, Clamp> tensorLayoutM = createTensorLayoutNV(2, Clamp);
+ // Don't clamp against nem1 when GQA is enabled
+ uint32_t m_height = p.gqa_ratio > 1 ? ~0 : p.nem1;
+ tensorLayoutM = setTensorLayoutDimensionNV(tensorLayoutM, m_height, KV);
+ tensorLayoutM = setTensorLayoutStrideNV(tensorLayoutM, m_stride, 1);
+
+ coopmat<float16_t, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> mv;
+
+ coopMatLoadTensorNV(mv, data_m, m_offset, sliceTensorLayoutNV(tensorLayoutM, i * Br, Br, j * Bc, Bc));
+
+ S += slopeMat*coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(mv);
+ }
}
// Clear padding elements to -inf, so they don't contribute to rowmax