[[unroll]] for (uint cm_col = 0; cm_col < cms_per_col; cm_col++) {
coopMatStore(sums[cm_col * cms_per_row + cm_row], coopmat_stage, warp_i * TM * TN, TM, gl_CooperativeMatrixLayoutColumnMajor);
- [[unroll]] for (uint col = 0; col < BN; col += storestride) {
+ [[unroll]] for (uint col = 0; col < TN; col += storestride) {
const uint row_i = dc + cm_col * TN + col + store_c;
if (row_i >= _ne1) break;
for (int n_mats : {4, 8}) {
for (int n_used : {1, 2, 4}) {
for (bool b : {false, true}) {
- for (int n : {1, 32}) {
+ for (int n : {1, 32, 129}) {
int m = 512;
int k = 256;
test_cases.emplace_back(new test_mul_mat_id(type_a, type_b, n_mats, n_used, b, m, n, k));