// First pass: unroll by 4. The mask rounds num_iters down to a multiple of
// unroll_count (valid because unroll_count is a power of two).
int unroll_count = 4;
uint unrolled_iters = num_iters & ~(unroll_count - 1);
+#if K_PER_ITER == 2
+ // If the K dimension (p.ncols) is odd, the final iteration must run with lastiter==true
+ // so OOB is computed correctly. Hold back one unrolled group so the code after this loop handles it.
+ if ((p.ncols & 1) != 0 && // K is odd
+ unrolled_iters == num_iters && // the unrolled loop would otherwise consume every iteration
+ unrolled_iters > 0) { // guard: keeps the unsigned subtraction below from wrapping
+ unrolled_iters -= unroll_count;
+ }
+#endif
+
uint i = 0;
while (i < unrolled_iters) {
// Manually partially unroll the loop
i++;
}
}
+
// Second pass: unroll by 2, continuing from wherever the previous loop left i.
unroll_count = 2;
unrolled_iters = num_iters & ~(unroll_count - 1); // num_iters rounded down to a multiple of 2
+
+#if K_PER_ITER == 2
+ if ((p.ncols & 1) != 0 && // odd K: same adjustment as in the unroll-by-4 pass
+ unrolled_iters == num_iters && // this loop would otherwise reach the final iteration
+ unrolled_iters > 0) { // guard: keeps the unsigned subtraction below from wrapping
+ unrolled_iters -= unroll_count; // leave the last pair of iterations for the code after this loop
+ }
+#endif
+
while (i < unrolled_iters) {
// Manually partially unroll the loop
[[unroll]] for (uint k = 0; k < unroll_count; ++k) {
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 83, 2, 64, { 8, 1}, {4, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 64, 45, 128, { 8, 1}, {4, 1}));
test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 128, 45, 64, { 8, 1}, {4, 1}));
+ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 193, {1, 1}, {4, 1}, {0, 2, 1, 3})); // NOTE(review): 193 is odd - presumably the K dimension, added to cover an odd-K case; confirm against test_mul_mat's constructor
+ test_cases.emplace_back(new test_mul_mat(GGML_TYPE_F16, GGML_TYPE_F32, 1056, 1, 67, {1, 1}, {4, 1}, {0, 2, 1, 3})); // NOTE(review): as above with 67 (also odd); the trailing {0, 2, 1, 3} looks like a dimension permutation - TODO confirm
for (auto bs : {1,2,4,8}) {
for (auto nr : {1,4}) {