CUDA: Fix loop unrolling for BW in mul_mat_q_stream_k_fixup (#19053)

author Oliver Simons <redacted>
Tue, 3 Feb 2026 10:33:14 +0000 (11:33 +0100)
committer GitHub <redacted>
Tue, 3 Feb 2026 10:33:14 +0000 (11:33 +0100)
diff --git a/ggml/src/ggml-cuda/mmq.cuh b/ggml/src/ggml-cuda/mmq.cuh

index a382e6a6979f4da0c7681370171158eafb1cc92a..f80f98cda2c4cff6f30337a616b00c907ae87ae5 100644 (file)
--- a/ggml/src/ggml-cuda/mmq.cuh
+++ b/ggml/src/ggml-cuda/mmq.cuh
@@ -3697,13 +3697,20 @@ static __global__ void mul_mat_q(
           tile_x_max_i, tile_y_max_j, kb0_start, kb0_stop);
  }
  
-
  template <ggml_type type, int mmq_x, bool need_check>
-static __global__ void mul_mat_q_stream_k_fixup(
-        const int32_t * ids_dst, const int32_t * expert_bounds, float * __restrict__ dst, const float * __restrict__ tmp_last_tile,
-        const int ncols_x, const int nrows_x, const int ncols_dst, const int stride_col_dst,
-        const int nchannels_y, const int stride_channel_dst, const int nsamples_y, const int stride_sample_dst,
-        const int ncols_max) {
+static __global__ void mul_mat_q_stream_k_fixup(const int32_t * ids_dst,
+                                                const int32_t * expert_bounds,
+                                                float * __restrict__ dst,
+                                                const float * __restrict__ tmp_last_tile,
+                                                const int    ncols_x,
+                                                const int    nrows_x,
+                                                const int    ncols_dst,
+                                                const size_t stride_col_dst,
+                                                const int    nchannels_y,
+                                                const size_t stride_channel_dst,
+                                                const int    nsamples_y,
+                                                const size_t stride_sample_dst,
+                                                const int    ncols_max) {
      constexpr int     mmq_y           = get_mmq_y_device();
      constexpr int     qk              = ggml_cuda_type_traits<type>::qk;
      constexpr int     ITER_K          = get_iter_k(type);
author	Oliver Simons <redacted>
	Tue, 3 Feb 2026 10:33:14 +0000 (11:33 +0100)
committer	GitHub <redacted>
	Tue, 3 Feb 2026 10:33:14 +0000 (11:33 +0100)