From: uvos
Date: Wed, 11 Mar 2026 05:04:32 +0000 (+0100)
Subject: cuda/hip: fix loop unrolling in ssm-conv (llama/20369)
X-Git-Tag: upstream/1.8.4~54
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=7c9a16c565797da6c071852204148e5711359ef7;p=pkg%2Fggml%2Fsources%2Fwhisper.cpp

cuda/hip: fix loop unrolling in ssm-conv (llama/20369)
---

diff --git a/ggml/src/ggml-cuda/ssm-conv.cu b/ggml/src/ggml-cuda/ssm-conv.cu
index 85e82b5a..69985cd3 100644
--- a/ggml/src/ggml-cuda/ssm-conv.cu
+++ b/ggml/src/ggml-cuda/ssm-conv.cu
@@ -76,7 +76,7 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0,
     int row = tid / load_cols;
     int col = tid % load_cols;
 #pragma unroll
-    for (int idx = tid; idx < total_elems; idx += split_d_inner) {
+    for (int idx = 0; idx < total_elems; idx += split_d_inner) {
         if (row < (int)split_d_inner) {
             smem[row * n_cols + col] = x_block[row * stride_x + col];
         }
@@ -84,6 +84,9 @@ static __global__ void ssm_conv_long_token_f32(const float * __restrict__ src0,
         col += split_d_inner;
         row += col / load_cols;
         col = col % load_cols;
+        if (idx >= total_elems - tid - split_d_inner) {
+            break;
+        }
     }
     __syncthreads();
 