From: Aman Gupta
Date: Mon, 29 Dec 2025 06:07:17 +0000 (+0800)
Subject: cuda: fix race condition in cumsum (llama/18448)
X-Git-Tag: upstream/1.8.3~72
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=e49e88b2d81e8c5ea7f869122677845821619f90;p=pkg%2Fggml%2Fsources%2Fwhisper.cpp

cuda: fix race condition in cumsum (llama/18448)

* ggml-cuda: fix race condition in cumsum

* remove unnecessary sync_threads
---

diff --git a/ggml/src/ggml-cuda/cumsum.cu b/ggml/src/ggml-cuda/cumsum.cu
index e82171f9..3bd1394c 100644
--- a/ggml/src/ggml-cuda/cumsum.cu
+++ b/ggml/src/ggml-cuda/cumsum.cu
@@ -61,7 +61,7 @@ static __global__ void cumsum_cub_kernel(
         // Add offset to each item and store
         T thread_offset = thread_prefix - thread_sum + block_carry;
 
-        #pragma unroll
+#pragma unroll
         for (int i = 0; i < UNROLL_FACTOR; i++) {
             int64_t idx = start + tid * UNROLL_FACTOR + i;
             if (idx < ne00) {
@@ -69,11 +69,12 @@ static __global__ void cumsum_cub_kernel(
             }
         }
 
+        __syncthreads();
+
         // Update carry for next tile
         if (tid == 0) {
             block_carry += block_total;
         }
-        __syncthreads();
     }
 #else
     NO_DEVICE_CODE;
@@ -175,11 +176,12 @@ static __global__ void cumsum_kernel(
             }
         }
 
+        __syncthreads();
+
         // Update carry for next chunk
         if (tid == 0) {
             *s_carry += *s_chunk_total;
         }
-        __syncthreads();
     }
 }
 
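Why the barrier moves: in both cumsum_cub_kernel and cumsum_kernel, a shared running carry (block_carry in the CUB path, *s_carry in the fallback) is read by every thread when the offsets for the current tile are computed, but written only by thread 0 afterwards. With the barrier placed after the write, thread 0 can overwrite the carry while a slower thread in the same iteration is still adding the old value; moving __syncthreads() between the last read and the update closes that window. The trailing barrier is then dropped as unnecessary, presumably because the next iteration synchronizes again before the carry is read. Below is a minimal, self-contained sketch of the same pattern, not the ggml kernel itself: the name cumsum_carry_demo, the BLOCK_SIZE/TILE constants and the naive Hillis-Steele tile scan are stand-ins chosen to keep the example short.

// Minimal, self-contained sketch of the barrier placement fixed above.
// NOT the ggml kernel: cumsum_carry_demo, BLOCK_SIZE/TILE and the naive
// Hillis-Steele tile scan are illustrative stand-ins.
#include <cstdio>
#include <cuda_runtime.h>

#define BLOCK_SIZE 256
#define TILE       BLOCK_SIZE   // one element per thread per tile

// Inclusive cumulative sum of src[0..n) into dst, computed by a single block
// that walks the row tile by tile and keeps the running total in shared memory.
__global__ void cumsum_carry_demo(const float * src, float * dst, int n) {
    __shared__ float carry;        // sum of all previous tiles
    __shared__ float tile[TILE];   // scratch for the per-tile scan

    const int tid = threadIdx.x;
    if (tid == 0) {
        carry = 0.0f;
    }
    __syncthreads();

    for (int start = 0; start < n; start += TILE) {
        const int idx = start + tid;
        tile[tid] = (idx < n) ? src[idx] : 0.0f;
        __syncthreads();

        // Naive inclusive scan of the tile (Hillis-Steele): read, sync, write, sync.
        for (int offset = 1; offset < TILE; offset *= 2) {
            const float v = (tid >= offset) ? tile[tid - offset] : 0.0f;
            __syncthreads();
            tile[tid] += v;
            __syncthreads();
        }

        // Every thread reads the shared carry here ...
        if (idx < n) {
            dst[idx] = tile[tid] + carry;
        }
        const float tile_total = tile[TILE - 1];   // valid after the scan's last barrier

        // ... so the barrier has to sit between those reads and the update below.
        // With the barrier after the update (the old placement), thread 0 could
        // overwrite `carry` while a slower thread was still adding the old value.
        __syncthreads();

        if (tid == 0) {
            carry += tile_total;
        }
        // No trailing __syncthreads() needed: the barrier after the tile load at
        // the top of the next iteration already orders this write before the next
        // reads of `carry`.
    }
}

int main() {
    const int n = 1000;
    float h_src[n], h_dst[n];
    for (int i = 0; i < n; i++) {
        h_src[i] = 1.0f;   // expected output: dst[i] == i + 1
    }

    float *d_src, *d_dst;
    cudaMalloc(&d_src, n * sizeof(float));
    cudaMalloc(&d_dst, n * sizeof(float));
    cudaMemcpy(d_src, h_src, n * sizeof(float), cudaMemcpyHostToDevice);

    cumsum_carry_demo<<<1, BLOCK_SIZE>>>(d_src, d_dst, n);
    cudaMemcpy(h_dst, d_dst, n * sizeof(float), cudaMemcpyDeviceToHost);

    printf("dst[0]=%.0f dst[%d]=%.0f (expected 1 and %d)\n", h_dst[0], n - 1, h_dst[n - 1], n);

    cudaFree(d_src);
    cudaFree(d_dst);
    return 0;
}

Compiled with nvcc and run as-is, the demo prints the expected running totals. With the barrier moved back below the carry update, a delayed thread may read a carry that already includes the current tile and write a value that is too large, which is the race the patch removes from both kernels.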