git.djapps.eu Git - pkg/ggml/sources/whisper.cpp/commitdiff
cuda: fix race condition in cumsum (llama/18448)
author Aman Gupta <redacted>
Mon, 29 Dec 2025 06:07:17 +0000 (14:07 +0800)
committer Georgi Gerganov <redacted>
Wed, 31 Dec 2025 15:52:09 +0000 (17:52 +0200)
* ggml-cuda: fix race condition in cumsum

* remove unnecessary sync_threads

ggml/src/ggml-cuda/cumsum.cu

index e82171f9c2ab2d0a0b6c5a0561a586c3952c5534..3bd1394c51aff04cf2dd53e331ca3dc2f4b2d733 100644 (file)
@@ -61,7 +61,7 @@ static __global__ void cumsum_cub_kernel(
 
         // Add offset to each item and store
         T thread_offset = thread_prefix - thread_sum + block_carry;
-        #pragma unroll
+#pragma unroll
         for (int i = 0; i < UNROLL_FACTOR; i++) {
             int64_t idx = start + tid * UNROLL_FACTOR + i;
             if (idx < ne00) {
@@ -69,11 +69,12 @@ static __global__ void cumsum_cub_kernel(
             }
         }
 
+        __syncthreads();
+
         // Update carry for next tile
         if (tid == 0) {
             block_carry += block_total;
         }
-        __syncthreads();
     }
 #else
     NO_DEVICE_CODE;
@@ -175,11 +176,12 @@ static __global__ void cumsum_kernel(
             }
         }
 
+        __syncthreads();
+
         // Update carry for next chunk
         if (tid == 0) {
             *s_carry += *s_chunk_total;
         }
-        __syncthreads();
     }
 }