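cuda: fix race condition in cumsum (llama/18448)

Move the `__syncthreads()` barrier in both `cumsum_cub_kernel` and `cumsum_kernel` so that it executes before thread 0 updates the running carry (`block_carry` / `*s_carry`) instead of after it. In `cumsum_cub_kernel` every thread reads `block_carry` when computing its `thread_offset`, so placing the barrier ahead of the update presumably prevents thread 0 from modifying the carry while other threads are still using the previous value. The patch also re-indents one `#pragma unroll` line, with no functional effect.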

author Aman Gupta <redacted>
Mon, 29 Dec 2025 06:07:17 +0000 (14:07 +0800)
committer Georgi Gerganov <redacted>
Wed, 31 Dec 2025 15:52:09 +0000 (17:52 +0200)
diff --git a/ggml/src/ggml-cuda/cumsum.cu b/ggml/src/ggml-cuda/cumsum.cu

index e82171f9c2ab2d0a0b6c5a0561a586c3952c5534..3bd1394c51aff04cf2dd53e331ca3dc2f4b2d733 100644 (file)
--- a/ggml/src/ggml-cuda/cumsum.cu
+++ b/ggml/src/ggml-cuda/cumsum.cu
@@ -61,7 +61,7 @@ static __global__ void cumsum_cub_kernel(
  
          // Add offset to each item and store
          T thread_offset = thread_prefix - thread_sum + block_carry;
-        #pragma unroll
+#pragma unroll
          for (int i = 0; i < UNROLL_FACTOR; i++) {
              int64_t idx = start + tid * UNROLL_FACTOR + i;
              if (idx < ne00) {
@@ -69,11 +69,12 @@ static __global__ void cumsum_cub_kernel(
              }
          }
  
+        __syncthreads();
+
          // Update carry for next tile
          if (tid == 0) {
              block_carry += block_total;
          }
-        __syncthreads();
      }
  #else
      NO_DEVICE_CODE;
@@ -175,11 +176,12 @@ static __global__ void cumsum_kernel(
              }
          }
  
+        __syncthreads();
+
          // Update carry for next chunk
          if (tid == 0) {
              *s_carry += *s_chunk_total;
          }
-        __syncthreads();
      }
  }
author	Aman Gupta <redacted>
	Mon, 29 Dec 2025 06:07:17 +0000 (14:07 +0800)
committer	Georgi Gerganov <redacted>
	Wed, 31 Dec 2025 15:52:09 +0000 (17:52 +0200)