CUDA: fix race conditions FlashAttention kernels (#13438)
author Johannes Gäßler <redacted>
Sat, 10 May 2025 20:22:48 +0000 (22:22 +0200)
committer GitHub <redacted>
Sat, 10 May 2025 20:22:48 +0000 (22:22 +0200)
ggml/src/ggml-cuda/fattn-mma-f16.cuh
ggml/src/ggml-cuda/fattn-vec-f16.cuh

diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
index b2f95fa3f00e69fc04472b5798af151978a8dd33..9873ea755a599767ae4e6904415daf881535f911 100644
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -874,6 +874,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
             }
         }
 
+        __syncthreads();
+
         // Write back combined meta data:
 #pragma unroll
         for (int imeta = 0; imeta < nmeta; ++imeta) {
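
The barrier added above makes the whole block wait until every thread has finished reading the shared meta-data buffer before any thread begins overwriting it with the combined meta data. Below is a minimal, self-contained CUDA sketch of that write-after-read hazard; the kernel, buffer, and all names are hypothetical stand-ins for illustration, not the actual FlashAttention code.

__global__ void combine_sketch(float * out) {
    const int tid = threadIdx.x;
    __shared__ float meta[128];        // hypothetical shared scratch buffer

    meta[tid] = (float) tid;           // phase 1: each thread publishes a partial result
    __syncthreads();                   // make the writes visible before anyone reads

    // phase 2: each thread reads a slot written by a *different* thread
    const float combined = meta[(tid + 1) % 128];

    __syncthreads();                   // the added barrier: without it, the write
                                       // below can race with reads still in flight
                                       // on slower threads

    meta[tid] = combined;              // phase 3: the buffer is reused ("write back")
    __syncthreads();

    out[tid] = meta[tid];
}

Launched as combine_sketch<<<1, 128>>>(out), removing the middle barrier lets a fast thread overwrite meta[] while a slower thread is still reading its neighbor's slot, which is the class of race this hunk closes.
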
diff --git a/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
index ef0addc1dbc70e6373ca5ab5b79626ee1574ff90..d96e392129848374f8eeee445a04d32717f86cd7 100644
--- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
@@ -168,6 +168,7 @@ static __global__ void flash_attn_vec_ext_f16(
     for (int j = 0; j < ncols; ++j) {
         KQ[j*D + tid] = -HALF_MAX_HALF;
     }
+    __syncthreads();
 
     half2 VKQ[ncols] = {{0.0f, 0.0f}};
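
Here the race runs in the other direction: KQ is initialized cooperatively, one slot per thread, and no thread may read it until every thread has written its slot. A minimal CUDA sketch of that read-after-write hazard follows, with hypothetical sizes and names; the sentinel value merely stands in for -HALF_MAX_HALF.

#include <cuda_fp16.h>

__global__ void init_sketch(half * dst) {
    const int tid = threadIdx.x;
    __shared__ half KQ[128];           // hypothetical cooperative buffer

    KQ[tid] = __float2half(-1e4f);     // sentinel init, one slot per thread

    __syncthreads();                   // the added barrier: guarantees no thread
                                       // reads a slot another thread has not
                                       // initialized yet

    dst[tid] = KQ[(tid + 1) % 128];    // reads a slot written by another thread
}

Without the barrier, dst may receive whatever happened to sit in shared memory before a slower thread wrote its sentinel; with it, the cooperative initialization is guaranteed complete before any read.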