CUDA: fix race conditions FlashAttention kernels (#13438)
author Johannes Gäßler <redacted>
Sat, 10 May 2025 20:22:48 +0000 (22:22 +0200)
committer GitHub <redacted>
Sat, 10 May 2025 20:22:48 +0000 (22:22 +0200)
ggml/src/ggml-cuda/fattn-mma-f16.cuh
ggml/src/ggml-cuda/fattn-vec-f16.cuh

diff --git a/ggml/src/ggml-cuda/fattn-mma-f16.cuh b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
index b2f95fa3f00e69fc04472b5798af151978a8dd33..9873ea755a599767ae4e6904415daf881535f911 100644
--- a/ggml/src/ggml-cuda/fattn-mma-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-mma-f16.cuh
@@ -874,6 +874,8 @@ static __device__ __forceinline__ void flash_attn_ext_f16_process_tile(
             }
         }
 
+        __syncthreads();
+
         // Write back combined meta data:
 #pragma unroll
         for (int imeta = 0; imeta < nmeta; ++imeta) {
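
The barrier added above makes the whole block wait until every thread has finished reading the shared meta-data buffer before any thread begins overwriting it with the combined meta data. Below is a minimal, self-contained CUDA sketch of that write-after-read hazard; the kernel, buffer, and all names are hypothetical stand-ins for illustration, not the actual FlashAttention code.

__global__ void combine_sketch(float * out) {
    const int tid = threadIdx.x;
    __shared__ float meta[128];        // hypothetical shared scratch buffer

    meta[tid] = (float) tid;           // phase 1: each thread publishes a partial result
    __syncthreads();                   // make the writes visible before anyone reads

    // phase 2: each thread reads a slot written by a *different* thread
    const float combined = meta[(tid + 1) % 128];

    __syncthreads();                   // the added barrier: without it, the write
                                       // below can race with reads still in flight
                                       // on slower threads

    meta[tid] = combined;              // phase 3: the buffer is reused ("write back")
    __syncthreads();

    out[tid] = meta[tid];
}

Launched as combine_sketch<<<1, 128>>>(out), removing the middle barrier lets a fast thread overwrite meta[] while a slower thread is still reading its neighbor's slot, which is the class of race this hunk closes.
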
diff --git a/ggml/src/ggml-cuda/fattn-vec-f16.cuh b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
index ef0addc1dbc70e6373ca5ab5b79626ee1574ff90..d96e392129848374f8eeee445a04d32717f86cd7 100644
--- a/ggml/src/ggml-cuda/fattn-vec-f16.cuh
+++ b/ggml/src/ggml-cuda/fattn-vec-f16.cuh
@@ -168,6 +168,7 @@ static __global__ void flash_attn_vec_ext_f16(
     for (int j = 0; j < ncols; ++j) {
         KQ[j*D + tid] = -HALF_MAX_HALF;
     }
+    __syncthreads();
 
     half2 VKQ[ncols] = {{0.0f, 0.0f}};
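
Here the race runs in the other direction: KQ is initialized cooperatively, one slot per thread, and no thread may read it until every thread has written its slot. A minimal CUDA sketch of that read-after-write hazard follows, with hypothetical sizes and names; the sentinel value merely stands in for -HALF_MAX_HALF.

#include <cuda_fp16.h>

__global__ void init_sketch(half * dst) {
    const int tid = threadIdx.x;
    __shared__ half KQ[128];           // hypothetical cooperative buffer

    KQ[tid] = __float2half(-1e4f);     // sentinel init, one slot per thread

    __syncthreads();                   // the added barrier: guarantees no thread
                                       // reads a slot another thread has not
                                       // initialized yet

    dst[tid] = KQ[(tid + 1) % 128];    // reads a slot written by another thread
}

Without the barrier, dst may receive whatever happened to sit in shared memory before a slower thread wrote its sentinel; with it, the cooperative initialization is guaranteed complete before any read.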