         ids_dst_shared[j] = j;
     }
+    __syncthreads();
     // On AMD or old CUDA the performance with stream-k was worse, use conventional tiling instead:
 #if (defined(GGML_USE_HIP) && defined(__HIP_PLATFORM_AMD__)) || __CUDA_ARCH__ < GGML_CUDA_CC_VOLTA
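
Every thread in the block writes part of `ids_dst_shared` in the loop above, and other threads later read slots they did not write themselves, so without the added `__syncthreads()` a thread could read an index that has not been initialized yet. The same reasoning drives the barriers added below, first in the conventional-tiling path (used on AMD and on NVIDIA GPUs older than Volta) and then in the stream-k path.
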
                 return;
             }
+            // __syncthreads(); // There is no previous tile that could cause a race condition.
 #pragma unroll
             for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
                 const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
                 ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
             }
+            __syncthreads();
         }
         offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
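
In the conventional-tiling path each block handles exactly one tile, so when the MoE indices are loaded from `ids_dst` there is no earlier reader of `ids_dst_shared` to race with; the leading barrier is therefore left as a comment documenting why it can be omitted. The trailing `__syncthreads()` is still needed so that all loaded indices are visible before any thread uses them for the writeback.
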
                 continue;
             }
+            __syncthreads();
 #pragma unroll
             for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
                 const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
                 ids_dst_shared[j] = ids_dst[col_low + jt*mmq_x + j];
             }
+            __syncthreads();
         }
         offset_y += (col_low + jt*mmq_x)*(sizeof(block_q8_1_mmq)/sizeof(int));
     }
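
The stream-k path, by contrast, refills `ids_dst_shared` once per tile inside a loop, so the load needs a barrier on both sides: one before it, so that no thread is still reading the previous tile's indices when they are overwritten (a write-after-read hazard), and one after it, so that the new indices are visible to every thread before use (a read-after-write hazard). A minimal, self-contained sketch of that double-barrier pattern, with hypothetical names and sizes rather than the actual llama.cpp code:

```cpp
#include <cstdio>

#define TILE 256 // hypothetical tile width, also used as the block size

__global__ void tile_loop(const int * __restrict__ ids, int * __restrict__ out, int ntiles) {
    __shared__ int ids_shared[TILE];

    for (int tile = 0; tile < ntiles; ++tile) {
        __syncthreads(); // WAR: no thread may still be reading the previous tile's ids

        ids_shared[threadIdx.x] = ids[tile*TILE + threadIdx.x];

        __syncthreads(); // RAW: the new ids must be visible to all threads before use

        // Each thread reads a slot that a *different* thread wrote, which is
        // exactly why both barriers are needed:
        const int j = (threadIdx.x + 1) % TILE;
        out[tile*TILE + threadIdx.x] = ids_shared[j];
    }
}

int main() {
    const int ntiles = 4;
    int *ids, *out;
    cudaMallocManaged(&ids, ntiles*TILE*sizeof(int));
    cudaMallocManaged(&out, ntiles*TILE*sizeof(int));
    for (int i = 0; i < ntiles*TILE; ++i) {
        ids[i] = i;
    }
    tile_loop<<<1, TILE>>>(ids, out, ntiles);
    cudaDeviceSynchronize();
    printf("out[0] = %d\n", out[0]); // 1: thread 0 reads the slot thread 1 wrote
    cudaFree(ids);
    cudaFree(out);
    return 0;
}
```
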
     // The memory layout for the fixup buffer is always contiguous, therefore reset ids:
+    __syncthreads();
 #pragma unroll
     for (int j0 = 0; j0 < mmq_x; j0 += nwarps*WARP_SIZE) {
         const int j = j0 + threadIdx.y*WARP_SIZE + threadIdx.x;
         ids_dst_shared[j] = j;
     }
+    __syncthreads();
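
The last hunk resets the indices to the identity before partial results go to the fixup buffer: the same writeback routine serves both destinations, and with `ids_dst_shared[j] == j` its scattered stores degenerate into a dense, contiguous copy matching the fixup buffer's layout. A toy illustration of that degenerate case, again with hypothetical names rather than the actual llama.cpp code:

```cpp
#include <cstdio>

#define MMQ_X 8 // hypothetical tile width, one thread per tile row

// One store routine serves both writeback targets; only the ids differ.
__global__ void store_rows(const float * __restrict__ acc, const int * __restrict__ ids,
                           float * __restrict__ dst) {
    const int j = threadIdx.x;
    dst[ids[j]] = acc[j]; // identity ids => contiguous stores
}

int main() {
    float *acc, *dst;
    int   *ids;
    cudaMallocManaged(&acc, MMQ_X*sizeof(float));
    cudaMallocManaged(&dst, MMQ_X*sizeof(float));
    cudaMallocManaged(&ids, MMQ_X*sizeof(int));
    for (int j = 0; j < MMQ_X; ++j) {
        acc[j] = 10.0f*j;
        ids[j] = j; // identity, as after the reset above
    }
    store_rows<<<1, MMQ_X>>>(acc, ids, dst);
    cudaDeviceSynchronize();
    for (int j = 0; j < MMQ_X; ++j) {
        printf("dst[%d] = %.0f\n", j, dst[j]); // 0, 10, 20, ... in order
    }
    cudaFree(acc);
    cudaFree(dst);
    cudaFree(ids);
    return 0;
}
```
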