opencl: fix rms_norm_mul (llama/17250)

author lhez <redacted>

Sun, 16 Nov 2025 01:40:14 +0000 (17:40 -0800)

committer Georgi Gerganov <redacted>

Mon, 17 Nov 2025 19:05:46 +0000 (21:05 +0200)
author lhez <redacted>
Sun, 16 Nov 2025 01:40:14 +0000 (17:40 -0800)
committer Georgi Gerganov <redacted>
Mon, 17 Nov 2025 19:05:46 +0000 (21:05 +0200)
diff --git a/ggml/src/ggml-opencl/ggml-opencl.cpp b/ggml/src/ggml-opencl/ggml-opencl.cpp

index b0abfa3c1d40bb1399b33fce68feb3d77fd6cb9b..4cb6afe927191632081a393d5a021c5751ac9c31 100644 (file)
--- a/ggml/src/ggml-opencl/ggml-opencl.cpp
+++ b/ggml/src/ggml-opencl/ggml-opencl.cpp
@@ -5705,7 +5705,7 @@ static void ggml_opencl_op_rms_norm_fused(ggml_backend_t backend, ggml_tensor *
      CL_CHECK(clSetKernelArg(kernel, 21, sizeof(cl_ulong),      &nb2));
      CL_CHECK(clSetKernelArg(kernel, 22, sizeof(cl_ulong),      &nb3));
      CL_CHECK(clSetKernelArg(kernel, 23, sizeof(float),         &eps));
-    CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*nth/sgs, NULL));
+    CL_CHECK(clSetKernelArg(kernel, 24, sizeof(float)*sgs,     NULL));
  
      backend_ctx->enqueue_ndrange_kernel(kernel, 3, global_work_size, local_work_size, dst);
  }
diff --git a/ggml/src/ggml-opencl/kernels/rms_norm.cl b/ggml/src/ggml-opencl/kernels/rms_norm.cl

index ecd053cb4c1ce7556c57d765fa4ba3abf112b90d..4b18d17d6f8fdef2e0a5b04c825e84bea850920a 100644 (file)
--- a/ggml/src/ggml-opencl/kernels/rms_norm.cl
+++ b/ggml/src/ggml-opencl/kernels/rms_norm.cl
@@ -134,6 +134,15 @@ kernel void kernel_rms_norm_mul(
      src1 = src1 + offset1;
      dst  = dst  + offsetd;
  
+    // The size of sum is sizeof(float)*subgroup_size.
+    // Each subgroup writes its partial sum to this array.
+    // So the number of subgroups per workgroup for this kernel cannot exceed the subgroup size.
+    // This is generally true -
+    // for subgroup size 64, workgroup size should be less than 4096 (the max is usually 1024).
+    if (get_sub_group_id() == 0) {
+        sum[get_sub_group_local_id()] = 0.0f;
+    }
+
      int i03 = get_group_id(2);
      int i02 = get_group_id(1);
      int i01 = get_group_id(0);
@@ -148,24 +157,30 @@ kernel void kernel_rms_norm_mul(
          sumf += dot(x[i00], x[i00]);
      }
      sumf = sub_group_reduce_add(sumf);
+
+    barrier(CLK_LOCAL_MEM_FENCE);
+
      if (get_sub_group_local_id() == 0) {
          sum[get_sub_group_id()] = sumf;
      }
  
      barrier(CLK_LOCAL_MEM_FENCE);
  
-    for (uint i = get_local_size(0) / get_max_sub_group_size() / 2; i > 0; i /= 2) {
-       if (get_local_id(0) < i) {
-           sum[get_local_id(0)] += sum[get_local_id(0) + i];
-       }
-    }
-    if (get_local_id(0) == 0) {
-        sum[0] /= ne00;
-    }
+    //for (uint i = get_local_size(0) / get_max_sub_group_size() / 2; i > 0; i /= 2) {
+    //   if (get_local_id(0) < i) {
+    //       sum[get_local_id(0)] += sum[get_local_id(0) + i];
+    //   }
+    //}
+    //if (get_local_id(0) == 0) {
+    //    sum[0] /= ne00;
+    //}
  
-    barrier(CLK_LOCAL_MEM_FENCE);
+    //barrier(CLK_LOCAL_MEM_FENCE);
+
+    sumf = sum[get_sub_group_local_id()];
+    sumf = sub_group_reduce_add(sumf);
  
-    float mean  = sum[0];
+    float mean  = sumf / ne00;
      float scale = 1.0f/sqrt(mean + eps);
  
      global float4 * y = (global float4 *) (dst + i03*nb3 + i02*nb2 + i01*nb1);
author	lhez <redacted>
	Sun, 16 Nov 2025 01:40:14 +0000 (17:40 -0800)
committer	Georgi Gerganov <redacted>
	Mon, 17 Nov 2025 19:05:46 +0000 (21:05 +0200)
ggml/src/ggml-opencl/ggml-opencl.cpp		patch \| blob \| history
ggml/src/ggml-opencl/kernels/rms_norm.cl		patch \| blob \| history