metal: optimise `GGML_OP_SUM` (llama/16559)

author Sam/Samuel <redacted>

Wed, 15 Oct 2025 14:05:56 +0000 (23:05 +0900)

committer Georgi Gerganov <redacted>

Wed, 22 Oct 2025 09:58:11 +0000 (12:58 +0300)
author Sam/Samuel <redacted>
Wed, 15 Oct 2025 14:05:56 +0000 (23:05 +0900)
committer Georgi Gerganov <redacted>
Wed, 22 Oct 2025 09:58:11 +0000 (12:58 +0300)
diff --git a/ggml/src/ggml-cuda/ggml-cuda.cu b/ggml/src/ggml-cuda/ggml-cuda.cu

index a5e77672f6e95e2fdd0531a5ef39bfb86bb47951..75fd6db14c514566640472ef4627db0e036f51d7 100644 (file)
--- a/ggml/src/ggml-cuda/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda/ggml-cuda.cu
@@ -3625,9 +3625,10 @@ static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const g
          case GGML_OP_CONV_2D_DW:
          case GGML_OP_CONV_TRANSPOSE_2D:
          case GGML_OP_POOL_2D:
-        case GGML_OP_SUM:
          case GGML_OP_ACC:
              return true;
+        case GGML_OP_SUM:
+            return ggml_is_contiguous_rows(op->src[0]);
          case GGML_OP_ARGSORT:
              // TODO: Support arbitrary column width
              return op->src[0]->ne[0] <= 1024;
diff --git a/ggml/src/ggml-metal/ggml-metal-device.m b/ggml/src/ggml-metal/ggml-metal-device.m

index 553cf8f5f39ac8a6bdc58262ee2973468ce29407..c3c83abe4e63e55fd214281d0196ce73a32972a9 100644 (file)
--- a/ggml/src/ggml-metal/ggml-metal-device.m
+++ b/ggml/src/ggml-metal/ggml-metal-device.m
@@ -662,6 +662,7 @@ bool ggml_metal_device_supports_op(ggml_metal_device_t dev, const struct ggml_te
          case GGML_OP_LOG:
              return ggml_is_contiguous(op->src[0]) && op->src[0]->type == GGML_TYPE_F32;
          case GGML_OP_SUM:
+            return has_simdgroup_reduction && ggml_is_contiguous(op->src[0]);
          case GGML_OP_SUM_ROWS:
          case GGML_OP_MEAN:
          case GGML_OP_SOFT_MAX:
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp

index 784b7b77851e6873f9c8c4c97c417ceb4fc0211b..4f9f6bda00a7901f21b206f308e51d5a9b7c6652 100644 (file)
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -866,12 +866,25 @@ int ggml_metal_op_sum(ggml_metal_op_t ctx, int idx) {
  
      ggml_metal_pipeline_t pipeline = ggml_metal_library_get_pipeline_sum(lib, op);
  
+    int nth = 32; // SIMD width
+
+    while (nth < (int) n && nth < ggml_metal_pipeline_max_theads_per_threadgroup(pipeline)) {
+        nth *= 2;
+    }
+
+    nth = std::min(nth, ggml_metal_pipeline_max_theads_per_threadgroup(pipeline));
+    nth = std::min(nth, (int) n);
+
+    const int nsg = (nth + 31) / 32;
+
      ggml_metal_encoder_set_pipeline(enc, pipeline);
      ggml_metal_encoder_set_bytes   (enc, &args, sizeof(args), 0);
      ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op->src[0]), 1);
      ggml_metal_encoder_set_buffer  (enc, ggml_metal_get_buffer_id(op),         2);
  
-    ggml_metal_encoder_dispatch_threadgroups(enc, 1, 1, 1, 1, 1, 1);
+    ggml_metal_encoder_set_threadgroup_memory_size(enc, nsg * sizeof(float), 0);
+
+    ggml_metal_encoder_dispatch_threadgroups(enc, 1, 1, 1, nth, 1, 1);
  
      return 1;
  }
diff --git a/ggml/src/ggml-metal/ggml-metal.metal b/ggml/src/ggml-metal/ggml-metal.metal

index 6d39ddcc634ef735e379a5710b410f2a17a25022..496610b154b6d7642be0e0d151c0cc4f7af872d0 100644 (file)
--- a/ggml/src/ggml-metal/ggml-metal.metal
+++ b/ggml/src/ggml-metal/ggml-metal.metal
@@ -1727,18 +1727,48 @@ kernel void kernel_op_sum_f32(
          constant ggml_metal_kargs_sum & args,
          device const float * src0,
          device       float * dst,
-        ushort  tiitg[[thread_index_in_threadgroup]]) {
+        threadgroup  float * shmem_f32 [[threadgroup(0)]],
+        uint3   tgpig[[threadgroup_position_in_grid]],
+        ushort3 tpitg[[thread_position_in_threadgroup]],
+        ushort  sgitg[[simdgroup_index_in_threadgroup]],
+        ushort  tiisg[[thread_index_in_simdgroup]],
+        ushort3   ntg[[threads_per_threadgroup]]) {
  
-    if (tiitg != 0) {
+    if (args.np == 0) {
          return;
      }
  
-    float acc = 0.0f;
-    for (ulong i = 0; i < args.np; ++i) {
-        acc += src0[i];
+    const uint nsg = (ntg.x + 31) / 32;
+
+    float sumf = 0;
+
+    for (int64_t i0 = tpitg.x; i0 < args.np; i0 += ntg.x) {
+        sumf += src0[i0];
      }
  
-    dst[0] = acc;
+    sumf = simd_sum(sumf);
+
+    if (tiisg == 0) {
+        shmem_f32[sgitg] = sumf;
+    }
+
+    threadgroup_barrier(mem_flags::mem_threadgroup);
+
+    float total = 0;
+
+    if (sgitg == 0) {
+        float v = 0;
+
+        if (tpitg.x < nsg) {
+            v = shmem_f32[tpitg.x];
+        }
+
+        total = simd_sum(v);
+
+        if (tpitg.x == 0) {
+            dst[0] = total;
+        }
+    }
  }
  
  template <bool norm>
author	Sam/Samuel <redacted>
	Wed, 15 Oct 2025 14:05:56 +0000 (23:05 +0900)
committer	Georgi Gerganov <redacted>
	Wed, 22 Oct 2025 09:58:11 +0000 (12:58 +0300)
ggml/src/ggml-cuda/ggml-cuda.cu		patch | blob | history
ggml/src/ggml-metal/ggml-metal-device.m		patch | blob | history
ggml/src/ggml-metal/ggml-metal-ops.cpp		patch | blob | history
ggml/src/ggml-metal/ggml-metal.metal		patch | blob | history