Added comments explaining thread block size selection logic based on row count and...

author Aadeshveer Singh <redacted>

Sat, 20 Dec 2025 11:28:57 +0000 (16:58 +0530)

committer Georgi Gerganov <redacted>

Wed, 31 Dec 2025 15:52:09 +0000 (17:52 +0200)
author Aadeshveer Singh <redacted>
Sat, 20 Dec 2025 11:28:57 +0000 (16:58 +0530)
committer Georgi Gerganov <redacted>
Wed, 31 Dec 2025 15:52:09 +0000 (17:52 +0200)
diff --git a/ggml/src/ggml-cuda/mean.cu b/ggml/src/ggml-cuda/mean.cu

index 347abc18660ca540156d8a9c7e7548c257022f14..691d8dcb1485ff20d8744b120a483f998f592d84 100644 (file)
--- a/ggml/src/ggml-cuda/mean.cu
+++ b/ggml/src/ggml-cuda/mean.cu
@@ -63,6 +63,9 @@ void ggml_cuda_op_mean(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
  
      const int id  = ggml_cuda_get_device();
      const int nsm = ggml_cuda_info().devices[id].nsm;
+
+    // Heuristic for block size selection to optimize occupancy:
+    // when there are fewer than two rows per SM (nrows / nsm < 2), launch larger 512-thread blocks.
+    // See discussion in: https://github.com/ggml-org/llama.cpp/pull/15132
      if ((nrows / nsm) < 2) {
          const dim3 block_dims(512, 1, 1);
          reduce_rows_f32</*norm=*/true><<<block_nums, block_dims, 0, stream>>>(src0_d, dst_d, ncols);
author	Aadeshveer Singh <redacted>
	Sat, 20 Dec 2025 11:28:57 +0000 (16:58 +0530)
committer	Georgi Gerganov <redacted>
	Wed, 31 Dec 2025 15:52:09 +0000 (17:52 +0200)