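ggml: fix CUDA grid launch condition for large block_nums.y in binbcast (llama/16742)

CUDA limits the y- and z-dimensions of a launch grid to 65535 blocks each. The fallback path that computes a single flattened block count was previously taken only when block_nums.z exceeded that limit; it is now also taken when block_nums.y does.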

author leejet <redacted>

Fri, 24 Oct 2025 19:39:37 +0000 (03:39 +0800)

committer Georgi Gerganov <redacted>

Sun, 9 Nov 2025 21:38:03 +0000 (23:38 +0200)
author leejet <redacted>
Fri, 24 Oct 2025 19:39:37 +0000 (03:39 +0800)
committer Georgi Gerganov <redacted>
Sun, 9 Nov 2025 21:38:03 +0000 (23:38 +0200)
diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu

index 60240102741f35135533943b576e5f755fd8aae5..0e6d777b1e64a8b46cdaaa3e65f9c8c8d9028ced 100644 (file)
--- a/ggml/src/ggml-cuda/binbcast.cu
+++ b/ggml/src/ggml-cuda/binbcast.cu
@@ -272,7 +272,7 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
          const uint3 ne12 = init_fastdiv_values((uint32_t) cne1[2]);
          const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]);
  
-        if (block_nums.z > 65535) {
+        if (block_nums.z > 65535 || block_nums.y > 65535) {
              int         block_num  = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
              const uint3 prod_012    = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2));
              const uint3 prod_01     = init_fastdiv_values((uint32_t) (ne0 * ne1));
author	leejet <redacted>
	Fri, 24 Oct 2025 19:39:37 +0000 (03:39 +0800)
committer	Georgi Gerganov <redacted>
	Sun, 9 Nov 2025 21:38:03 +0000 (23:38 +0200)