From: leejet
Date: Fri, 24 Oct 2025 19:39:37 +0000 (+0800)
Subject: ggml: fix CUDA grid launch condition for large block_nums.y in binbcast (llama/16742)
X-Git-Tag: upstream/1.8.3~422
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=5166efa7f0b81bfcb6da6cbb2d5ff4531810f0b3;p=pkg%2Fggml%2Fsources%2Fwhisper.cpp

ggml: fix CUDA grid launch condition for large block_nums.y in binbcast (llama/16742)

* Fix CUDA grid launch condition for large block_nums.y

* add backend ops test

* reduce test repetitions
---

diff --git a/ggml/src/ggml-cuda/binbcast.cu b/ggml/src/ggml-cuda/binbcast.cu
index 60240102..0e6d777b 100644
--- a/ggml/src/ggml-cuda/binbcast.cu
+++ b/ggml/src/ggml-cuda/binbcast.cu
@@ -272,7 +272,7 @@ static void launch_bin_bcast_pack(const ggml_tensor * src0, const ggml_tensor *
     const uint3 ne12 = init_fastdiv_values((uint32_t) cne1[2]);
     const uint3 ne13 = init_fastdiv_values((uint32_t) cne1[3]);
 
-    if (block_nums.z > 65535) {
+    if (block_nums.z > 65535 || block_nums.y > 65535) {
         int block_num = (ne0 * ne1 * ne2 * ne3 + block_size - 1) / block_size;
         const uint3 prod_012 = init_fastdiv_values((uint32_t) (ne0 * ne1 * ne2));
         const uint3 prod_01 = init_fastdiv_values((uint32_t) (ne0 * ne1));