ggml webgpu: Fix bug in dispatching large matrix-vector multiplication (llama/19535)

author Reese Levine <redacted>

Wed, 18 Feb 2026 23:06:29 +0000 (16:06 -0700)

committer Georgi Gerganov <redacted>

Fri, 27 Feb 2026 18:57:58 +0000 (20:57 +0200)
author Reese Levine <redacted>
Wed, 18 Feb 2026 23:06:29 +0000 (16:06 -0700)
committer Georgi Gerganov <redacted>
Fri, 27 Feb 2026 18:57:58 +0000 (20:57 +0200)
diff --git a/ggml/src/ggml-webgpu/ggml-webgpu.cpp b/ggml/src/ggml-webgpu/ggml-webgpu.cpp

index 17bb2f47126247c228fd2b54cb41be9b011715fa..b5fee480562bd1bb3fdfdde10dc39d70eb362db3 100644 (file)
--- a/ggml/src/ggml-webgpu/ggml-webgpu.cpp
+++ b/ggml/src/ggml-webgpu/ggml-webgpu.cpp
@@ -1121,7 +1121,8 @@ static webgpu_command ggml_webgpu_mul_mat(webgpu_context & ctx,
          uint32_t batches       = dst->ne[2] * dst->ne[3];
          uint32_t output_groups = CEIL_DIV(dst->ne[0], decisions->outputs_per_wg);
          uint32_t total_wg      = output_groups * batches;
-        wg_x                   = total_wg % ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension;
+        // TODO: split large dispatches into multiple batches; once total_wg exceeds the per-dimension limit, wg_x * wg_y over-provisions workgroups beyond total_wg
+        wg_x = std::min(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension);
          wg_y = CEIL_DIV(total_wg, ctx->global_ctx->capabilities.limits.maxComputeWorkgroupsPerDimension);
      } else if (use_fast) {
          auto decisions = static_cast<ggml_webgpu_mul_mat_shader_decisions *>(pipeline.context.get());
author	Reese Levine <redacted>
	Wed, 18 Feb 2026 23:06:29 +0000 (16:06 -0700)
committer	Georgi Gerganov <redacted>
	Fri, 27 Feb 2026 18:57:58 +0000 (20:57 +0200)