vulkan: improve topk perf for large k, fix overflow in unit tests (llama/17582)

author Jeff Bolz <redacted>

Sat, 29 Nov 2025 07:39:57 +0000 (01:39 -0600)

committer Georgi Gerganov <redacted>

Fri, 12 Dec 2025 15:53:12 +0000 (17:53 +0200)
author Jeff Bolz <redacted>
Sat, 29 Nov 2025 07:39:57 +0000 (01:39 -0600)
committer Georgi Gerganov <redacted>
Fri, 12 Dec 2025 15:53:12 +0000 (17:53 +0200)
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp

index 73562bc1be05a20c99b930fbe4d09ca1431a510d..f3aba8165b73e19589360766065bacdf8e40680a 100644 (file)
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -10239,7 +10239,9 @@ static void ggml_vk_topk(ggml_backend_vk_context * ctx, vk_context& subctx, cons
  
          // Prefer going as small as num_topk_pipelines - 3 for perf reasons.
          // But if K is larger, then we need a larger workgroup
-        uint32_t max_pipeline = num_topk_pipelines - 3;
+        uint32_t max_pipeline = num_topk_pipelines - 1;
+        uint32_t preferred_pipeline = std::max(num_topk_pipelines - 3, (uint32_t)log2f(float(k)) + 2);
+        max_pipeline = std::min(preferred_pipeline, max_pipeline);
          uint32_t min_pipeline = (uint32_t)log2f(float(k)) + 1;
          // require full subgroup
          min_pipeline = std::max(min_pipeline, ctx->device->subgroup_size_log2);
author	Jeff Bolz <redacted>
	Sat, 29 Nov 2025 07:39:57 +0000 (01:39 -0600)
committer	Georgi Gerganov <redacted>
	Fri, 12 Dec 2025 15:53:12 +0000 (17:53 +0200)