CUDA: fix kernel selection logic for tile FA (llama/19686)

author Johannes Gäßler <redacted>

Thu, 19 Feb 2026 11:42:58 +0000 (12:42 +0100)

committer Georgi Gerganov <redacted>

Fri, 27 Feb 2026 18:57:58 +0000 (20:57 +0200)
author Johannes Gäßler <redacted>
Thu, 19 Feb 2026 11:42:58 +0000 (12:42 +0100)
committer Georgi Gerganov <redacted>
Fri, 27 Feb 2026 18:57:58 +0000 (20:57 +0200)
diff --git a/ggml/src/ggml-cuda/fattn-tile.cuh b/ggml/src/ggml-cuda/fattn-tile.cuh

index b6db5822818bd848d82f6c56845c55d2b2fe3dad..f3fa80ab23d09b3f8f634c1cb29326e1dfac2bbd 100644 (file)
--- a/ggml/src/ggml-cuda/fattn-tile.cuh
+++ b/ggml/src/ggml-cuda/fattn-tile.cuh
@@ -1186,8 +1186,10 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
      GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
      const int gqa_ratio = Q->ne[2] / K->ne[2];
  
+    // On NVIDIA (Pascal and older) the GQA optimizations seem to be detrimental in some cases.
+    // However, for DKQ == 576, DV == 512 only the kernel variant with GQA optimizations is implemented.
      const bool nvidia = GGML_CUDA_CC_IS_NVIDIA(ggml_cuda_info().devices[ggml_cuda_get_device()].cc);
-    const int gqa_limit = nvidia && gqa_ratio <= 4 ? 16 : INT_MAX;
+    const int gqa_limit = nvidia && gqa_ratio <= 4 && DV <= 256 ? 16 : INT_MAX;
      const bool use_gqa_opt = mask && max_bias == 0.0f && Q->ne[1] <= gqa_limit && K->ne[1] % FATTN_KQ_STRIDE == 0;
  
      if constexpr (DV == 512) {
author	Johannes Gäßler <redacted>
	Thu, 19 Feb 2026 11:42:58 +0000 (12:42 +0100)
committer	Georgi Gerganov <redacted>
	Fri, 27 Feb 2026 18:57:58 +0000 (20:57 +0200)