From: Johannes Gäßler
Date: Thu, 19 Feb 2026 11:42:58 +0000 (+0100)
Subject: CUDA: fix kernel selection logic for tile FA (#19686)
X-Git-Tag: gguf-v0.18.0~69
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=c78e682245f856ab5cfc2ffc0f8c20e8e12f163f;p=pkg%2Fggml%2Fsources%2Fllama.cpp

CUDA: fix kernel selection logic for tile FA (#19686)

* CUDA: fix kernel selection logic for tile FA

* add comment
---

diff --git a/ggml/src/ggml-cuda/fattn-tile.cuh b/ggml/src/ggml-cuda/fattn-tile.cuh
index b6db58228..f3fa80ab2 100644
--- a/ggml/src/ggml-cuda/fattn-tile.cuh
+++ b/ggml/src/ggml-cuda/fattn-tile.cuh
@@ -1186,8 +1186,10 @@ static void launch_fattn_tile_switch_ncols2(ggml_backend_cuda_context & ctx, ggm
     GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
     const int gqa_ratio = Q->ne[2] / K->ne[2];
 
+    // On NVIDIA (Pascal and older) the GQA optimizations seem to be detrimental in some cases.
+    // However, for DKQ == 576, DV == 512 only the kernel variant with GQA optimizations is implemented.
     const bool nvidia = GGML_CUDA_CC_IS_NVIDIA(ggml_cuda_info().devices[ggml_cuda_get_device()].cc);
-    const int gqa_limit = nvidia && gqa_ratio <= 4 ? 16 : INT_MAX;
+    const int gqa_limit = nvidia && gqa_ratio <= 4 && DV <= 256 ? 16 : INT_MAX;
     const bool use_gqa_opt = mask && max_bias == 0.0f && Q->ne[1] <= gqa_limit && K->ne[1] % FATTN_KQ_STRIDE == 0;
 
     if constexpr (DV == 512) {