From: Johannes Gäßler
Date: Tue, 27 Jan 2026 13:28:56 +0000 (+0100)
Subject: CUDA: tune GLM 4.7 Flash FA kernel selection logic (llama/19097)
X-Git-Tag: v0.9.6~24
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=0dff0d5a9cd6ffca1b54ad17a0194e21e1e676a0;p=pkg%2Fggml%2Fsources%2Fggml

CUDA: tune GLM 4.7 Flash FA kernel selection logic (llama/19097)
---

diff --git a/src/ggml-cuda/fattn.cu b/src/ggml-cuda/fattn.cu
index 2f5dbd13..b061fdf9 100644
--- a/src/ggml-cuda/fattn.cu
+++ b/src/ggml-cuda/fattn.cu
@@ -148,6 +148,10 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
             const int gqa_ratio = Q->ne[2] / K->ne[2];
             if (gqa_ratio == 20) { // GLM 4.7 Flash
                 if (cc >= GGML_CUDA_CC_BLACKWELL) {
+                    if (Q->ne[1] <= 4 && K->ne[1] >= 65536) {
+                        ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
+                        break;
+                    }
                     ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 4>(ctx, dst);
                     break;
                 }
@@ -161,6 +165,10 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
                 }
                 if (cc >= GGML_CUDA_CC_TURING) {
                     if (Q->ne[1] <= 4) {
+                        if (K->ne[1] <= 16384) {
+                            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
+                            break;
+                        }
                         ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 32>(ctx, dst);
                         break;
                     }