CUDA: tune GLM 4.7 Flash FA kernel selection logic (#19097)

author Johannes Gäßler <redacted>

Tue, 27 Jan 2026 13:28:56 +0000 (14:28 +0100)

committer GitHub <redacted>

Tue, 27 Jan 2026 13:28:56 +0000 (14:28 +0100)
author Johannes Gäßler <redacted>
Tue, 27 Jan 2026 13:28:56 +0000 (14:28 +0100)
committer GitHub <redacted>
Tue, 27 Jan 2026 13:28:56 +0000 (14:28 +0100)
diff --git a/ggml/src/ggml-cuda/fattn.cu b/ggml/src/ggml-cuda/fattn.cu
index 2f5dbd13a395e3add1acc786a974374a43e9530c..b061fdf9a249ec4dd8d969f846b3fa37f9151c7e 100644
--- a/ggml/src/ggml-cuda/fattn.cu
+++ b/ggml/src/ggml-cuda/fattn.cu
@@ -148,6 +148,10 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
              const int gqa_ratio = Q->ne[2] / K->ne[2];
              if (gqa_ratio == 20) { // GLM 4.7 Flash
                  if (cc >= GGML_CUDA_CC_BLACKWELL) {
+                    if (Q->ne[1] <= 4 && K->ne[1] >= 65536) {
+                        ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
+                        break;
+                    }
                      ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 4>(ctx, dst);
                      break;
                  }
@@ -161,6 +165,10 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
                  }
                  if (cc >= GGML_CUDA_CC_TURING) {
                      if (Q->ne[1] <= 4) {
+                        if (K->ne[1] <= 16384) {
+                            ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
+                            break;
+                        }
                          ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 32>(ctx, dst);
                          break;
                      }
author	Johannes Gäßler <redacted>
	Tue, 27 Jan 2026 13:28:56 +0000 (14:28 +0100)
committer	GitHub <redacted>
	Tue, 27 Jan 2026 13:28:56 +0000 (14:28 +0100)