CUDA: tune GLM 4.7 Flash FA kernel selection logic (DGX Spark) (llama/19142)

author Georgi Gerganov <redacted>

Wed, 28 Jan 2026 07:15:11 +0000 (09:15 +0200)

committer Georgi Gerganov <redacted>

Fri, 30 Jan 2026 11:49:29 +0000 (13:49 +0200)
author Georgi Gerganov <redacted>
Wed, 28 Jan 2026 07:15:11 +0000 (09:15 +0200)
committer Georgi Gerganov <redacted>
Fri, 30 Jan 2026 11:49:29 +0000 (13:49 +0200)
diff --git a/src/ggml-cuda/common.cuh b/src/ggml-cuda/common.cuh

index 09a491a836ad58dd56bafe8431ab62bb8f95d722..3335f443aeb4106b940cdb720dabebf339ce124a 100644 (file)
--- a/src/ggml-cuda/common.cuh
+++ b/src/ggml-cuda/common.cuh
@@ -53,6 +53,7 @@
  // While BW spans CC 1000, 1100 & 1200, we are integrating Tensor Core instructions available to 1200 family, see
  // https://docs.nvidia.com/cutlass/media/docs/cpp/blackwell_functionality.html#blackwell-sm120-gemms
  #define GGML_CUDA_CC_BLACKWELL       1200
+#define GGML_CUDA_CC_DGX_SPARK       1210
  #define GGML_CUDA_CC_RUBIN           1300
  #define GGML_CUDA_CC_OFFSET_AMD      0x1000000
  #define GGML_CUDA_CC_OFFSET_MTHREADS 0x0100000
diff --git a/src/ggml-cuda/fattn.cu b/src/ggml-cuda/fattn.cu

index b061fdf9a249ec4dd8d969f846b3fa37f9151c7e..fe18ff6c7dcf2d61340449df29cf35a1445417db 100644 (file)
--- a/src/ggml-cuda/fattn.cu
+++ b/src/ggml-cuda/fattn.cu
@@ -147,6 +147,14 @@ static void ggml_cuda_flash_attn_ext_mma_f16(ggml_backend_cuda_context & ctx, gg
              GGML_ASSERT(Q->ne[2] % K->ne[2] == 0);
              const int gqa_ratio = Q->ne[2] / K->ne[2];
              if (gqa_ratio == 20) { // GLM 4.7 Flash
+                if (cc >= GGML_CUDA_CC_DGX_SPARK) {
+                    if (Q->ne[1] <= 8) {
+                        ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
+                        break;
+                    }
+                    ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 4>(ctx, dst);
+                    break;
+                }
                  if (cc >= GGML_CUDA_CC_BLACKWELL) {
                      if (Q->ne[1] <= 4 && K->ne[1] >= 65536) {
                          ggml_cuda_flash_attn_ext_mma_f16_switch_ncols1<576, 512, 16>(ctx, dst);
author	Georgi Gerganov <redacted>
	Wed, 28 Jan 2026 07:15:11 +0000 (09:15 +0200)
committer	Georgi Gerganov <redacted>
	Fri, 30 Jan 2026 11:49:29 +0000 (13:49 +0200)
src/ggml-cuda/common.cuh		patch | blob | history
src/ggml-cuda/fattn.cu		patch | blob | history