CUDA: fix alignment check for FA (llama/19023)

author Johannes Gäßler <redacted>

Thu, 22 Jan 2026 19:39:25 +0000 (20:39 +0100)

committer Georgi Gerganov <redacted>

Fri, 30 Jan 2026 11:49:29 +0000 (13:49 +0200)
author Johannes Gäßler <redacted>
Thu, 22 Jan 2026 19:39:25 +0000 (20:39 +0100)
committer Georgi Gerganov <redacted>
Fri, 30 Jan 2026 11:49:29 +0000 (13:49 +0200)
diff --git a/src/ggml-cuda/fattn.cu b/src/ggml-cuda/fattn.cu

index 80c3bfbc271ad381d7f6dcacf2502bcff28502ec..87f07a2f9386743c381d47bbc2ae95272ccf81a5 100644 (file)
--- a/src/ggml-cuda/fattn.cu
+++ b/src/ggml-cuda/fattn.cu
@@ -46,7 +46,7 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con
      //     are put into the template specialization without GQA optimizations.
      bool use_gqa_opt = mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;
      for (const ggml_tensor * t : {Q, K, V, mask}) {
-        if (t == nullptr) {
+        if (t == nullptr || ggml_is_quantized(t->type)) {
              continue;
          }
          for (size_t i = 1; i < GGML_MAX_DIMS; ++i) {
@@ -236,7 +236,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const
      // The kernel versions without this optimization are also used for ALiBi, if there is no mask, or if the KV cache is not padded,
      bool gqa_opt_applies = gqa_ratio % 2 == 0 && mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;
      for (const ggml_tensor * t : {Q, K, V, mask}) {
-        if (t == nullptr) {
+        if (t == nullptr || ggml_is_quantized(t->type)) {
              continue;
          }
          for (size_t i = 1; i < GGML_MAX_DIMS; ++i) {
author	Johannes Gäßler <redacted>
	Thu, 22 Jan 2026 19:39:25 +0000 (20:39 +0100)
committer	Georgi Gerganov <redacted>
	Fri, 30 Jan 2026 11:49:29 +0000 (13:49 +0200)