From: Johannes Gäßler Date: Thu, 22 Jan 2026 19:39:25 +0000 (+0100) Subject: CUDA: fix alignment check for FA (llama/19023) X-Git-Tag: v0.9.6~41 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=544f15d98337dd2b5d5633af88dacdd28ce2d46a;p=pkg%2Fggml%2Fsources%2Fggml CUDA: fix alignment check for FA (llama/19023) --- diff --git a/src/ggml-cuda/fattn.cu b/src/ggml-cuda/fattn.cu index 80c3bfbc..87f07a2f 100644 --- a/src/ggml-cuda/fattn.cu +++ b/src/ggml-cuda/fattn.cu @@ -46,7 +46,7 @@ static void ggml_cuda_flash_attn_ext_mma_f16_switch_ncols2(ggml_backend_cuda_con // are put into the template specialization without GQA optimizations. bool use_gqa_opt = mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0; for (const ggml_tensor * t : {Q, K, V, mask}) { - if (t == nullptr) { + if (t == nullptr || ggml_is_quantized(t->type)) { continue; } for (size_t i = 1; i < GGML_MAX_DIMS; ++i) { @@ -236,7 +236,7 @@ static best_fattn_kernel ggml_cuda_get_best_fattn_kernel(const int device, const // The kernel versions without this optimization are also used for ALiBi, if there is no mask, or if the KV cache is not padded, bool gqa_opt_applies = gqa_ratio % 2 == 0 && mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0; for (const ggml_tensor * t : {Q, K, V, mask}) { - if (t == nullptr) { + if (t == nullptr || ggml_is_quantized(t->type)) { continue; } for (size_t i = 1; i < GGML_MAX_DIMS; ++i) {