// are put into the template specialization without GQA optimizations.
bool use_gqa_opt = mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;
for (const ggml_tensor * t : {Q, K, V, mask}) {
-   if (t == nullptr) {
+   if (t == nullptr || ggml_is_quantized(t->type)) {
        continue;
    }
    for (size_t i = 1; i < GGML_MAX_DIMS; ++i) {
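As a rough, standalone illustration of the use_gqa_opt condition above (not part of the patch; FATTN_KQ_STRIDE is assumed to be 256 here, and the variable names are local to the example): the GQA-optimized specialization requires a mask, no ALiBi bias (max_bias == 0.0f), and a KV length K->ne[1] that is a multiple of FATTN_KQ_STRIDE.

#include <cstdio>
#include <initializer_list>

int main() {
    const int   FATTN_KQ_STRIDE = 256;  // assumed value for the CUDA backend
    const float max_bias        = 0.0f; // 0.0f means ALiBi is disabled
    const bool  mask            = true; // a KQ mask is provided

    for (int kv_len : {4096, 4100}) {   // padded vs. unpadded stand-in for K->ne[1]
        const bool use_gqa_opt = mask && max_bias == 0.0f && kv_len % FATTN_KQ_STRIDE == 0;
        printf("kv_len=%d -> use_gqa_opt=%d\n", kv_len, use_gqa_opt);
    }
    return 0;
}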
// The kernel versions without this optimization are also used for ALiBi, if there is no mask, or if the KV cache is not padded.
bool gqa_opt_applies = gqa_ratio % 2 == 0 && mask && max_bias == 0.0f && K->ne[1] % FATTN_KQ_STRIDE == 0;
for (const ggml_tensor * t : {Q, K, V, mask}) {
-   if (t == nullptr) {
+   if (t == nullptr || ggml_is_quantized(t->type)) {
        continue;
    }
    for (size_t i = 1; i < GGML_MAX_DIMS; ++i) {
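For context on what the added ggml_is_quantized() guard buys: the per-dimension loop that follows (its body is not shown in this hunk) only makes sense for tensors whose elements are individually addressable, so null tensors (a missing mask) and block-quantized tensors are skipped up front. Below is a hypothetical, self-contained sketch of that pattern; ggml_is_quantized(), GGML_MAX_DIMS, and the nb[] byte strides are real ggml API, while the function name and the 16-byte alignment requirement are made up for illustration.

#include "ggml.h"

// Hypothetical helper, not the actual fattn.cu logic.
static bool fattn_strides_ok(const ggml_tensor * Q, const ggml_tensor * K,
                             const ggml_tensor * V, const ggml_tensor * mask) {
    for (const ggml_tensor * t : {Q, K, V, mask}) {
        if (t == nullptr || ggml_is_quantized(t->type)) {
            continue; // nothing to check for a missing mask or a block-quantized tensor
        }
        for (size_t i = 1; i < GGML_MAX_DIMS; ++i) {
            if (t->nb[i] % 16 != 0) { // assumed alignment requirement for higher dims
                return false;
            }
        }
    }
    return true;
}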