const int nthreads = ggml_cuda_fattn_vec_get_nthreads_host(cc);
const int nwarps = nthreads / WARP_SIZE;
fattn_kernel_t fattn_kernel = flash_attn_ext_vec<D, cols_per_block, type_K, type_V, use_logit_softcap>;
- constexpr bool need_f16_K = false;
- constexpr bool need_f16_V = false;
+ const bool need_f16_K = type_K == GGML_TYPE_F16;
+ const bool need_f16_V = type_V == GGML_TYPE_F16;
constexpr size_t nbytes_shared = 0;
launch_fattn<D, cols_per_block, 1>(ctx, dst, fattn_kernel, nwarps, nbytes_shared, D, need_f16_K, need_f16_V, false);
}
// Host-side entry point for one vec flash-attention case, operating on the op described by dst.
// Visible here: dst itself is treated as the KQV output tensor, Q is taken from dst->src[0], and
// the logit softcap is read from dst's op_params.
// NOTE(review): only part of the body is visible in this excerpt; how logit_softcap is used
// afterwards cannot be confirmed from here.
void ggml_cuda_flash_attn_ext_vec_case(ggml_backend_cuda_context & ctx, ggml_tensor * dst) {
const ggml_tensor * KQV = dst;
const ggml_tensor * Q = dst->src[0];
- const ggml_tensor * K = dst->src[1];
- const ggml_tensor * V = dst->src[2];
-
- GGML_ASSERT(K->type == type_K);
- GGML_ASSERT(V->type == type_V);
// op_params is viewed as an array of floats and element [2] is copied out byte-wise.
// presumably memcpy is used instead of a direct dereference to avoid type punning - TODO confirm
float logit_softcap;
memcpy(&logit_softcap, (const float *) KQV->op_params + 2, sizeof(float));
}
}
-#define FATTN_VEC_CASE(D, type_K, type_V) \
- if (Q->ne[0] == (D) && K->type == (type_K) && V->type == (type_V)) { \
- ggml_cuda_flash_attn_ext_vec_case<D, type_K, type_V>(ctx, dst); \
- return; \
- } \
// FATTN_VEC_CASE(D, type_K, type_V): if Q->ne[0] equals D and both K and V have an accepted type,
// call ggml_cuda_flash_attn_ext_vec_case<D, type_K, type_V>(ctx, dst) and return from the enclosing
// function. A tensor type is accepted when it equals the requested type exactly, or when the tensor
// is GGML_TYPE_F32 and the requested type is GGML_TYPE_F16. Expects Q, K, V, ctx and dst to be in
// scope at the point of expansion.
+#define FATTN_VEC_CASE(D, type_K, type_V) \
+ { \
+ const bool type_K_okay = K->type == (type_K) || (K->type == GGML_TYPE_F32 && (type_K) == GGML_TYPE_F16); \
+ const bool type_V_okay = V->type == (type_V) || (V->type == GGML_TYPE_F32 && (type_V) == GGML_TYPE_F16); \
+ if (Q->ne[0] == (D) && type_K_okay && type_V_okay) { \
+ ggml_cuda_flash_attn_ext_vec_case<D, type_K, type_V>(ctx, dst); \
+ return; \
+ } \
+ } \
#define FATTN_VEC_CASES_ALL_D(type_K, type_V) \
FATTN_VEC_CASE( 64, type_K, type_V) \
#endif // GGML_CUDA_FA_ALL_QUANTS
switch (K->type) {
+ case GGML_TYPE_F32:
case GGML_TYPE_F16:
break;
case GGML_TYPE_Q4_1:
// If Turing tensor cores available, use them:
if (turing_mma_available(cc) && K->ne[1] % FATTN_KQ_STRIDE == 0 && Q->ne[0] != 40) {
if (can_use_vector_kernel) {
- if (K->type == GGML_TYPE_F16 && V->type == GGML_TYPE_F16) {
+ if (!ggml_is_quantized(K->type) && !ggml_is_quantized(V->type)) {
if (cc >= GGML_CUDA_CC_ADA_LOVELACE && Q->ne[1] == 1 && Q->ne[3] == 1 && !(gqa_ratio > 4 && K->ne[1] >= 8192)) {
return BEST_FATTN_KERNEL_VEC;
}
// If there are no tensor cores available, use the generic tile kernel:
if (can_use_vector_kernel) {
- if (K->type == GGML_TYPE_F16 && V->type == GGML_TYPE_F16) {
+ if (!ggml_is_quantized(K->type) && !ggml_is_quantized(V->type)) {
if (Q->ne[1] == 1) {
if (!gqa_opt_applies) {
return BEST_FATTN_KERNEL_VEC;