// floor(log2(maxComputeWorkGroupInvocations))
uint32_t max_workgroup_size_log2 {};
- bool flash_attention_fp16;
-
bool coopmat_support;
bool coopmat_acc_f32_support {};
bool coopmat_acc_f16_support {};
}
}
-static vk_fa_pipeline_state get_fa_pipeline_state(const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool aligned, bool f32acc,
+static vk_fa_pipeline_state get_fa_pipeline_state(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool aligned, bool f32acc,
bool use_mask, bool use_mask_opt, bool use_logit_softcap) {
+ const bool old_amd_windows = device->vendor_id == VK_VENDOR_ID_AMD && device->driver_id == vk::DriverId::eAmdProprietary &&
+ (device->architecture == AMD_GCN || device->architecture == AMD_RDNA1 || device->architecture == AMD_RDNA2);
+
uint32_t flags = (use_mask_opt ? 1 : 0) |
(use_mask ? 2 : 0) |
- (use_logit_softcap ? 4 : 0);
+ (use_logit_softcap ? 4 : 0) |
+ (old_amd_windows ? 8 : 0);
const uint32_t subgroup_size = params.disable_subgroups ? 0 : params.subgroup_size;
} \
}
- if (device->flash_attention_fp16) {
+ if (device->fp16) {
CREATE_FA(GGML_TYPE_F32, f32, FA_SCALAR, )
CREATE_FA(GGML_TYPE_F16, f16, FA_SCALAR, )
CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, )
device->mmvq_mode = 1;
}
- // Driver issues with older AMD GPUs on Windows, see https://github.com/ggml-org/llama.cpp/pull/19625#issuecomment-3940840613
- const bool is_amd_proprietary_gcn = device->vendor_id == VK_VENDOR_ID_AMD && device->architecture == AMD_GCN && device->driver_id == vk::DriverId::eAmdProprietary;
- device->flash_attention_fp16 = device->fp16 && !is_amd_proprietary_gcn;
-
return device;
}
const uint32_t Br = params.block_rows;
const uint32_t Bc = params.block_cols;
- const uint32_t float_type_size = device->flash_attention_fp16 ? sizeof(ggml_fp16_t) : sizeof(float);
+ const uint32_t float_type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float);
// tmpsh is overestimated slightly
const uint32_t tmpsh = wg_size * sizeof(float);
uint32_t workgroups_y = (uint32_t)neq2;
uint32_t workgroups_z = (uint32_t)neq3;
- const bool f32acc = !ctx->device->flash_attention_fp16 || dst->op_params[3] == GGML_PREC_F32;
+ const bool f32acc = !ctx->device->fp16 || dst->op_params[3] == GGML_PREC_F32;
// For scalar/coopmat1 FA, we can use the "large" size to accommodate qga.
// For coopmat2 FA, we always use the small size (which is still pretty large for gqa).
// Only use mask opt when the mask is fairly large. This hasn't been tuned extensively.
bool use_mask_opt = mask && nem1 >= 32 && nem0 * nem1 > 32768;
- vk_fa_pipeline_state fa_pipeline_state = get_fa_pipeline_state(tuning_params, HSK, HSV, aligned, f32acc,
+ vk_fa_pipeline_state fa_pipeline_state = get_fa_pipeline_state(ctx->device, tuning_params, HSK, HSV, aligned, f32acc,
mask != nullptr, use_mask_opt, logit_softcap != 0);
vk_pipeline pipeline = nullptr;