vulkan: fix fp16 Flash Attention on Windows AMD RDNA2 and below (#19921)

author Ruben Ortlam <redacted>

Thu, 26 Feb 2026 18:11:04 +0000 (19:11 +0100)

committer GitHub <redacted>

Thu, 26 Feb 2026 18:11:04 +0000 (19:11 +0100)
author Ruben Ortlam <redacted>
Thu, 26 Feb 2026 18:11:04 +0000 (19:11 +0100)
committer GitHub <redacted>
Thu, 26 Feb 2026 18:11:04 +0000 (19:11 +0100)
diff --git a/ggml/src/ggml-vulkan/ggml-vulkan.cpp b/ggml/src/ggml-vulkan/ggml-vulkan.cpp

index a1149e606e405dc523c747586b3ff73f7e3801f7..0fae68628b67117e55fc65a51689b52b8f8438a4 100644 (file)
--- a/ggml/src/ggml-vulkan/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan/ggml-vulkan.cpp
@@ -624,8 +624,6 @@ struct vk_device_struct {
      // floor(log2(maxComputeWorkGroupInvocations))
      uint32_t max_workgroup_size_log2 {};
  
-    bool flash_attention_fp16;
-
      bool coopmat_support;
      bool coopmat_acc_f32_support {};
      bool coopmat_acc_f16_support {};
@@ -2978,11 +2976,15 @@ static vk_fa_tuning_params get_fa_tuning_params(const vk_device& device, uint32_
      }
  }
  
-static vk_fa_pipeline_state get_fa_pipeline_state(const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool aligned, bool f32acc,
+static vk_fa_pipeline_state get_fa_pipeline_state(const vk_device& device, const vk_fa_tuning_params& params, uint32_t hsk, uint32_t hsv, bool aligned, bool f32acc,
                                                    bool use_mask, bool use_mask_opt, bool use_logit_softcap) {
+    const bool old_amd_windows = device->vendor_id == VK_VENDOR_ID_AMD && device->driver_id == vk::DriverId::eAmdProprietary &&
+                                 (device->architecture == AMD_GCN || device->architecture == AMD_RDNA1 || device->architecture == AMD_RDNA2);
+
      uint32_t flags = (use_mask_opt      ? 1 : 0) |
                       (use_mask          ? 2 : 0) |
-                     (use_logit_softcap ? 4 : 0);
+                     (use_logit_softcap ? 4 : 0) |
+                     (old_amd_windows   ? 8 : 0);
  
      const uint32_t subgroup_size = params.disable_subgroups ? 0 : params.subgroup_size;
  
@@ -3384,7 +3386,7 @@ static void ggml_vk_load_shaders(vk_device& device) {
              } \
          }
  
-    if (device->flash_attention_fp16) {
+    if (device->fp16) {
          CREATE_FA(GGML_TYPE_F32, f32, FA_SCALAR, )
          CREATE_FA(GGML_TYPE_F16, f16, FA_SCALAR, )
          CREATE_FA(GGML_TYPE_Q4_0, q4_0, FA_SCALAR, )
@@ -5423,10 +5425,6 @@ static vk_device ggml_vk_get_device(size_t idx) {
              device->mmvq_mode = 1;
          }
  
-        // Driver issues with older AMD GPUs on Windows, see https://github.com/ggml-org/llama.cpp/pull/19625#issuecomment-3940840613
-        const bool is_amd_proprietary_gcn = device->vendor_id == VK_VENDOR_ID_AMD && device->architecture == AMD_GCN && device->driver_id == vk::DriverId::eAmdProprietary;
-        device->flash_attention_fp16 = device->fp16 && !is_amd_proprietary_gcn;
-
          return device;
      }
  
@@ -8567,7 +8565,7 @@ static bool ggml_vk_flash_attn_scalar_shmem_support(const vk_device& device, con
      const uint32_t Br = params.block_rows;
      const uint32_t Bc = params.block_cols;
  
-    const uint32_t float_type_size = device->flash_attention_fp16 ? sizeof(ggml_fp16_t) : sizeof(float);
+    const uint32_t float_type_size = device->fp16 ? sizeof(ggml_fp16_t) : sizeof(float);
  
      // tmpsh is overestimated slightly
      const uint32_t tmpsh = wg_size * sizeof(float);
@@ -8690,7 +8688,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
      uint32_t workgroups_y = (uint32_t)neq2;
      uint32_t workgroups_z = (uint32_t)neq3;
  
-    const bool f32acc = !ctx->device->flash_attention_fp16 || dst->op_params[3] == GGML_PREC_F32;
+    const bool f32acc = !ctx->device->fp16 || dst->op_params[3] == GGML_PREC_F32;
  
      // For scalar/coopmat1 FA, we can use the "large" size to accommodate qga.
      // For coopmat2 FA, we always use the small size (which is still pretty large for gqa).
@@ -8745,7 +8743,7 @@ static void ggml_vk_flash_attn(ggml_backend_vk_context * ctx, vk_context& subctx
  
      // Only use mask opt when the mask is fairly large. This hasn't been tuned extensively.
      bool use_mask_opt = mask && nem1 >= 32 && nem0 * nem1 > 32768;
-    vk_fa_pipeline_state fa_pipeline_state = get_fa_pipeline_state(tuning_params, HSK, HSV, aligned, f32acc,
+    vk_fa_pipeline_state fa_pipeline_state = get_fa_pipeline_state(ctx->device, tuning_params, HSK, HSV, aligned, f32acc,
                                                                     mask != nullptr, use_mask_opt, logit_softcap != 0);
  
      vk_pipeline pipeline = nullptr;
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp

index 135ab1ad625538b990ba729c014b70a20df59a40..ec48f5b11528576018e89c50f1e28cefec1974ec 100644 (file)
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp
@@ -465,7 +465,14 @@ void main() {
  
              if (SubGroupSize > 0) {
                  [[unroll]] for (uint s = D_split; s < SubGroupSize; s *= 2) {
-                    Of[r][d] += subgroupShuffleXor(Of[r][d], s);
+                    if (!OLD_AMD_WINDOWS) {
+                        Of[r][d] += subgroupShuffleXor(Of[r][d], s);
+                    } else {
+                        // Something about f16vec4 subgroupShuffleXor is broken on AMD Windows RDNA2 and below.
+                        // Shuffle full vec4 as workaround.
+                        // See https://github.com/ggml-org/llama.cpp/issues/19881#issuecomment-3958643697
+                        Of[r][d] += FLOAT_TYPEV4(subgroupShuffleXor(vec4(Of[r][d]), s));
+                    }
                  }
                  if (row_split == 1) {
                      barrier();
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl

index d444542b5336fb6923985a69c98f10fae31e4d3a..172d38f034e62a025af040a36671e250dee7b1c1 100644 (file)
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl
@@ -14,9 +14,10 @@ layout (constant_id =  9) const uint32_t SHMEM_STAGING = 0;
  layout (constant_id = 10) const uint32_t Flags = 0;
  layout (constant_id = 11) const uint32_t LIMIT_OCCUPANCY_SHMEM = 0;
  
-const bool USE_MASK_OPT  = (Flags & 1) != 0;
-const bool MASK_ENABLE   = (Flags & 2) != 0;
-const bool LOGIT_SOFTCAP = (Flags & 4) != 0;
+const bool USE_MASK_OPT    = (Flags & 1) != 0;
+const bool MASK_ENABLE     = (Flags & 2) != 0;
+const bool LOGIT_SOFTCAP   = (Flags & 4) != 0;
+const bool OLD_AMD_WINDOWS = (Flags & 8) != 0;
  
  // Round up head sizes to a multiple of 16, for coopmat1/coopmat2 paths
  const uint32_t HSK_pad = (HSK + 15) & ~15;
author	Ruben Ortlam <redacted>
	Thu, 26 Feb 2026 18:11:04 +0000 (19:11 +0100)
committer	GitHub <redacted>
	Thu, 26 Feb 2026 18:11:04 +0000 (19:11 +0100)
ggml/src/ggml-vulkan/ggml-vulkan.cpp		patch | blob | history
ggml/src/ggml-vulkan/vulkan-shaders/flash_attn.comp		patch | blob | history
ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl		patch | blob | history