vulkan: For coopmat2 FA, use fp16 accumulators for the final result (#19376)

author Jeff Bolz <redacted>

Fri, 6 Feb 2026 08:15:13 +0000 (02:15 -0600)

committer GitHub <redacted>

Fri, 6 Feb 2026 08:15:13 +0000 (09:15 +0100)
author Jeff Bolz <redacted>
Fri, 6 Feb 2026 08:15:13 +0000 (02:15 -0600)
committer GitHub <redacted>
Fri, 6 Feb 2026 08:15:13 +0000 (09:15 +0100)
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl

index 74005cffb3f689ed4b4e00b5bec42ca1e2d30ade..4142c1e6eaac4cb0edccfce1d9c46306316d00d0 100644 (file)
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl
@@ -240,3 +240,7 @@ void init_indices()
      // and breaking the alignment detection.
      m_stride = (p.gqa_ratio > 1) ? (p.gqa_ratio >> 16) : KV;
  }
+
+// Bias applied to softmax to stay in fp16 range.
+// Based on ggml-cuda issue https://github.com/ggml-org/llama.cpp/issues/18606
+const float FATTN_KQ_MAX_OFFSET = 3.0f*0.6931f;
diff --git a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp

index b07c21f6e55e4623b9ed2b3573966b4684d2ca44..39f0c4d23b9ffa3b342fdff4f26cf414fa70374e 100644 (file)
--- a/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
+++ b/ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp
@@ -117,7 +117,7 @@ void main() {
      Qf16 = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSK_pad, gl_MatrixUseA>(Q);
      Qf16 *= float16_t(p.scale);
  
-    coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O = coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(0);
+    coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(0);
  
      coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> L, M;
  
@@ -223,6 +223,8 @@ void main() {
  
          coopMatReduceNV(rowmax, S, gl_CooperativeMatrixReduceRowNV, maxReduce);
  
+        rowmax += coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator>(FATTN_KQ_MAX_OFFSET);
+
          coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, Bc, gl_MatrixUseAccumulator> Mold = M;
  
          // M = max(rowmax, Mold)
@@ -265,11 +267,8 @@ void main() {
          // resize eM by using smear/reduce
          coopMatReduceNV(eMdiag, eM, gl_CooperativeMatrixReduceRowNV, smearReduce);
  
-        // multiply with fp16 accumulation, then add to O.
-        coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> PV = coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(0);
-        PV = coopMatMulAdd(P_A, V, PV);
-
-        O = eMdiag * O + coopmat<ACC_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(PV);
+        O *= coopmat<float16_t, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(eMdiag);
+        O = coopMatMulAdd(P_A, V, O);
      }
  
      // If there is split_k, then the split_k resolve shader does the final
@@ -311,7 +310,7 @@ void main() {
              if (sink > Mr[i]) {
                  ms = exp(Mr[i] - sink);
  
-                O[i] *= ms;
+                O[i] *= float16_t(ms);
              } else {
                  vs = exp(sink - Mr[i]);
              }
@@ -325,15 +324,16 @@ void main() {
          Ldiag[k] = (Ldiag[k] == 0.0) ? ACC_TYPE(0.0) : (ACC_TYPE(1.0) / Ldiag[k]);
      }
  
-    O = Ldiag*O;
+    coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(O);
+
+    O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(Ldiag)*O_D;
  
  #if defined(ACC_TYPE_MAX)
-    [[unroll]] for (uint i = 0; i < O.length(); ++i) { O[i] = clamp(O[i], -ACC_TYPE_MAX, ACC_TYPE_MAX); }
+    [[unroll]] for (uint i = 0; i < O_D.length(); ++i) { O_D[i] = clamp(O_D[i], D_TYPE(-ACC_TYPE_MAX), D_TYPE(ACC_TYPE_MAX)); }
  #endif
  
      uint32_t o_offset = gqa_iq1*p.ne1*HSV + iq3*p.ne2*p.ne1*HSV;
  
-    coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator> O_D = coopmat<D_TYPE, gl_ScopeWorkgroup, Br, HSV_pad, gl_MatrixUseAccumulator>(O);
      if (p.gqa_ratio > 1) {
          coopMatPerElementNV(O_D, O_D, perElemOpGqaStore, o_offset, iq2, N);
      } else {
author	Jeff Bolz <redacted>
	Fri, 6 Feb 2026 08:15:13 +0000 (02:15 -0600)
committer	GitHub <redacted>
	Fri, 6 Feb 2026 08:15:13 +0000 (09:15 +0100)
ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_base.glsl		patch | blob | history
ggml/src/ggml-vulkan/vulkan-shaders/flash_attn_cm2.comp		patch | blob | history