#define SOFTMAX_FTZ_THRESHOLD -20.0f // Softmax exp. of values smaller than this are flushed to zero to avoid NaNs.
// log(2) = 0.6931. By adding this to the KQ maximum used for the softmax, the numerical range representable
-// by the VKQ accumulators is effectively being shifted up by a factor of 8.
+// by the VKQ accumulators is effectively being shifted up by a factor of 2.
// This reduces issues with numerical overflow but also causes larger values to be flushed to zero.
// However, as the output from FlashAttention will usually be used as an input for a matrix multiplication, this should be negligible.
-#define FATTN_KQ_MAX_OFFSET 0.6931f
+// Still, the value range should be shifted as much as necessary but as little as possible.
+// The macro on the following line shifts it by a factor of 2**3=8, as was needed to fix https://github.com/ggml-org/llama.cpp/issues/18606 .
+#define FATTN_KQ_MAX_OFFSET (3.0f*0.6931f)
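// Illustrative sketch only, not part of this change: how the two thresholds above would typically
// interact when computing a single un-normalized softmax term. The function and variable names
// below (fattn_softmax_term_sketch, kq, kq_max) are placeholders and do not correspond to the
// actual kernel code.
static __device__ __forceinline__ float fattn_softmax_term_sketch(const float kq, const float kq_max) {
    // Subtracting the offset on top of the running maximum scales every exp() by 2**-3 = 1/8,
    // giving the VKQ accumulators 8x more headroom before they overflow.
    const float diff = kq - (kq_max + FATTN_KQ_MAX_OFFSET);
    const float val  = expf(diff);
    // Terms far below the maximum are flushed to zero to avoid NaNs (see SOFTMAX_FTZ_THRESHOLD above).
    return diff <= SOFTMAX_FTZ_THRESHOLD ? 0.0f : val;
}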
typedef void (* fattn_kernel_t)(
const char * __restrict__ Q,