From: Johannes Gäßler
Date: Sat, 3 Jan 2026 12:55:53 +0000 (+0100)
Subject: CUDA: only allocate FA tmp buffer if needed (#18564)
X-Git-Tag: upstream/0.0.7721~102
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=0f2e42ca1d1d025e6c4cb4bffb78da8972dec17f;p=pkg%2Fggml%2Fsources%2Fllama.cpp

CUDA: only allocate FA tmp buffer if needed (#18564)
---

diff --git a/ggml/src/ggml-cuda/fattn-common.cuh b/ggml/src/ggml-cuda/fattn-common.cuh
index fa4e87ee..09c19429 100644
--- a/ggml/src/ggml-cuda/fattn-common.cuh
+++ b/ggml/src/ggml-cuda/fattn-common.cuh
@@ -918,7 +918,9 @@ void launch_fattn(
         blocks_num.y = 1;
         blocks_num.z = 1;
 
-        dst_tmp_meta.alloc(((size_t) blocks_num.x) * ncols * (2 + DV/2));
+        if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
+            dst_tmp_meta.alloc((size_t(blocks_num.x) * ncols * (2 + DV/2)));
+        }
     } else {
         const int ntiles_KQ = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by tensor size.
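
Editor's note: the sketch below is a minimal standalone illustration of the guard introduced by this patch, not the real launch_fattn code. It assumes the same divisibility reasoning as the diff: the temporary meta/fixup buffer is only needed when the total number of tiles does not split evenly across the launched blocks, so some blocks produce partial results that must be combined afterwards. The names alloc_tmp_meta and maybe_alloc_fixup are hypothetical stand-ins.

// Standalone C++ sketch (illustrative names, not llama.cpp API).
#include <cstddef>
#include <cstdio>

// Hypothetical stand-in for the pool allocation done by dst_tmp_meta.alloc().
static void alloc_tmp_meta(size_t nelements) {
    std::printf("allocating %zu temporary elements\n", nelements);
}

static void maybe_alloc_fixup(int ntiles_total, int nblocks, int ncols, int DV) {
    // Fixup is only needed if the blocks work on fractional tiles:
    // with an even split, every block writes its final result directly
    // and no temporary buffer is required.
    if (ntiles_total % nblocks != 0) {
        alloc_tmp_meta(size_t(nblocks) * ncols * (2 + DV/2));
    }
}

int main() {
    maybe_alloc_fixup(100, 32, 8, 128); // 100 % 32 != 0 -> buffer allocated
    maybe_alloc_fixup( 96, 32, 8, 128); //  96 % 32 == 0 -> no allocation
    return 0;
}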