git.djapps.eu Git - pkg/ggml/sources/whisper.cpp/commitdiff
CUDA: only allocate FA tmp buffer if needed (llama/18564)
author Johannes Gäßler <redacted>
Sat, 3 Jan 2026 12:55:53 +0000 (13:55 +0100)
committer Georgi Gerganov <redacted>
Wed, 14 Jan 2026 07:11:59 +0000 (09:11 +0200)
ggml/src/ggml-cuda/fattn-common.cuh

index fa4e87ee47b267d93af0cdd4e02ca8c9356ba006..09c19429ad8fa7a175076cc67c385673b3bca908 100644 (file)
@@ -918,7 +918,9 @@ void launch_fattn(
         blocks_num.y = 1;
         blocks_num.z = 1;
 
-        dst_tmp_meta.alloc(((size_t) blocks_num.x) * ncols * (2 + DV/2));
+        if (ntiles_total % blocks_num.x != 0) { // Fixup is only needed if the SMs work on fractional tiles.
+            dst_tmp_meta.alloc((size_t(blocks_num.x) * ncols * (2 + DV/2)));
+        }
     } else {
         const int ntiles_KQ = (K->ne[1] + nbatch_fa - 1) / nbatch_fa; // Max. number of parallel blocks limited by tensor size.