From: Georgi Gerganov <redacted>
Date: Fri, 2 Jan 2026 17:02:18 +0000 (+0200)
Subject: metal : adjust extra size for FA buffer to avoid reallocations (#18545)
X-Git-Tag: upstream/0.0.7721~108
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=f38de163419d9381464b9185bca55c54207b3e43;p=pkg%2Fggml%2Fsources%2Fllama.cpp

metal : adjust extra size for FA buffer to avoid reallocations (#18545)
---

diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index acf2aa918..a50b12b6f 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -2181,7 +2181,11 @@ size_t ggml_metal_op_flash_attn_ext_extra_pad(const ggml_tensor * op) {
 
     const bool has_mask = op->src[3] != nullptr;
 
-    if (ggml_metal_op_flash_attn_ext_use_vec(op)) {
+    // note: the non-vec kernel requires more extra memory, so always reserve for it
+    GGML_ASSERT(OP_FLASH_ATTN_EXT_NCPSG >= OP_FLASH_ATTN_EXT_VEC_NCPSG);
+
+    //if (ggml_metal_op_flash_attn_ext_use_vec(op)) {
+    if (false) {
         // note: always reserve the padding space to avoid graph reallocations
         //const bool has_kvpad = ne11 % OP_FLASH_ATTN_EXT_VEC_NCPSG != 0;
         const bool has_kvpad = true;