metal : adjust extra size for FA buffer to avoid reallocations (llama/18545)

author Georgi Gerganov <redacted>
Fri, 2 Jan 2026 17:02:18 +0000 (19:02 +0200)
committer Georgi Gerganov <redacted>
Wed, 14 Jan 2026 07:11:59 +0000 (09:11 +0200)
diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp

index acf2aa918479152e2884a8caa3ee41ac2f9ac1bd..a50b12b6f3b85a91f54c8f3e62ab94d0932688b9 100644 (file)
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -2181,7 +2181,11 @@ size_t ggml_metal_op_flash_attn_ext_extra_pad(const ggml_tensor * op) {
  
      const bool has_mask = op->src[3] != nullptr;
  
-    if (ggml_metal_op_flash_attn_ext_use_vec(op)) {
+    // note: the non-vec kernel requires more extra memory, so always reserve for it
+    GGML_ASSERT(OP_FLASH_ATTN_EXT_NCPSG >= OP_FLASH_ATTN_EXT_VEC_NCPSG);
+
+    //if (ggml_metal_op_flash_attn_ext_use_vec(op)) {
+    if (false) {
          // note: always reserve the padding space to avoid graph reallocations
          //const bool has_kvpad = ne11 % OP_FLASH_ATTN_EXT_VEC_NCPSG != 0;
          const bool has_kvpad = true;
author	Georgi Gerganov <redacted>
	Fri, 2 Jan 2026 17:02:18 +0000 (19:02 +0200)
committer	Georgi Gerganov <redacted>
	Wed, 14 Jan 2026 07:11:59 +0000 (09:11 +0200)