From: Georgi Gerganov
Date: Fri, 2 Jan 2026 17:02:18 +0000 (+0200)
Subject: metal : adjust extra size for FA buffer to avoid reallocations (llama/18545)
X-Git-Tag: upstream/1.8.3~52
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=b99c911c49cec6d2e974a081bd935dfa62e56195;p=pkg%2Fggml%2Fsources%2Fwhisper.cpp

metal : adjust extra size for FA buffer to avoid reallocations (llama/18545)
---

diff --git a/ggml/src/ggml-metal/ggml-metal-ops.cpp b/ggml/src/ggml-metal/ggml-metal-ops.cpp
index acf2aa91..a50b12b6 100644
--- a/ggml/src/ggml-metal/ggml-metal-ops.cpp
+++ b/ggml/src/ggml-metal/ggml-metal-ops.cpp
@@ -2181,7 +2181,11 @@ size_t ggml_metal_op_flash_attn_ext_extra_pad(const ggml_tensor * op) {
 
     const bool has_mask = op->src[3] != nullptr;
 
-    if (ggml_metal_op_flash_attn_ext_use_vec(op)) {
+    // note: the non-vec kernel requires more extra memory, so always reserve for it
+    GGML_ASSERT(OP_FLASH_ATTN_EXT_NCPSG >= OP_FLASH_ATTN_EXT_VEC_NCPSG);
+
+    //if (ggml_metal_op_flash_attn_ext_use_vec(op)) {
+    if (false) {
         // note: always reserve the padding space to avoid graph reallocations
         //const bool has_kvpad = ne11 % OP_FLASH_ATTN_EXT_VEC_NCPSG != 0;
         const bool has_kvpad = true;