From: Georgi Gerganov
Date: Sun, 14 Sep 2025 19:02:32 +0000 (+0300)
Subject: metal : remove memory pools (#15966)
X-Git-Tag: upstream/0.0.6527~54
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=9dcd200d57bc6f05a59a6a8df361d5d183af4124;p=pkg%2Fggml%2Fsources%2Fllama.cpp

metal : remove memory pools (#15966)

* metal : remove mem pool usage

ggml-ci

* metal : remove mem pool implementation

ggml-ci

* metal : take into account the actual allocated memory of the tensor

ggml-ci

* cont : use ggml_backend_buft_get_alloc_size

ggml-ci

* cont : improve, comments

ggml-ci

* cont : add functions for the extra tensor sizes

* metal : add comments

ggml-ci

* metal : implement .get_alloc_size for the rest of the buffer types

ggml-ci

* metal : remove ggml_metal_heap

ggml-ci
---

diff --git a/ggml/src/ggml-metal/ggml-metal-common.cpp b/ggml/src/ggml-metal/ggml-metal-common.cpp index 6a869ff2..cb39e5b2 100644 --- a/ggml/src/ggml-metal/ggml-metal-common.cpp +++ b/ggml/src/ggml-metal/ggml-metal-common.cpp @@ -1,9 +1,12 @@ #include "ggml-metal-common.h" #include "ggml-impl.h" +#include "ggml-backend-impl.h" #include +// represents a memory range (i.e. an interval from a starting address p0 to an ending address p1 in a given buffer pb) +// the type indicates whether it is a source range (i.e. ops read data from it) or a destination range (i.e. ops write data to it) struct ggml_mem_range { uint64_t pb; // buffer id @@ -36,8 +39,8 @@ void ggml_mem_ranges_reset(ggml_mem_ranges * mrs) { mrs->ranges.clear(); } -static bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, ggml_mem_range mrp) { - mrs->ranges.push_back(mrp); +static bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, ggml_mem_range mr) { + mrs->ranges.push_back(mr); return true; } @@ -48,20 +51,24 @@ static ggml_mem_range ggml_mem_range_from_tensor(const ggml_tensor * tensor, ggm GGML_ASSERT(!tensor->view_src); - ggml_mem_range mrp; + ggml_mem_range mr; if (tensor->buffer) { - // when the tensor is allocated, use the actual memory address range of the buffer - mrp = { + // when the tensor is allocated, use the actual memory address range in the buffer + // + // take the actual allocated size with ggml_backend_buft_get_alloc_size() + // this can be larger than the tensor size if the buffer type allocates extra memory + // ref: https://github.com/ggml-org/llama.cpp/pull/15966 + mr = { /*.pb =*/ (uint64_t) tensor->buffer, /*.p0 =*/ (uint64_t) tensor->data, - /*.p1 =*/ (uint64_t) tensor->data + ggml_nbytes(tensor), + /*.p1 =*/ (uint64_t) tensor->data + ggml_backend_buft_get_alloc_size(tensor->buffer->buft, tensor), /*.pt =*/ pt, }; } else { - // otherwise, the tensor ptr is used as an unique id of the memory ranges + // otherwise, the pointer address is used as an unique id of the memory ranges // that the tensor will be using when it is allocated - mrp = { + mr = { /*.pb =*/ (uint64_t) tensor, /*.p0 =*/ 0, // /*.p1 =*/ 1024, // [0, 1024) is a dummy range, not used @@ -69,7 +76,7 @@ static ggml_mem_range ggml_mem_range_from_tensor(const ggml_tensor * tensor, ggm }; }; - return mrp; + return mr; } static ggml_mem_range ggml_mem_range_from_tensor_src(const ggml_tensor * tensor) { @@ -83,25 +90,25 @@ static ggml_mem_range ggml_mem_range_from_tensor_dst(const ggml_tensor * tensor) static bool ggml_mem_ranges_add_src(ggml_mem_ranges * mrs, const ggml_tensor * tensor) { GGML_ASSERT(tensor); - ggml_mem_range mrp = ggml_mem_range_from_tensor_src(tensor); + ggml_mem_range mr = ggml_mem_range_from_tensor_src(tensor); if (mrs->debug > 2) { - GGML_LOG_DEBUG("%s: add src range
buf=%lld, [%lld, %lld)\n", __func__, mrp.pb, mrp.p0, mrp.p1); + GGML_LOG_DEBUG("%s: add src range buf=%lld, [%lld, %lld)\n", __func__, mr.pb, mr.p0, mr.p1); } - return ggml_mem_ranges_add(mrs, mrp); + return ggml_mem_ranges_add(mrs, mr); } static bool ggml_mem_ranges_add_dst(ggml_mem_ranges * mrs, const ggml_tensor * tensor) { GGML_ASSERT(tensor); - ggml_mem_range mrp = ggml_mem_range_from_tensor_dst(tensor); + ggml_mem_range mr = ggml_mem_range_from_tensor_dst(tensor); if (mrs->debug > 2) { - GGML_LOG_DEBUG("%s: add dst range buf=%lld, [%lld, %lld)\n", __func__, mrp.pb, mrp.p0, mrp.p1); + GGML_LOG_DEBUG("%s: add dst range buf=%lld, [%lld, %lld)\n", __func__, mr.pb, mr.p0, mr.p1); } - return ggml_mem_ranges_add(mrs, mrp); + return ggml_mem_ranges_add(mrs, mr); } bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, const ggml_tensor * tensor) { @@ -114,24 +121,26 @@ bool ggml_mem_ranges_add(ggml_mem_ranges * mrs, const ggml_tensor * tensor) { return ggml_mem_ranges_add_dst(mrs, tensor); } -static bool ggml_mem_ranges_check(const ggml_mem_ranges * mrs, ggml_mem_range mrp) { +static bool ggml_mem_ranges_check(const ggml_mem_ranges * mrs, ggml_mem_range mr) { for (size_t i = 0; i < mrs->ranges.size(); i++) { const auto & cmp = mrs->ranges[i]; - if (mrp.pb != cmp.pb) { + // two memory ranges cannot intersect if they are in different buffers + if (mr.pb != cmp.pb) { continue; } - if (mrp.pt == MEM_RANGE_TYPE_SRC && cmp.pt == MEM_RANGE_TYPE_SRC) { + // intersecting source ranges are allowed + if (mr.pt == MEM_RANGE_TYPE_SRC && cmp.pt == MEM_RANGE_TYPE_SRC) { continue; } - if (mrp.p0 < cmp.p1 && mrp.p1 >= cmp.p0) { + if (mr.p0 < cmp.p1 && mr.p1 >= cmp.p0) { if (mrs->debug > 2) { GGML_LOG_DEBUG("%s: the %s range buf=%lld, [%lld, %lld) overlaps with a previous %s range buf=%lld, [%lld, %lld)\n", __func__, - mrp.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst", - mrp.pb, mrp.p0, mrp.p1, + mr.pt == MEM_RANGE_TYPE_SRC ? "src" : "dst", + mr.pb, mr.p0, mr.p1, cmp.pt == MEM_RANGE_TYPE_SRC ? 
"src" : "dst", cmp.pb, cmp.p0, cmp.p1); } @@ -146,9 +155,9 @@ static bool ggml_mem_ranges_check(const ggml_mem_ranges * mrs, ggml_mem_range mr static bool ggml_mem_ranges_check_src(const ggml_mem_ranges * mrs, const ggml_tensor * tensor) { GGML_ASSERT(tensor); - ggml_mem_range mrp = ggml_mem_range_from_tensor_src(tensor); + ggml_mem_range mr = ggml_mem_range_from_tensor_src(tensor); - const bool res = ggml_mem_ranges_check(mrs, mrp); + const bool res = ggml_mem_ranges_check(mrs, mr); return res; } @@ -156,9 +165,9 @@ static bool ggml_mem_ranges_check_src(const ggml_mem_ranges * mrs, const ggml_te static bool ggml_mem_ranges_check_dst(const ggml_mem_ranges * mrs, const ggml_tensor * tensor) { GGML_ASSERT(tensor); - ggml_mem_range mrp = ggml_mem_range_from_tensor_dst(tensor); + ggml_mem_range mr = ggml_mem_range_from_tensor_dst(tensor); - const bool res = ggml_mem_ranges_check(mrs, mrp); + const bool res = ggml_mem_ranges_check(mrs, mr); return res; } @@ -222,6 +231,7 @@ static std::vector ggml_metal_graph_optimize_reorder(const std::vectorsrc[i]) { @@ -290,7 +300,10 @@ static std::vector ggml_metal_graph_optimize_reorder(const std::vector used(n, false); + // the memory ranges for the set of currently concurrent nodes ggml_mem_ranges * mrs0 = ggml_mem_ranges_init(0); + + // the memory ranges for the set of nodes that haven't been processed yet, when looking forward for a node to reorder ggml_mem_ranges * mrs1 = ggml_mem_ranges_init(0); for (int i0 = 0; i0 < n; i0++) { @@ -329,7 +342,7 @@ static std::vector ggml_metal_graph_optimize_reorder(const std::vector order(nodes.size()); diff --git a/ggml/src/ggml-metal/ggml-metal.m b/ggml/src/ggml-metal/ggml-metal.m index 13f9de29..2243c174 100644 --- a/ggml/src/ggml-metal/ggml-metal.m +++ b/ggml/src/ggml-metal/ggml-metal.m @@ -532,261 +532,9 @@ enum ggml_metal_kernel_type { GGML_METAL_KERNEL_TYPE_COUNT }; -// -// ggml_metal_heap -// - -struct ggml_metal_heap { - // number of times the heap was unused - int n_unused; - - // total number of buffer allocations in this heap across all computes - int64_t n_alloc; - - // current offset in the heap - we reset this after each node in order to reuse the memory - size_t offs; - - // the currently allocated MTLBuffer objects in this heap - id obj; - - NSMutableArray * bufs; -}; - -static struct ggml_metal_heap * ggml_metal_heap_init(id device, size_t size) { - struct ggml_metal_heap * heap = calloc(1, sizeof(struct ggml_metal_heap)); - - MTLHeapDescriptor * desc = [[MTLHeapDescriptor alloc] init]; - desc.storageMode = MTLStorageModePrivate; - desc.cpuCacheMode = MTLCPUCacheModeDefaultCache; - desc.type = MTLHeapTypePlacement; - desc.size = size; - - heap->n_unused = 0; - heap->n_alloc = 0; - - heap->obj = [device newHeapWithDescriptor:desc]; - if (!heap->obj) { - GGML_LOG_ERROR("%s: error: failed to create MTLHeap with size %zu\n", __func__, size); - - free(heap); - - return false; - } - - [desc release]; - - heap->bufs = [[NSMutableArray alloc] init]; - - return heap; -} - -static void ggml_metal_heap_reset(struct ggml_metal_heap * heap) { - heap->offs = 0; - - // count how many graph computes the heap ended up being unused - if ([heap->bufs count] > 0) { - heap->n_unused = 0; - } else { - heap->n_unused++; - } - - for (id buf in heap->bufs) { - [buf release]; - } - [heap->bufs removeAllObjects]; - - // tell the OS that it can reuse this memory if needed - // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc - [heap->obj setPurgeableState:MTLPurgeableStateVolatile]; -} - 
-static void ggml_metal_heap_free(struct ggml_metal_heap * heap) { - if (heap == nil) { - return; - } - - ggml_metal_heap_reset(heap); - - [heap->obj release]; - [heap->bufs release]; - - free(heap); -} - -@interface ggml_metal_heap_ptr : NSObject - -@property (nonatomic, assign) struct ggml_metal_heap * data; - -@end - -@implementation ggml_metal_heap_ptr -@end - -// -// ggml_metal_mem_pool [TAG_MEM_POOL_REMOVE] -// - -struct ggml_metal_mem_pool { - id device; - - int n_heaps; // total number of heaps ever created (including those that were removed) - - NSMutableArray * heaps; - NSMutableArray * heaps_to_remove; -}; - -static struct ggml_metal_mem_pool * ggml_metal_mem_pool_init(void) { - struct ggml_metal_mem_pool * mem_pool = calloc(1, sizeof(struct ggml_metal_mem_pool)); - - mem_pool->n_heaps = 0; - - mem_pool->heaps = [[NSMutableArray alloc] init]; - mem_pool->heaps_to_remove = [[NSMutableArray alloc] init]; - - return mem_pool; -} - -static void ggml_metal_mem_pool_free(struct ggml_metal_mem_pool * mem_pool) { - GGML_LOG_DEBUG("%s: freeing memory pool, num heaps = %zu (total = %d)\n", __func__, [mem_pool->heaps count], mem_pool->n_heaps); - - size_t size_all = 0; - size_t size_cur = 0; - - for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { - GGML_LOG_DEBUG("%s: heap: %p\n", __func__, (void *) ptr.data); - GGML_LOG_DEBUG("%s: n_alloc: %" PRId64 "\n", __func__, ptr.data->n_alloc); - GGML_LOG_DEBUG("%s: n_unused: %d\n", __func__, ptr.data->n_unused); - GGML_LOG_DEBUG("%s: size: %.2f MiB\n", __func__, [ptr.data->obj size] / 1024.0 / 1024.0); - GGML_LOG_DEBUG("%s: bufs: %zu\n", __func__, [ptr.data->bufs count]); - - if ([ptr.data->bufs count] > 0) { - size_cur += [ptr.data->obj size]; - } - size_all += [ptr.data->obj size]; - - ggml_metal_heap_free(ptr.data); - [ptr release]; - } - [mem_pool->heaps release]; - [mem_pool->heaps_to_remove release]; - - if (size_all > 0) { - GGML_LOG_DEBUG("%s: size_all: %.2f MiB\n", __func__, size_all / 1024.0 / 1024.0); - GGML_LOG_DEBUG("%s: size_cur: %.2f MiB\n", __func__, size_cur / 1024.0 / 1024.0); - } - - free(mem_pool); -} - -static void ggml_metal_mem_pool_reset(struct ggml_metal_mem_pool * mem_pool) { - for (NSUInteger i = 0; i < [mem_pool->heaps count]; i++) { - ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:i]; - - struct ggml_metal_heap * heap = ptr.data; - ggml_metal_heap_reset(heap); - - // if the heap hasn't been used for a while, remove it - if (heap->n_unused >= 128) { - [mem_pool->heaps_to_remove addObject:@(i)]; - } - } - - if (mem_pool->heaps_to_remove.count > 0) { - // remove in reverse order - for (NSUInteger i = [mem_pool->heaps_to_remove count] - 1; ; --i) { - NSUInteger index = [[mem_pool->heaps_to_remove objectAtIndex:i] intValue]; - ggml_metal_heap_ptr * ptr = [mem_pool->heaps objectAtIndex:index]; - - struct ggml_metal_heap * heap = ptr.data; - ggml_metal_heap_free(heap); - - [mem_pool->heaps removeObjectAtIndex:index]; - [ptr release]; - - if (i == 0) { - break; - } - } - - [mem_pool->heaps_to_remove removeAllObjects]; - } -} - -static void ggml_metal_mem_pool_clear(struct ggml_metal_mem_pool * mem_pool) { - for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { - ptr.data->offs = 0; - } -} - -static id ggml_metal_mem_pool_alloc(struct ggml_metal_mem_pool * mem_pool, size_t size) { - const size_t alignment = 256; - - const size_t size_aligned = GGML_PAD(size, alignment); - - // try one of the existing heaps - for (ggml_metal_heap_ptr * ptr in mem_pool->heaps) { - struct ggml_metal_heap * heap = ptr.data; - if 
(heap->offs + size_aligned <= [heap->obj size]) { - // if this is the first buffer in the heap for the current command buffer, tell the OS that - // it cannot free the memory used by the heap - // ref: https://developer.apple.com/documentation/metal/mtlpurgeablestate?language=objc - if ([heap->bufs count] == 0) { - [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; - } - - id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; - if (buf == nil) { - GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned); - return nil; - } - - heap->n_alloc++; - heap->offs += size_aligned; - - [heap->bufs addObject:buf]; - - return buf; - } - } - - // create a new heap that can fit this buffer - ggml_metal_heap_ptr * heap_ptr = [ggml_metal_heap_ptr new]; - - struct ggml_metal_heap * heap = ggml_metal_heap_init(mem_pool->device, size_aligned); - if (heap == NULL) { - GGML_LOG_ERROR("%s: error: failed to create heap of size %zu\n", __func__, size_aligned); - return NULL; - } - - //GGML_LOG_DEBUG("%s: creating new heap of size %zu, got %zu\n", __func__, size_aligned, [heap->obj size]); - - heap_ptr.data = heap; - ggml_metal_heap_reset(heap); - - [heap->obj setPurgeableState:MTLPurgeableStateNonVolatile]; - id buf = [heap->obj newBufferWithLength:size_aligned options:MTLResourceStorageModePrivate offset:heap->offs]; - if (buf == nil) { - GGML_LOG_ERROR("%s: error: failed to create MTLBuffer with size %zu\n", __func__, size_aligned); - return NULL; - } - - heap->n_alloc++; - heap->offs += size_aligned; - - [heap->bufs addObject:buf]; - - [mem_pool->heaps addObject:heap_ptr]; - mem_pool->n_heaps++; - - return buf; -} - struct ggml_metal_command_buffer { id obj; - // each command buffer has a memory pool from which it can allocate temporary buffers during the compute - struct ggml_metal_mem_pool * mem_pool; - // used to enable concurrent execution of ops in the command buffers struct ggml_mem_ranges * mem_ranges; }; @@ -1103,9 +851,6 @@ static struct ggml_backend_metal_context * ggml_metal_init(ggml_backend_dev_t de for (int i = 0; i < GGML_METAL_MAX_COMMAND_BUFFERS; ++i) { ctx->cmd_bufs[i].obj = nil; - ctx->cmd_bufs[i].mem_pool = ggml_metal_mem_pool_init(); - ctx->cmd_bufs[i].mem_pool->device = device; - if (ctx_dev->use_concurrency) { ctx->cmd_bufs[i].mem_ranges = ggml_mem_ranges_init(ctx_dev->debug_graph); } @@ -1510,6 +1255,52 @@ static id ggml_metal_compile_kernel(ggml_backend_t back return res; } +// tokens per expert +static size_t ggml_metal_mul_mat_id_extra_tpe(const struct ggml_tensor * op) { + assert(op->op == GGML_OP_MUL_MAT_ID); + + const int64_t ne02 = op->src[0]->ne[2]; // n_expert + + return ggml_type_size(GGML_TYPE_I32)*ne02; +} + +// id map [n_tokens, n_expert] +static size_t ggml_metal_mul_mat_id_extra_ids(const struct ggml_tensor * op) { + assert(op->op == GGML_OP_MUL_MAT_ID); + + const int64_t ne02 = op->src[0]->ne[2]; // n_expert + const int64_t ne21 = op->src[2]->ne[1]; // n_token + + return ggml_type_size(GGML_TYPE_I32)*ne02*ne21; +} + +// return true if we should use the FA vector kernel for this op +static bool ggml_metal_flash_attn_ext_use_vec(const struct ggml_tensor * op) { + assert(op->op == GGML_OP_FLASH_ATTN_EXT); + + const int64_t ne00 = op->src[0]->ne[0]; // head size + const int64_t ne01 = op->src[0]->ne[1]; // batch size + + // use vec kernel if the batch size is small and if the head size is supported + return (ne01 < 20) && (ne00 % 32 == 0); +} + +static size_t 
ggml_metal_flash_attn_ext_extra_tmp(const struct ggml_tensor * op) { + assert(op->op == GGML_OP_FLASH_ATTN_EXT); + + const int64_t nwg = 32; + + const int64_t ne01 = op->src[0]->ne[1]; + const int64_t ne02 = op->src[0]->ne[2]; + const int64_t ne03 = op->src[0]->ne[3]; + const int64_t ne20 = op->src[2]->ne[0]; + + // temp buffer for writing the results from each workgroup + // - ne20: the size of the Value head + // - + 2: the S and M values for each intermediate result + return ggml_type_size(GGML_TYPE_F32)*(ne01*ne02*ne03*nwg*(ne20 + 2)); +} + static id ggml_metal_get_pipeline_flash_attn_ext( ggml_backend_t backend, struct ggml_tensor * op, bool has_mask, @@ -1760,8 +1551,6 @@ static void ggml_metal_free(struct ggml_backend_metal_context * ctx) { [ctx->cmd_bufs[i].obj release]; } - ggml_metal_mem_pool_free(ctx->cmd_bufs[i].mem_pool); - if (ctx->cmd_bufs[i].mem_ranges) { ggml_mem_ranges_free(ctx->cmd_bufs[i].mem_ranges); } @@ -2127,8 +1916,6 @@ struct ggml_metal_encode_context { id encoder; - struct ggml_metal_mem_pool * mem_pool; - struct ggml_mem_ranges * mem_ranges; }; @@ -2165,8 +1952,6 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in id encoder = ctx_enc->encoder; - struct ggml_metal_mem_pool * mem_pool = ctx_enc->mem_pool; - struct ggml_backend_metal_context * ctx = backend->context; struct ggml_backend_metal_device_context * ctx_dev = backend->device->context; @@ -2207,8 +1992,6 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in GGML_ABORT("unsupported op"); } - ggml_metal_mem_pool_clear(mem_pool); - const int64_t ne00 = src0 ? src0->ne[0] : 0; const int64_t ne01 = src0 ? src0->ne[1] : 0; const int64_t ne02 = src0 ? src0->ne[2] : 0; @@ -2522,7 +2305,6 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in /*.nb02 =*/ nb02, /*.nb11 =*/ nb11, /*.nb21 =*/ nb21, - }; [encoder setComputePipelineState:pipeline]; @@ -3167,54 +2949,8 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in const float m0 = powf(2.0f, -(max_bias ) / n_head_log2); const float m1 = powf(2.0f, -(max_bias / 2.0f) / n_head_log2); -// use this branch to test the ggml_metal_mem_pool functionality -#if 0 - // cpy to tmp buffer in MTLHeap - - id h_src0 = h_src0 = ggml_metal_mem_pool_alloc(mem_pool, ggml_nbytes(src0)); - if (!h_src0) { - GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, ggml_nbytes(src0)); - return 0; - } - - offs_src0 = 0; - - ggml_metal_kargs_cpy args_cpy = { - /*.ne00 =*/ ne00, - /*.ne01 =*/ ne01, - /*.ne02 =*/ ne02, - /*.ne03 =*/ ne03, - /*.nb00 =*/ nb00, - /*.nb01 =*/ nb01, - /*.nb02 =*/ nb02, - /*.nb03 =*/ nb03, - /*.ne0 =*/ ne00, - /*.ne1 =*/ ne01, - /*.ne2 =*/ ne02, - /*.ne3 =*/ ne03, - /*.nb0 =*/ nb00, - /*.nb1 =*/ nb01, - /*.nb2 =*/ nb02, - /*.nb3 =*/ nb03, - }; - - if (src0->type == GGML_TYPE_F16) { - [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F16_F16].pipeline]; - } else { - [encoder setComputePipelineState:ctx->kernels[GGML_METAL_KERNEL_TYPE_CPY_F32_F32].pipeline]; - } - [encoder setBytes:&args_cpy length:sizeof(args_cpy) atIndex:0]; - [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; - [encoder setBuffer:h_src0 offset:0 atIndex:2]; - - GGML_ASSERT(ne00 % ggml_blck_size(src0->type) == 0); - int nth_cpy = MIN(1024, ne00 / ggml_blck_size(src0->type)); - - [encoder dispatchThreadgroups:MTLSizeMake(ne01, ne02, ne03) threadsPerThreadgroup:MTLSizeMake(nth_cpy, 1, 1)]; - -#else id h_src0 = id_src0; 
-#endif + // softmax ggml_metal_kargs_soft_max args = { @@ -4093,28 +3829,9 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in default: break; } - // TODO: using mem pool allocations with enabled concurrency is not safe because the mem pool - // reuses buffers. this can result in 2 concurrent MUL_MAT_ID ops using the same mem pool buffer. - // so we add this extra barrier to prevent the race. - // the correct solution is to remove mem pools and then remove this barrier [TAG_MEM_POOL_REMOVE] - ggml_metal_encode_concurrency_reset(ctx_enc); - - // tokens per expert - const size_t s_tpe = ggml_type_size(GGML_TYPE_I32)*ne02; - id h_tpe = ggml_metal_mem_pool_alloc(mem_pool, s_tpe); - if (!h_tpe) { - GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_tpe); - return 0; - } - - // id map - // [n_tokens, n_expert] - const size_t s_ids = ggml_type_size(GGML_TYPE_I32)*ne21*ne02; - id h_ids = ggml_metal_mem_pool_alloc(mem_pool, s_ids); - if (!h_ids) { - GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_ids); - return 0; - } + // extra buffers for intermediate id mapping + size_t offs_tpe = offs_dst + ggml_nbytes(dst); + size_t offs_ids = offs_tpe + ggml_metal_mul_mat_id_extra_tpe(dst); { ggml_metal_kargs_mul_mm_id_map0 args = { @@ -4152,8 +3869,8 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in [encoder setComputePipelineState:pipeline]; [encoder setBytes:&args length:sizeof(args) atIndex:0]; [encoder setBuffer:id_src2 offset:offs_src2 atIndex:1]; - [encoder setBuffer: h_tpe offset:0 atIndex:2]; - [encoder setBuffer: h_ids offset:0 atIndex:3]; + [encoder setBuffer:id_dst offset:offs_tpe atIndex:2]; + [encoder setBuffer:id_dst offset:offs_ids atIndex:3]; [encoder setThreadgroupMemoryLength:smem atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(ne02, 1, 1)]; @@ -4215,8 +3932,8 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in [encoder setBytes:&args length:sizeof(args) atIndex:0]; [encoder setBuffer:id_src0 offset:offs_src0 atIndex:1]; [encoder setBuffer:id_src1 offset:offs_src1 atIndex:2]; - [encoder setBuffer: h_tpe offset:0 atIndex:3]; - [encoder setBuffer: h_ids offset:0 atIndex:4]; + [encoder setBuffer:id_dst offset:offs_tpe atIndex:3]; + [encoder setBuffer:id_dst offset:offs_ids atIndex:4]; [encoder setBuffer:id_dst offset:offs_dst atIndex:5]; [encoder setThreadgroupMemoryLength:8192 atIndex:0]; @@ -5306,8 +5023,7 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in GGML_ASSERT(ne01 < 65536); - // use non-vec kernel if the batch size is large or if the vec-kernel is not supported for this head size - if (ne01 >= 20 || (ne00 % 32 != 0)) { + if (!ggml_metal_flash_attn_ext_use_vec(dst)) { // half8x8 kernel const int64_t nqptg = 8; // queries per threadgroup !! sync with kernel template arguments !! const int64_t ncpsg = 64; // cache values per simdgroup !! sync with kernel template arguments !! 
@@ -5532,34 +5248,20 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in GGML_ASSERT(ne01*ne02*ne03 == ne1*ne2*ne3); GGML_ASSERT(ne1*ne2*ne3 <= (1u << 31)); - // using mem pool allocations with enabled concurrency is not safe [TAG_MEM_POOL_REMOVE] - // still, we assume that concurrent FA won't happen before we do the refactor - //ggml_metal_encode_concurrency_reset(ctx_enc); - - const int32_t nrows = ne1*ne2*ne3; - - // temp buffer for writing the results from each workgroup - // - ne20: the size of the head vector - // - + 2: the S and M values for each intermediate result - const size_t s_tmp = ggml_type_size(GGML_TYPE_F32)*(nrows*nwg*(ne20 + 2)); - id h_tmp = ggml_metal_mem_pool_alloc(mem_pool, s_tmp); - if (!h_tmp) { - GGML_LOG_ERROR("%s: failed to allocate buffer from memory pool, size = %zu\n", __func__, s_tmp); - return 0; - } - - //printf("ne01 = %d, ne02 = %d, ne03 = %d, ne20 = %d\n", ne01, ne02, ne03, ne20); - //printf("needed memory: %.3f MiB\n", (float) (ne01*ne02*ne03*ne20*sizeof(float))/1024.0f/1024.0f); - - [encoder setBuffer:h_tmp offset:0 atIndex:6]; + // write the results from each workgroup into a temp buffer + const size_t offs_tmp = offs_dst + ggml_nbytes(dst); + [encoder setBuffer:id_dst offset:offs_tmp atIndex:6]; [encoder setThreadgroupMemoryLength:smem atIndex:0]; [encoder dispatchThreadgroups:MTLSizeMake((ne01 + nqptg - 1)/nqptg, ne02, ne03*nwg) threadsPerThreadgroup:MTLSizeMake(32, nsg, 1)]; + // sync the 2 kernels ggml_metal_encode_concurrency_reset(ctx_enc); // reduce the results from the workgroups { + const int32_t nrows = ne1*ne2*ne3; + ggml_metal_kargs_flash_attn_ext_vec_reduce args0 = { nrows, }; @@ -5568,7 +5270,7 @@ static int ggml_metal_encode_node(struct ggml_metal_encode_context * ctx_enc, in [encoder setComputePipelineState:pipeline0]; [encoder setBytes:&args0 length:sizeof(args0) atIndex:0]; - [encoder setBuffer:h_tmp offset:0 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_tmp atIndex:1]; [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; //printf("ne1 = %d, ne2 = %d, ne3 = %d, ne20 = %d\n", ne1, ne2, ne3, ne20); @@ -5895,12 +5597,7 @@ static enum ggml_status ggml_metal_graph_compute( // the main thread commits the first few commands immediately // cmd_buf[n_cb] { - // cannot use commandBufferWithUnretainedReferences because the buffers from the memory pool can get destroyed - // TODO: when the memory pools are removed, we can again use commandBufferWithUnretainedReferences - // https://github.com/ggml-org/llama.cpp/pull/15832#discussion_r2334215009 - // [TAG_MEM_POOL_REMOVE] - //id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; - id cmd_buf = [ctx->queue commandBuffer]; + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; [cmd_buf retain]; if (ctx->cmd_bufs[n_cb].obj) { @@ -5919,8 +5616,7 @@ static enum ggml_status ggml_metal_graph_compute( // prepare the rest of the command buffers asynchronously (optional) // cmd_buf[0.. 
n_cb) for (int cb_idx = 0; cb_idx < n_cb; ++cb_idx) { - //id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; - id cmd_buf = [ctx->queue commandBuffer]; + id cmd_buf = [ctx->queue commandBufferWithUnretainedReferences]; [cmd_buf retain]; if (ctx->cmd_bufs[cb_idx].obj) { @@ -6377,6 +6073,31 @@ static ggml_backend_buffer_t ggml_backend_metal_buffer_type_alloc_buffer(ggml_ba return ggml_backend_buffer_init(buft, buf_i, ctx, size); } +static size_t ggml_backend_metal_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) { + size_t res = ggml_nbytes(tensor); + + // some operations require additional memory for fleeting data: + switch (tensor->op) { + case GGML_OP_MUL_MAT_ID: + { + res += ggml_metal_mul_mat_id_extra_tpe(tensor); + res += ggml_metal_mul_mat_id_extra_ids(tensor); + } break; + case GGML_OP_FLASH_ATTN_EXT: + { + if (ggml_metal_flash_attn_ext_use_vec(tensor)) { + res += ggml_metal_flash_attn_ext_extra_tmp(tensor); + } + } break; + default: + break; + } + + return res; + + GGML_UNUSED(buft); +} + // default (shared) buffer type static const char * ggml_backend_metal_buffer_type_shared_get_name(ggml_backend_buffer_type_t buft) { @@ -6401,6 +6122,10 @@ static size_t ggml_backend_metal_buffer_type_shared_get_max_size(ggml_backend_bu return max_size; } +static size_t ggml_backend_metal_buffer_type_shared_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) { + return ggml_backend_metal_buffer_type_get_alloc_size(buft, tensor); +} + static bool ggml_backend_metal_buffer_type_shared_is_host(ggml_backend_buffer_type_t buft) { return false; @@ -6414,7 +6139,7 @@ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_shared(void) { /* .alloc_buffer = */ ggml_backend_metal_buffer_type_shared_alloc_buffer, /* .get_alignment = */ ggml_backend_metal_buffer_type_shared_get_alignment, /* .get_max_size = */ ggml_backend_metal_buffer_type_shared_get_max_size, - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .get_alloc_size = */ ggml_backend_metal_buffer_type_shared_get_alloc_size, /* .is_host = */ ggml_backend_metal_buffer_type_shared_is_host, }, /* .device = */ &g_ggml_backend_metal_device, @@ -6448,6 +6173,10 @@ static size_t ggml_backend_metal_buffer_type_private_get_max_size(ggml_backend_b return max_size; } +static size_t ggml_backend_metal_buffer_type_private_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) { + return ggml_backend_metal_buffer_type_get_alloc_size(buft, tensor); +} + static bool ggml_backend_metal_buffer_type_private_is_host(ggml_backend_buffer_type_t buft) { return false; @@ -6461,7 +6190,7 @@ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_private(void) { /* .alloc_buffer = */ ggml_backend_metal_buffer_type_private_alloc_buffer, /* .get_alignment = */ ggml_backend_metal_buffer_type_private_get_alignment, /* .get_max_size = */ ggml_backend_metal_buffer_type_private_get_max_size, - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .get_alloc_size = */ ggml_backend_metal_buffer_type_private_get_alloc_size, /* .is_host = */ ggml_backend_metal_buffer_type_private_is_host, }, /* .device = */ &g_ggml_backend_metal_device, @@ -6496,6 +6225,10 @@ static size_t ggml_backend_metal_buffer_type_mapped_get_max_size(ggml_backend_bu return max_size; } +static size_t ggml_backend_metal_buffer_type_mapped_get_alloc_size(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor) { + return 
ggml_backend_metal_buffer_type_get_alloc_size(buft, tensor); +} + static bool ggml_backend_metal_buffer_type_mapped_is_host(ggml_backend_buffer_type_t buft) { return false; @@ -6511,7 +6244,7 @@ static ggml_backend_buffer_type_t ggml_backend_metal_buffer_type_mapped(void) { /* .alloc_buffer = */ ggml_backend_metal_buffer_type_mapped_alloc_buffer, /* .get_alignment = */ ggml_backend_metal_buffer_type_mapped_get_alignment, /* .get_max_size = */ ggml_backend_metal_buffer_type_mapped_get_max_size, - /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes + /* .get_alloc_size = */ ggml_backend_metal_buffer_type_mapped_get_alloc_size, /* .is_host = */ ggml_backend_metal_buffer_type_mapped_is_host, }, /* .device = */ &g_ggml_backend_metal_device, @@ -6711,11 +6444,8 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { const int n_nodes_per_cb = ctx->n_nodes_per_cb; id cmd_buf = ctx->cmd_bufs[cb_idx].obj; - struct ggml_metal_mem_pool * mem_pool = ctx->cmd_bufs[cb_idx].mem_pool; struct ggml_mem_ranges * mem_ranges = ctx->cmd_bufs[cb_idx].mem_ranges; - ggml_metal_mem_pool_reset(mem_pool); - if (mem_ranges) { ggml_mem_ranges_reset(mem_ranges); } @@ -6743,7 +6473,6 @@ static void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) { struct ggml_metal_encode_context ctx_enc = { /*.backend =*/ backend, /*.encoder =*/ encoder, - /*.mem_pool =*/ mem_pool, /*.mem_ranges =*/ mem_ranges, };
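
Note (editor's illustration, not part of the commit): the sketch below is a minimal, standalone mirror of the allocation scheme this patch switches to. It is plain C with made-up names (toy_tensor, toy_get_alloc_size, extra_*) rather than the real ggml API; it only mimics how ggml_backend_metal_buffer_type_get_alloc_size() reserves op-specific scratch past ggml_nbytes(dst), and how the encoder then addresses that scratch inside the same dst buffer (offs_tpe = offs_dst + ggml_nbytes(dst), offs_ids = offs_tpe + extra_tpe), instead of drawing temporary MTLBuffers from a heap-backed memory pool.

// standalone sketch, assuming simplified stand-in types; not the ggml API
#include <stdint.h>
#include <stdio.h>

enum toy_op { TOY_OP_MUL_MAT_ID, TOY_OP_FLASH_ATTN_EXT, TOY_OP_OTHER };

struct toy_tensor {
    enum toy_op op;
    int64_t     ne[4];      // dst dims (stand-in; the real code uses ggml_nbytes(dst))
    int64_t     src0_ne[4]; // src0 dims: n_expert in ne[2] for MUL_MAT_ID, Q shape for FA
    int64_t     src2_ne[4]; // src2 dims: n_token in ne[1] for MUL_MAT_ID, V head size in ne[0] for FA
};

// tokens-per-expert counters: one int32 per expert (mirrors ggml_metal_mul_mat_id_extra_tpe)
size_t extra_tpe(const struct toy_tensor * t) { return sizeof(int32_t) * t->src0_ne[2]; }

// id map [n_token, n_expert] of int32 (mirrors ggml_metal_mul_mat_id_extra_ids)
size_t extra_ids(const struct toy_tensor * t) { return sizeof(int32_t) * t->src0_ne[2] * t->src2_ne[1]; }

// partial FA results: nrows * nwg * (V head size + the S and M values) floats
// (mirrors ggml_metal_flash_attn_ext_extra_tmp with nwg = 32)
size_t extra_fa_tmp(const struct toy_tensor * t) {
    const int64_t nwg   = 32;
    const int64_t nrows = t->src0_ne[1] * t->src0_ne[2] * t->src0_ne[3];
    return sizeof(float) * (size_t)(nrows * nwg * (t->src2_ne[0] + 2));
}

// what the buffer type reports for the tensor: nominal size plus op-specific scratch
size_t toy_get_alloc_size(const struct toy_tensor * t, size_t nbytes_dst) {
    size_t res = nbytes_dst;
    switch (t->op) {
        case TOY_OP_MUL_MAT_ID:     res += extra_tpe(t) + extra_ids(t); break;
        case TOY_OP_FLASH_ATTN_EXT: res += extra_fa_tmp(t);             break; // real code: only for the vec kernel
        default: break;
    }
    return res;
}

int main(void) {
    // MUL_MAT_ID with 8 experts and 4 tokens; pretend dst itself needs 1024 bytes
    struct toy_tensor t = { TOY_OP_MUL_MAT_ID, {0}, {0, 0, 8, 0}, {0, 4, 0, 0} };
    const size_t nbytes_dst = 1024;

    // the encoder addresses the scratch regions inside the same buffer as dst:
    const size_t offs_tpe = nbytes_dst;               // offs_dst + ggml_nbytes(dst)
    const size_t offs_ids = offs_tpe + extra_tpe(&t); // right after the tpe counters

    printf("alloc = %zu bytes (tpe at +%zu, ids at +%zu)\n",
           toy_get_alloc_size(&t, nbytes_dst), offs_tpe, offs_ids);
    return 0;
}

Because the scratch now lives inside the tensor's own allocation, the per-command-buffer memory pools (and the extra concurrency barriers that guarded their buffer reuse) can be removed, which is also why the backend can return to commandBufferWithUnretainedReferences in this diff.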