struct hash_node hash_table[GGML_GRAPH_HASHTABLE_SIZE];
size_t max_size;
bool measure;
+ int parse_seq[GGML_MAX_NODES];
+ bool has_parse_seq;
#ifdef GGML_ALLOCATOR_DEBUG
struct ggml_tensor * allocated_tensors[1024];
alloc->n_free_blocks++;
}
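+// illustrative example (values are made up): given list = {0, 2, -1, 1, -1} and
+// n = 5, the -1 entries are skipped and parse_seq becomes {0, 2, 1}; the
+// allocator will then visit the graph nodes in that order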
+void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n) {
+ int pos = 0;
+ for (int i = 0; i < n; i++) {
+ if (list[i] != -1) {
+ alloc->parse_seq[pos] = list[i];
+ pos++;
+ }
+ }
+ alloc->has_parse_seq = true;
+}
+
void ggml_allocr_reset(struct ggml_allocr * alloc) {
alloc->n_free_blocks = 1;
size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
/*.hash_table = */ {{0}},
/*.max_size = */ 0,
/*.measure = */ false,
+ /*.parse_seq = */ {0},
+ /*.has_parse_seq = */ false,
#ifdef GGML_ALLOCATOR_DEBUG
/*.allocated_tensors = */ {0},
#endif
/*.hash_table = */ {{0}},
/*.max_size = */ 0,
/*.measure = */ true,
+ /*.parse_seq = */ {0},
+ /*.has_parse_seq = */ false,
#ifdef GGML_ALLOCATOR_DEBUG
/*.allocated_tensors = */ {0},
#endif
allocate_node(alloc, input);
}
}
- for (int i = 0; i < gf->n_nodes; i++) {
+ for (int ind = 0; ind < gf->n_nodes; ind++) {
+ int i;
+ if (alloc->has_parse_seq) {
+ i = alloc->parse_seq[ind];
+ } else {
+ i = ind;
+ }
struct ggml_tensor * node = gf->nodes[i];
// allocate parents (leafs)
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
+// tell the allocator to parse nodes following the order described in the list
+// you should call this if your graph is optimized to execute out-of-order
+GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, int * list, int n);
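+// example (sketch, mirroring the usage in llama.cpp with the Metal helpers):
+//   ggml_allocr_set_parse_seq(alloc, ggml_metal_get_concur_list(ctx_metal),
+//                             ggml_metal_if_optimized(ctx_metal));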
+
GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
// try to find operations that can be run concurrently in the graph
// you should run it again if the topology of your graph changes
-void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+void ggml_metal_graph_find_concurrency(struct ggml_metal_context * ctx, struct ggml_cgraph * gf, bool check_mem);
-// if the graph has been optimized for concurrently dispatch
-bool ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+// if the graph has been optimized for concurrent dispatch, returns the length of the concur_list (0 if not optimized)
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx);
+
+// returns the concur_list for ggml_alloc
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx);
// same as ggml_graph_compute but uses Metal
// creates gf->n_threads command buffers in parallel
ctx->n_cb = n_cb;
}
-bool ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
- if (ctx->concur_list_len) {
- return true;
- }
- return false;
+int ggml_metal_if_optimized(struct ggml_metal_context * ctx) {
+ return ctx->concur_list_len;
+}
+
+int * ggml_metal_get_concur_list(struct ggml_metal_context * ctx) {
+ return ctx->concur_list;
}
// finds the Metal buffer that contains the tensor data on the GPU device
void ggml_metal_graph_find_concurrency(
struct ggml_metal_context * ctx,
- struct ggml_cgraph * gf) {
+ struct ggml_cgraph * gf, bool check_mem) {
int search_depth = gf->n_nodes; //we only find concurrency in this range to avoid wasting too much time
int nodes_unused[GGML_MAX_CONCUR];
}
}
}
- if (exe_flag) {
+ if (exe_flag && check_mem) {
// check if nodes[i]'s data will be overwritten by a node before nodes[i].
// if node[5] and node[3] write to the same memory region, then we can't issue node[5] before node[3]
int64_t data_start = (int64_t) gf->nodes[i]->data;
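+    // overlap check sketch (assuming ggml_nbytes() gives the size of the region at data):
+    // two regions [a, a+la) and [b, b+lb) overlap iff a < b + lb && b < a + la;
+    // only nodes whose regions are disjoint may be issued out of order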
#define LLAMA_LOG_ERROR(...) llama_log_internal(LLAMA_LOG_LEVEL_ERROR, __VA_ARGS__)
-#if !defined(GGML_USE_CUBLAS) && !defined(GGML_USE_METAL)
+#if !defined(GGML_USE_CUBLAS)
#include "ggml-alloc.h"
#define LLAMA_USE_ALLOCATOR
#else
#ifdef GGML_USE_METAL
if (lctx.ctx_metal) {
- // TODO: disabled until #2413 is resolved
- //if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
- // ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
- //}
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
ggml_metal_graph_compute(lctx.ctx_metal, gf);
ggml_metal_get_tensor (lctx.ctx_metal, res);
int n_past = hparams.n_ctx - n_tokens;
llama_token token = llama_token_bos(); // not actually used by llama_build_graph, but required to choose between token and embedding inputs graph
ggml_cgraph * gf = llama_build_graph(*ctx, &token, NULL, n_tokens, n_past);
-
+#ifdef GGML_USE_METAL
+ if (params.n_gpu_layers > 0) {
+ ctx->ctx_metal = ggml_metal_init(1);
+ if (!ctx->ctx_metal) {
+ LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
+ llama_free(ctx);
+ return NULL;
+ }
+ ggml_metal_graph_find_concurrency(ctx->ctx_metal, gf, false);
+ ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+ }
+#endif
// measure memory requirements for the graph
size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf) + tensor_alignment;
ctx->buf_alloc.resize(alloc_size);
ctx->alloc = ggml_allocr_new(ctx->buf_alloc.addr, ctx->buf_alloc.size, tensor_alignment);
+#ifdef GGML_USE_METAL
+ if (ctx->ctx_metal) {
+ ggml_allocr_set_parse_seq(ctx->alloc, ggml_metal_get_concur_list(ctx->ctx_metal), ggml_metal_if_optimized(ctx->ctx_metal));
+ }
+#endif
}
#else
ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
#ifdef GGML_USE_METAL
if (params.n_gpu_layers > 0) {
// this allocates all Metal resources and memory buffers
- ctx->ctx_metal = ggml_metal_init(1);
-
- if (!ctx->ctx_metal) {
- LLAMA_LOG_ERROR("%s: ggml_metal_init() failed\n", __func__);
- llama_free(ctx);
- return NULL;
- }
void * data_ptr = NULL;
size_t data_size = 0;
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "eval", ctx->buf_compute.addr, ctx->buf_compute.size, 0));
LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "kv", ctx->kv_self.buf.addr, ctx->kv_self.buf.size, 0));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr0", ctx->buf_scratch[0].addr, ctx->buf_scratch[0].size, 0));
- LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "scr1", ctx->buf_scratch[1].addr, ctx->buf_scratch[1].size, 0));
+ LLAMA_METAL_CHECK_BUF(ggml_metal_add_buffer(ctx->ctx_metal, "alloc", ctx->buf_alloc.addr, ctx->buf_alloc.size, 0));
#undef LLAMA_METAL_CHECK_BUF
}
#endif