From: slaren Date: Sun, 11 Feb 2024 12:37:58 +0000 (+0100) Subject: ggml-alloc : v3 (#727) X-Git-Tag: upstream/0.0.1642~973 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=5070f078a67c18c11736e78316ab715ca9afde16;p=pkg%2Fggml%2Fsources%2Fggml ggml-alloc : v3 (#727) * ggml-alloc v3 ggml-ci * fix ci ggml-ci * whisper : check for backend buffer allocation failures * whisper : avoid leaks when initialization fails * cleanup ggml-ci * style fixes ggml-ci --- diff --git a/ci/run.sh b/ci/run.sh index cd9435d9..30463bbd 100644 --- a/ci/run.sh +++ b/ci/run.sh @@ -163,8 +163,9 @@ function gg_run_gpt_2 { model="../models-mnt/gpt-2/ggml-model-gpt-2-117M.bin" prompts="../examples/prompts/gpt-2.txt" - (time ./bin/gpt-2-backend2 --model ${model} -s 1234 -n 64 -tt ${prompts} ) 2>&1 | tee -a $OUT/${ci}-tg.log - (time ./bin/gpt-2-backend2 --model ${model} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log + (time ./bin/gpt-2-backend --model ${model} -s 1234 -n 64 -tt ${prompts} ) 2>&1 | tee -a $OUT/${ci}-tg.log + (time ./bin/gpt-2-backend --model ${model} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log + (time ./bin/gpt-2-sched --model ${model} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log (time ./bin/gpt-2-batched --model ${model} -s 1234 -n 64 -np 8 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log diff --git a/examples/gpt-2/CMakeLists.txt b/examples/gpt-2/CMakeLists.txt index 91f15f0f..a1e9df73 100644 --- a/examples/gpt-2/CMakeLists.txt +++ b/examples/gpt-2/CMakeLists.txt @@ -13,8 +13,8 @@ set(TEST_TARGET gpt-2-backend) add_executable(${TEST_TARGET} main-backend.cpp) target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) -set(TEST_TARGET gpt-2-backend2) -add_executable(${TEST_TARGET} main.cpp) +set(TEST_TARGET gpt-2-sched) +add_executable(${TEST_TARGET} main-sched.cpp) target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml) # diff --git a/examples/gpt-2/main-alloc.cpp b/examples/gpt-2/main-alloc.cpp index c0a68469..b0ddb52a 100644 --- a/examples/gpt-2/main-alloc.cpp +++ b/examples/gpt-2/main-alloc.cpp @@ -1,5 +1,6 @@ #include "ggml/ggml.h" #include "ggml/ggml-alloc.h" +#include "ggml/ggml-backend.h" #include "common.h" #include "common-ggml.h" @@ -69,7 +70,7 @@ struct gpt2_model { struct ggml_tensor * memory_v; // - struct ggml_context * ctx; + struct ggml_context * ctx_w; std::map tensors; }; @@ -153,7 +154,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & return false; } - auto & ctx = model.ctx; + auto & ctx = model.ctx_w; size_t ctx_size = 0; @@ -207,8 +208,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & /*.no_alloc =*/ false, }; - model.ctx = ggml_init(params); - if (!model.ctx) { + model.ctx_w = ggml_init(params); + if (!model.ctx_w) { fprintf(stderr, "%s: ggml_init() failed\n", __func__); return false; } @@ -385,10 +386,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & // build the computation graph struct ggml_cgraph * gpt2_graph( const gpt2_model & model, - struct ggml_allocr * allocr, const int n_past, - const std::vector & embd_inp) { - const int N = embd_inp.size(); + const int n_tokens) { + const int N = n_tokens; const auto & hparams = model.hparams; @@ -404,7 +404,7 @@ struct ggml_cgraph * gpt2_graph( struct ggml_init_params params = { /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf.data(), - /*.no_alloc =*/ true, // 
the tensors will be allocated later by ggml_allocr_alloc_graph() + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph() }; struct ggml_context * ctx0 = ggml_init(params); @@ -412,20 +412,16 @@ struct ggml_cgraph * gpt2_graph( struct ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, embd); - - // avoid writing to tensors if we are only measuring the memory usage - if (!ggml_allocr_is_measure(allocr)) { - memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - } + // at this point, the tensor data is not allocated yet and cannot be set + // we will find the tensor after the graph is allocated by its name, and set the data then + ggml_set_name(embd, "embd"); + // setting a tensor as an input will ensure that it is allocated at the beginning of the graph + // this is important to ensure that the input tensors are not overwritten before they are used + ggml_set_input(embd); struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, position); - if (!ggml_allocr_is_measure(allocr)) { - for (int i = 0; i < N; ++i) { - ((int32_t *) position->data)[i] = n_past + i; - } - } + ggml_set_name(position, "position"); + ggml_set_input(position); // wte + wpe struct ggml_tensor * inpL = @@ -655,6 +651,9 @@ struct ggml_cgraph * gpt2_graph( // [ 768, 50257] - model.lm_head // [ 768, N] - inpL inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + ggml_set_name(inpL, "logits"); + // setting a tensor as the output will ensure that it is not overwritten by subsequent operations + ggml_set_output(inpL); // logits -> probs //inpL = ggml_soft_max(ctx0, inpL); @@ -669,7 +668,7 @@ struct ggml_cgraph * gpt2_graph( // evaluate the transformer // // - model: the model -// - allocr: ggml_allocr to use to allocate the compute buffer +// - allocr: ggml_gallocr to use to allocate the compute buffer // - n_threads: number of threads to use // - n_past: the context size so far // - embd_inp: the embeddings of the tokens in the context @@ -677,7 +676,7 @@ struct ggml_cgraph * gpt2_graph( // bool gpt2_eval( const gpt2_model & model, - struct ggml_allocr * allocr, + ggml_gallocr_t allocr, const int n_threads, const int n_past, const std::vector & embd_inp, @@ -688,13 +687,19 @@ bool gpt2_eval( const int n_vocab = hparams.n_vocab; - // reset the allocator to free all the memory allocated during the previous inference - ggml_allocr_reset(allocr); + struct ggml_cgraph * gf = gpt2_graph(model, n_past, embd_inp.size()); - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); + // allocate the graph tensors + ggml_gallocr_alloc_graph(allocr, gf); - // allocate tensors - ggml_allocr_alloc_graph(allocr, gf); + // set the graph inputs + struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd"); + memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); + + struct ggml_tensor * position = ggml_graph_get_tensor(gf, "position"); + for (int i = 0; i < N; ++i) { + ((int32_t *) position->data)[i] = n_past + i; + } // run the computation struct ggml_cplan plan = ggml_graph_plan(gf, n_threads); @@ -708,15 +713,15 @@ bool gpt2_eval( // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); //} - // in this case, the output tensor is the last one in the graph - struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; + // get the graph outputs + struct ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits"); //embd_w.resize(n_vocab*N); - 
//memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N); + //memcpy(embd_w.data(), ggml_get_data(logits), sizeof(float)*n_vocab*N); // return result just for the last token embd_w.resize(n_vocab); - memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab); + memcpy(embd_w.data(), (float *) ggml_get_data(logits) + (n_vocab*(N-1)), sizeof(float)*n_vocab); return true; } @@ -763,27 +768,19 @@ int main(int argc, char ** argv) { test_gpt_tokenizer(vocab, params.token_test); } - // keep this buffer alive while evaluating the model - std::vector compute_buffer; - - struct ggml_allocr * allocr = NULL; + ggml_gallocr_t allocr = NULL; // allocate the compute buffer { - allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN); + allocr = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); // create the worst case graph for memory usage estimation int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); int n_past = model.hparams.n_ctx - n_tokens; - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); - - // compute the required memory - size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN; - - // recreate the allocator with the required memory - ggml_allocr_free(allocr); - compute_buffer.resize(mem_size); - allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN); + struct ggml_cgraph * gf = gpt2_graph(model, n_past, n_tokens); + // pre-allocate the compute buffer for the worst case (optional) + ggml_gallocr_reserve(allocr, gf); + size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0); fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); } @@ -880,7 +877,7 @@ int main(int argc, char ** argv) { printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); } - ggml_free(model.ctx); + ggml_free(model.ctx_w); return 0; } diff --git a/examples/gpt-2/main-backend.cpp b/examples/gpt-2/main-backend.cpp index 27591613..cfa618f7 100644 --- a/examples/gpt-2/main-backend.cpp +++ b/examples/gpt-2/main-backend.cpp @@ -87,7 +87,8 @@ struct gpt2_model { struct ggml_tensor * memory_v; // - struct ggml_context * ctx; + struct ggml_context * ctx_w; + struct ggml_context * ctx_kv; ggml_backend_t backend = NULL; @@ -177,7 +178,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & return false; } - auto & ctx = model.ctx; + auto & ctx = model.ctx_w; // create the ggml context { @@ -307,6 +308,24 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & // key + value memory { + auto * ctx = model.ctx_kv; + + // create the ggml context + { + size_t n_tensors = 2; + struct ggml_init_params params = { + /*.mem_size =*/ ggml_tensor_overhead() * n_tensors, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + ctx = ggml_init(params); + if (!ctx) { + fprintf(stderr, "%s: ggml_init() failed\n", __func__); + return false; + } + } + const auto & hparams = model.hparams; const int n_embd = hparams.n_embd; @@ -319,25 +338,11 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + // allocate the KV memory in a backend buffer + model.buffer_kv = ggml_backend_alloc_ctx_tensors(ctx, model.backend); + const size_t memory_size = 
ggml_backend_buffer_get_size(model.buffer_kv); printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); - - // create a backend buffer (can be in host or device memory) - model.buffer_kv = ggml_backend_alloc_buffer(model.backend, memory_size + 256); - - // allocate the tensors into the backend buffer - { - ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_kv); - - // this updates the pointers in the tensors to point to the correct location in the buffer - // this is necessary since the ggml_context is .no_alloc == true - // note that the buffer can actually be a device buffer, depending on the backend - ggml_allocr_alloc(alloc, model.memory_k); - ggml_allocr_alloc(alloc, model.memory_v); - - ggml_allocr_free(alloc); - } } // load weights @@ -402,12 +407,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & return false; } - if (ggml_backend_is_cpu (model.backend) -#ifdef GGML_USE_METAL - || ggml_backend_is_metal(model.backend) -#endif - ) { - // for the CPU and Metal backend, we can read directly into the tensor + if (ggml_backend_buffer_is_host(model.buffer_w)) { + // for some backends such as CPU and Metal, the tensor data is in system memory and we can read directly into it fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); } else { // read into a temporary buffer first, then copy to device memory @@ -418,7 +419,6 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & // GPT-2 models share the WTE tensor as the LM head if (name == "model/wte" && has_lm_head == false) { - //ggml_allocr_alloc(alloc, model.lm_head); //ggml_backend_tensor_copy(tensor, model.lm_head); model.lm_head = tensor; } @@ -441,10 +441,9 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & // build the computation graph struct ggml_cgraph * gpt2_graph( const gpt2_model & model, - struct ggml_allocr * allocr, const int n_past, - const std::vector & embd_inp) { - const int N = embd_inp.size(); + const int n_tokens) { + const int N = n_tokens; const auto & hparams = model.hparams; @@ -460,35 +459,30 @@ struct ggml_cgraph * gpt2_graph( struct ggml_init_params params = { /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf.data(), - /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph() }; - struct ggml_context * ctx0 = ggml_init(params); + struct ggml_context * ctx = ggml_init(params); - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GPT2_MAX_NODES, false); + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, GPT2_MAX_NODES, false); - struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, embd); + struct ggml_tensor * embd = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); + // at this point, the tensor data is not allocated yet and cannot be set + // we will find the tensor after the graph is allocated by its name, and set the data then + ggml_set_name(embd, "embd"); + // setting a tensor as an input will ensure that it is allocated at the beginning of the graph + // this is important to ensure that the input tensors are not overwritten before they are used + ggml_set_input(embd); - // avoid writing to tensors if we are only measuring the memory usage - if (!ggml_allocr_is_measure(allocr)) { - ggml_backend_tensor_set(embd, embd_inp.data(), 0, N*ggml_element_size(embd)); - } - - struct ggml_tensor * 
position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N); - ggml_allocr_alloc(allocr, position); - if (!ggml_allocr_is_measure(allocr)) { - for (int i = 0; i < N; ++i) { - int32_t v = n_past + i; - ggml_backend_tensor_set(position, &v, i*sizeof(int32_t), sizeof(v)); - } - } + struct ggml_tensor * position = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N); + ggml_set_name(position, "position"); + ggml_set_input(position); // wte + wpe struct ggml_tensor * inpL = - ggml_add(ctx0, - ggml_get_rows(ctx0, model.wte, embd), - ggml_get_rows(ctx0, model.wpe, position)); + ggml_add(ctx, + ggml_get_rows(ctx, model.wte, embd), + ggml_get_rows(ctx, model.wpe, position)); for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * cur; @@ -496,12 +490,12 @@ struct ggml_cgraph * gpt2_graph( // norm { // [ 768, N] - cur = ggml_norm(ctx0, inpL, hparams.eps); + cur = ggml_norm(ctx, inpL, hparams.eps); // cur = ln_1_g*cur + ln_1_b // [ 768, N] - cur = ggml_add(ctx0, - ggml_mul(ctx0, + cur = ggml_add(ctx, + ggml_mul(ctx, cur, model.layers[il].ln_1_g), model.layers[il].ln_1_b); @@ -516,45 +510,45 @@ struct ggml_cgraph * gpt2_graph( // cur = attn_w*cur + attn_b // [2304, N] { - cur = ggml_mul_mat(ctx0, + cur = ggml_mul_mat(ctx, model.layers[il].c_attn_attn_w, cur); - cur = ggml_add(ctx0, + cur = ggml_add(ctx, cur, model.layers[il].c_attn_attn_b); } // self-attention { - struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); - struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + struct ggml_tensor * Qcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_tensor * Kcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + struct ggml_tensor * Vcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); // store key and value to memory if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * k = ggml_view_1d(ctx, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_1d(ctx, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + ggml_build_forward_expand(gf, ggml_cpy(ctx, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx, Vcur, v)); } // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) // [64, N, 12] struct ggml_tensor * Q = - ggml_permute(ctx0, - ggml_cpy(ctx0, + ggml_permute(ctx, + ggml_cpy(ctx, Qcur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd/n_head, n_head, N)), 0, 2, 1, 3); // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) // [64, n_past + N, 12] struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + ggml_permute(ctx, + ggml_reshape_3d(ctx, + ggml_view_1d(ctx, model.memory_k, (n_past + N)*n_embd, 
il*n_ctx*ggml_element_size(model.memory_k)*n_embd), n_embd/n_head, n_head, n_past + N), 0, 2, 1, 3); @@ -572,47 +566,47 @@ struct ggml_cgraph * gpt2_graph( // K * Q // [n_past + N, N, 12] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + struct ggml_tensor * KQ = ggml_mul_mat(ctx, K, Q); // KQ_scaled = KQ / sqrt(n_embd/n_head) // [n_past + N, N, 12] struct ggml_tensor * KQ_scaled = - ggml_scale(ctx0, + ggml_scale(ctx, KQ, 1.0f/sqrtf(float(n_embd)/n_head)); // KQ_masked = mask_past(KQ_scaled) // [n_past + N, N, 12] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx, KQ_scaled, n_past); // KQ = soft_max(KQ_masked) // [n_past + N, N, 12] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx, KQ_masked); // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() // [n_past + N, 64, 12] struct ggml_tensor * V_trans = - ggml_cpy(ctx0, - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + ggml_cpy(ctx, + ggml_permute(ctx, + ggml_reshape_3d(ctx, + ggml_view_1d(ctx, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), n_embd/n_head, n_head, n_past + N), 1, 2, 0, 3), - ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head)); + ggml_new_tensor_3d(ctx, model.memory_v->type, n_past + N, n_embd/n_head, n_head)); // KQV = transpose(V) * KQ_soft_max // [64, N, 12] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + struct ggml_tensor * KQV = ggml_mul_mat(ctx, V_trans, KQ_soft_max); // KQV_merged = KQV.permute(0, 2, 1, 3) // [64, 12, N] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + struct ggml_tensor * KQV_merged = ggml_permute(ctx, KQV, 0, 2, 1, 3); // cur = KQV_merged.contiguous().view(n_embd, N) // [768, N] - cur = ggml_cpy(ctx0, + cur = ggml_cpy(ctx, KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, N)); } // projection @@ -624,17 +618,17 @@ struct ggml_cgraph * gpt2_graph( // cur = proj_w*cur + proj_b // [768, N] { - cur = ggml_mul_mat(ctx0, + cur = ggml_mul_mat(ctx, model.layers[il].c_attn_proj_w, cur); - cur = ggml_add(ctx0, + cur = ggml_add(ctx, cur, model.layers[il].c_attn_proj_b); } // add the input - cur = ggml_add(ctx0, cur, inpL); + cur = ggml_add(ctx, cur, inpL); struct ggml_tensor * inpFF = cur; @@ -642,12 +636,12 @@ struct ggml_cgraph * gpt2_graph( { // norm { - cur = ggml_norm(ctx0, inpFF, hparams.eps); + cur = ggml_norm(ctx, inpFF, hparams.eps); // cur = ln_2_g*cur + ln_2_b // [ 768, N] - cur = ggml_add(ctx0, - ggml_mul(ctx0, + cur = ggml_add(ctx, + ggml_mul(ctx, cur, model.layers[il].ln_2_g), model.layers[il].ln_2_b); @@ -661,17 +655,17 @@ struct ggml_cgraph * gpt2_graph( // // cur = fc_w*cur + fc_b // [3072, N] - cur = ggml_mul_mat(ctx0, + cur = ggml_mul_mat(ctx, model.layers[il].c_mlp_fc_w, cur); - cur = ggml_add(ctx0, + cur = ggml_add(ctx, cur, model.layers[il].c_mlp_fc_b); // GELU activation // [3072, N] - cur = ggml_gelu(ctx0, cur); + cur = ggml_gelu(ctx, cur); // projection // [ 768, 3072] - model.layers[il].c_mlp_proj_w @@ -681,28 +675,28 @@ struct ggml_cgraph * gpt2_graph( // // cur = proj_w*cur + proj_b // [768, N] - cur = ggml_mul_mat(ctx0, + cur = ggml_mul_mat(ctx, model.layers[il].c_mlp_proj_w, cur); - cur = 
ggml_add(ctx0, + cur = ggml_add(ctx, cur, model.layers[il].c_mlp_proj_b); } // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); + inpL = ggml_add(ctx, cur, inpFF); } // norm { // [ 768, N] - inpL = ggml_norm(ctx0, inpL, hparams.eps); + inpL = ggml_norm(ctx, inpL, hparams.eps); // inpL = ln_f_g*inpL + ln_f_b // [ 768, N] - inpL = ggml_add(ctx0, - ggml_mul(ctx0, + inpL = ggml_add(ctx, + ggml_mul(ctx, inpL, model.ln_f_g), model.ln_f_b); @@ -711,14 +705,17 @@ struct ggml_cgraph * gpt2_graph( // inpL = WTE * inpL // [ 768, 50257] - model.lm_head // [ 768, N] - inpL - inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + inpL = ggml_mul_mat(ctx, model.lm_head, inpL); + ggml_set_name(inpL, "logits"); + // setting a tensor as the output will ensure that it is not overwritten by subsequent operations + ggml_set_output(inpL); // logits -> probs //inpL = ggml_soft_max(ctx0, inpL); ggml_build_forward_expand(gf, inpL); - ggml_free(ctx0); + ggml_free(ctx); return gf; } @@ -726,7 +723,7 @@ struct ggml_cgraph * gpt2_graph( // evaluate the transformer // // - model: the model -// - allocr: ggml_allocr to use to allocate the compute buffer +// - allocr: ggml_gallocr to use to allocate the compute buffer // - n_threads: number of threads to use // - n_past: the context size so far // - embd_inp: the embeddings of the tokens in the context @@ -734,7 +731,7 @@ struct ggml_cgraph * gpt2_graph( // bool gpt2_eval( const gpt2_model & model, - struct ggml_allocr * allocr, + ggml_gallocr_t allocr, const int n_threads, const int n_past, const std::vector & embd_inp, @@ -745,13 +742,20 @@ bool gpt2_eval( const int n_vocab = hparams.n_vocab; - // reset the allocator to free all the memory allocated during the previous inference - ggml_allocr_reset(allocr); + struct ggml_cgraph * gf = gpt2_graph(model, n_past, embd_inp.size()); + + // allocate the graph tensors + ggml_gallocr_alloc_graph(allocr, gf); - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp); + // set the graph inputs + struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd"); + ggml_backend_tensor_set(embd, embd_inp.data(), 0, N*ggml_element_size(embd)); - // allocate tensors - ggml_allocr_alloc_graph(allocr, gf); + struct ggml_tensor * position = ggml_graph_get_tensor(gf, "position"); + for (int i = 0; i < N; ++i) { + int32_t v = n_past + i; + ggml_backend_tensor_set(position, &v, i*sizeof(int32_t), sizeof(v)); + } // set backend options if (ggml_backend_is_cpu(model.backend)) { @@ -764,60 +768,23 @@ bool gpt2_eval( } #endif - // test -#if 0 && defined(GGML_USE_CUBLAS) - if (ggml_backend_is_cuda(model.backend)) { - auto eval_callback = [](int index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) { - auto tv1 = tensor_to_float(t1); - auto tv2 = tensor_to_float(t2); - -#if 1 - float sim = cosine_similarity(tv1, tv2); - float len1 = vec_len(tv1); - float len2 = vec_len(tv2); - float lenr = len1/len2; - float lenrd = std::abs(1.0f-lenr); - - float angle = acosf(sim)*180.0f/M_PI; - - if (angle > 0.5f || lenrd > 0.05f) { - printf("%3d [%15s] %s: sim = %f, a = %f, lenrd = %f\n", index, ggml_op_desc(t1), t1->name, sim, angle, lenrd); - } - assert(sim > 0.90f); -#else - float dist = distance(tv1, tv2) / vec_len(tv1); - if (dist > 0.01f) { - printf("%3d [%15s] %s: distance = %f\n", index, ggml_op_desc(t1), t1->name, dist); - } -#endif - - return true; - }; - ggml_backend_t backend_cpu = ggml_backend_cpu_init(); - ggml_backend_compare_graph_backend(model.backend, backend_cpu, gf, eval_callback, nullptr); - 
ggml_backend_free(backend_cpu); - //printf("done\n"); - } else -#endif - { - // run the computation - ggml_backend_graph_compute(model.backend, gf); - } + // run the computation + ggml_backend_graph_compute(model.backend, gf); //if (n_past%100 == 0) { // ggml_graph_print (&gf); // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); //} - // in this case, the output tensor is the last one in the graph - struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; + // get the graph outputs + struct ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits"); //embd_w.resize(n_vocab*N); - //ggml_backend_tensor_get(inpL, embd_w.data(), 0, sizeof(float)*n_vocab*N); + //ggml_backend_tensor_get(logits, embd_w.data(), 0, sizeof(float)*n_vocab*N); // return result just for the last token embd_w.resize(n_vocab); - ggml_backend_tensor_get(inpL, embd_w.data(), (n_vocab*(N-1))*sizeof(float), sizeof(float)*n_vocab); + ggml_backend_tensor_get(logits, embd_w.data(), (n_vocab*(N-1))*sizeof(float), sizeof(float)*n_vocab); return true; } @@ -864,28 +831,20 @@ int main(int argc, char ** argv) { test_gpt_tokenizer(vocab, params.token_test); } - // keep this buffer alive while evaluating the model - ggml_backend_buffer_t buf_compute; - - struct ggml_allocr * allocr = NULL; + ggml_gallocr_t allocr = NULL; // allocate the compute buffer { - // create an allocator to measure the memory usage - allocr = ggml_allocr_new_measure_from_backend(model.backend); + // create a graph allocator with the backend's default buffer type + allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); // create the worst case graph for memory usage estimation int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); int n_past = model.hparams.n_ctx - n_tokens; - struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector(n_tokens, 0)); - - // compute the required memory - size_t mem_size = ggml_allocr_alloc_graph(allocr, gf); - - // recreate the allocator with the required memory - ggml_allocr_free(allocr); - buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size); - allocr = ggml_allocr_new_from_buffer(buf_compute); + struct ggml_cgraph * gf = gpt2_graph(model, n_past, n_tokens); + // pre-allocate the compute buffer for the worst case (optional) + ggml_gallocr_reserve(allocr, gf); + size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0); fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); } @@ -982,11 +941,11 @@ int main(int argc, char ** argv) { printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); } - ggml_free(model.ctx); + ggml_free(model.ctx_w); + ggml_gallocr_free(allocr); ggml_backend_buffer_free(model.buffer_w); ggml_backend_buffer_free(model.buffer_kv); - ggml_backend_buffer_free(buf_compute); ggml_backend_free(model.backend); return 0; diff --git a/examples/gpt-2/main-batched.cpp b/examples/gpt-2/main-batched.cpp index 02b70760..51094467 100644 --- a/examples/gpt-2/main-batched.cpp +++ b/examples/gpt-2/main-batched.cpp @@ -116,7 +116,7 @@ struct gpt2_model { gpt2_kv_cache kv_cache; - struct ggml_context * ctx; + struct ggml_context * ctx_w; ggml_backend_t backend = NULL; @@ -225,7 +225,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & return false; } - auto & ctx = model.ctx; + auto & ctx = model.ctx_w; size_t buffer_size = 0; @@ -277,8 +277,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & /*.no_alloc =*/ true, }; - model.ctx = 
ggml_init(params); - if (!model.ctx) { + model.ctx_w = ggml_init(params); + if (!model.ctx_w) { fprintf(stderr, "%s: ggml_init() failed\n", __func__); return false; } @@ -419,21 +419,21 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & // allocate the tensors into the backend buffer { - ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.kv_cache.buffer); + ggml_tallocr * alloc = ggml_tallocr_new(model.kv_cache.buffer); // this updates the pointers in the tensors to point to the correct location in the buffer // this is necessary since the ggml_context is .no_alloc == true // note that the buffer can actually be a device buffer, depending on the backend - ggml_allocr_alloc(alloc, model.kv_cache.k); - ggml_allocr_alloc(alloc, model.kv_cache.v); + ggml_tallocr_alloc(alloc, model.kv_cache.k); + ggml_tallocr_alloc(alloc, model.kv_cache.v); - ggml_allocr_free(alloc); + ggml_tallocr_free(alloc); } } // load weights { - ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_w); + ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_w); size_t total_size = 0; @@ -495,7 +495,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & return false; } - ggml_allocr_alloc(alloc, tensor); + ggml_tallocr_alloc(alloc, tensor); if (ggml_backend_is_cpu (model.backend) #ifdef GGML_USE_METAL @@ -513,7 +513,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & // GPT-2 models share the WTE tensor as the LM head if (name == "model/wte" && has_lm_head == false) { - //ggml_allocr_alloc(alloc, model.lm_head); + //ggml_tallocr_alloc(alloc, model.lm_head); //ggml_backend_tensor_copy(tensor, model.lm_head); model.lm_head = tensor; } @@ -525,7 +525,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & total_size += ggml_nbytes(tensor); } - ggml_allocr_free(alloc); + ggml_tallocr_free(alloc); printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); } @@ -537,8 +537,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & // build the computation graph struct ggml_cgraph * gpt2_graph( const gpt2_model & model, - struct ggml_allocr * allocr, - const gpt2_batch & batch) { + const gpt2_batch & batch, + bool measure) { const auto & hparams = model.hparams; const int n_embd = hparams.n_embd; @@ -549,8 +549,8 @@ struct ggml_cgraph * gpt2_graph( const auto & kv_cache = model.kv_cache; const int32_t n_tokens = batch.n_tokens; - const int32_t n_kv = ggml_allocr_is_measure(allocr) ? n_ctx : kv_cache.n; - const int32_t kv_head = ggml_allocr_is_measure(allocr) ? n_ctx - n_tokens : kv_cache.head; + const int32_t n_kv = measure ? n_ctx : kv_cache.n; + const int32_t kv_head = measure ? 
n_ctx - n_tokens : kv_cache.head; // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data static size_t buf_size = ggml_tensor_overhead()*GPT2_MAX_NODES + ggml_graph_overhead_custom(GPT2_MAX_NODES, false); @@ -559,7 +559,7 @@ struct ggml_cgraph * gpt2_graph( struct ggml_init_params params = { /*.mem_size =*/ buf_size, /*.mem_buffer =*/ buf.data(), - /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph() }; struct ggml_context * ctx0 = ggml_init(params); @@ -569,19 +569,12 @@ struct ggml_cgraph * gpt2_graph( struct ggml_tensor * inpL; if (batch.token) { struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_allocr_alloc(allocr, inp_tokens); - if (!ggml_allocr_is_measure(allocr)) { - ggml_backend_tensor_set(inp_tokens, batch.token, 0, n_tokens*ggml_element_size(inp_tokens)); - } + ggml_set_name(inp_tokens, "inp_tokens"); + ggml_set_input(inp_tokens); struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_allocr_alloc(allocr, position); - if (!ggml_allocr_is_measure(allocr)) { - for (int i = 0; i < n_tokens; ++i) { - int32_t v = batch.pos[i]; - ggml_backend_tensor_set(position, &v, i*sizeof(int32_t), sizeof(v)); - } - } + ggml_set_name(position, "position"); + ggml_set_input(position); // wte + wpe inpL = @@ -592,37 +585,15 @@ struct ggml_cgraph * gpt2_graph( GGML_ASSERT(batch.embd); inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens); - - ggml_allocr_alloc(allocr, inpL); - if (!ggml_allocr_is_measure(allocr)) { - ggml_backend_tensor_set(inpL, batch.embd, 0, n_tokens * n_embd * ggml_element_size(inpL)); - } + ggml_set_name(inpL, "embd"); + ggml_set_input(inpL); } // KQ_mask (mask for 1 head, it will be broadcasted to all heads) struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); ggml_set_name(KQ_mask, "KQ_mask"); - ggml_allocr_alloc(allocr, KQ_mask); - if (!ggml_allocr_is_measure(allocr)) { - std::vector data_buf(n_kv*n_tokens); - const float neg_inf_v = -INFINITY; + ggml_set_input(KQ_mask); - for (int h = 0; h < 1; ++h) { - int h_offset = h*(n_kv*n_tokens); - for (int j = 0; j < n_tokens; ++j) { - const gpt2_pos pos = batch.pos[j]; - const gpt2_seq_id seq_id = batch.seq_id[j]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_cache.cells[i].has_seq_id(seq_id) || kv_cache.cells[i].pos > pos) { - data_buf[h_offset + j*n_kv + i] = neg_inf_v; - } - } - } - } - - ggml_backend_tensor_set(KQ_mask, data_buf.data(), 0, data_buf.size() * sizeof(float)); - } for (int il = 0; il < n_layer; ++il) { struct ggml_tensor * cur; @@ -901,8 +872,8 @@ void gpt2_batch_free(struct gpt2_batch batch) { // 0 - success // < 0 - error int gpt2_decode( - struct gpt2_model & model, - struct ggml_allocr * allocr, + struct gpt2_model & model, + ggml_gallocr_t allocr, struct gpt2_batch batch, int n_threads, std::vector & logits) { @@ -926,13 +897,51 @@ int gpt2_decode( cache.n = cache.head + n_tokens; - // reset the allocator to free all the memory allocated during the previous inference - ggml_allocr_reset(allocr); - - struct ggml_cgraph * gf = gpt2_graph(model, allocr, batch); + struct ggml_cgraph * gf = gpt2_graph(model, batch, false); // allocate tensors - ggml_allocr_alloc_graph(allocr, gf); + ggml_gallocr_alloc_graph(allocr, gf); + + // set the graph inputs + if (batch.token) { + struct 
ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens"); + ggml_backend_tensor_set(inp_tokens, batch.token, 0, n_tokens*ggml_element_size(inp_tokens)); + + struct ggml_tensor * position = ggml_graph_get_tensor(gf, "position"); + for (int i = 0; i < n_tokens; ++i) { + int32_t v = batch.pos[i]; + ggml_backend_tensor_set(position, &v, i*sizeof(int32_t), sizeof(v)); + } + } else { + struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd"); + ggml_backend_tensor_set(embd, batch.embd, 0, n_tokens * hparams.n_embd * ggml_element_size(embd)); + } + + { + struct ggml_tensor * KQ_mask = ggml_graph_get_tensor(gf, "KQ_mask"); + const auto & kv_cache = model.kv_cache; + const int32_t n_tokens = batch.n_tokens; + const int32_t n_kv = kv_cache.n; + + std::vector data_buf(n_kv*n_tokens); + const float neg_inf_v = -INFINITY; + + for (int h = 0; h < 1; ++h) { + int h_offset = h*(n_kv*n_tokens); + for (int j = 0; j < n_tokens; ++j) { + const gpt2_pos pos = batch.pos[j]; + const gpt2_seq_id seq_id = batch.seq_id[j]; + + for (int i = 0; i < n_kv; ++i) { + if (!kv_cache.cells[i].has_seq_id(seq_id) || kv_cache.cells[i].pos > pos) { + data_buf[h_offset + j*n_kv + i] = neg_inf_v; + } + } + } + } + + ggml_backend_tensor_set(KQ_mask, data_buf.data(), 0, data_buf.size() * sizeof(float)); + } // run the computation if (ggml_backend_is_cpu(model.backend)) { @@ -1024,9 +1033,6 @@ int main(int argc, char ** argv) { // tokenize the prompt std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); - // keep this buffer alive while evaluating the model - ggml_backend_buffer_t buf_compute; - const int n_parallel = params.n_parallel; const int n_batch_max = std::max(embd_inp.size(), (size_t)n_parallel); @@ -1035,24 +1041,18 @@ int main(int argc, char ** argv) { gpt2_batch batch = gpt2_batch_init(n_batch_max, 0); // prepare required memory and allocate the compute buffer - struct ggml_allocr * allocr = NULL; + ggml_gallocr_t allocr = NULL; { // create an allocator to measure the memory usage - allocr = ggml_allocr_new_measure_from_backend(model.backend); - - batch.n_tokens = n_batch_max; + allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend)); // create the worst case graph for memory usage estimation - struct ggml_cgraph * gf = gpt2_graph(model, allocr, batch); - - // compute the required memory - size_t mem_size = ggml_allocr_alloc_graph(allocr, gf); - - // recreate the allocator with the required memory - ggml_allocr_free(allocr); - buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size); - allocr = ggml_allocr_new_from_buffer(buf_compute); + batch.n_tokens = n_batch_max; + struct ggml_cgraph * gf = gpt2_graph(model, batch, true); + // pre-allocate the compute buffer for the worst case (optional) + ggml_gallocr_reserve(allocr, gf); + size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0); fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); } @@ -1207,11 +1207,11 @@ int main(int argc, char ** argv) { } gpt2_batch_free(batch); - ggml_free(model.ctx); + ggml_free(model.ctx_w); + ggml_gallocr_free(allocr); ggml_backend_buffer_free(model.buffer_w); ggml_backend_buffer_free(model.kv_cache.buffer); - ggml_backend_buffer_free(buf_compute); ggml_backend_free(model.backend); return 0; diff --git a/examples/gpt-2/main-ctx.cpp b/examples/gpt-2/main-ctx.cpp index 2c075f38..5dd11417 100644 --- a/examples/gpt-2/main-ctx.cpp +++ b/examples/gpt-2/main-ctx.cpp @@ -68,7 +68,7 @@ struct gpt2_model { struct ggml_tensor * memory_v; // - struct 
ggml_context * ctx; + struct ggml_context * ctx_w; std::map tensors; }; @@ -152,7 +152,7 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & return false; } - auto & ctx = model.ctx; + auto & ctx = model.ctx_w; size_t ctx_size = 0; @@ -206,8 +206,8 @@ bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & /*.no_alloc =*/ false, }; - model.ctx = ggml_init(params); - if (!model.ctx) { + model.ctx_w = ggml_init(params); + if (!model.ctx_w) { fprintf(stderr, "%s: ggml_init() failed\n", __func__); return false; } @@ -834,7 +834,7 @@ int main(int argc, char ** argv) { printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); } - ggml_free(model.ctx); + ggml_free(model.ctx_w); return 0; } diff --git a/examples/gpt-2/main-sched.cpp b/examples/gpt-2/main-sched.cpp new file mode 100644 index 00000000..b5b8af61 --- /dev/null +++ b/examples/gpt-2/main-sched.cpp @@ -0,0 +1,1071 @@ +#include "ggml/ggml.h" +#include "ggml/ggml-alloc.h" +#include "ggml/ggml-backend.h" + +#ifdef GGML_USE_CUBLAS +#include "ggml-cuda.h" +#endif + +#ifdef GGML_USE_METAL +#include "ggml-metal.h" +#endif + +#include "common.h" +#include "common-ggml.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#if defined(_MSC_VER) +#pragma warning(disable: 4244 4267) // possible loss of data +#endif + +#define GPT2_MAX_NODES 4096 + +static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) { + (void) level; + (void) user_data; + fputs(text, stderr); + fflush(stderr); +} + +// default hparams (GPT-2 117M) +struct gpt2_hparams { + int32_t n_vocab = 50257; + int32_t n_ctx = 1024; + int32_t n_embd = 768; + int32_t n_head = 12; + int32_t n_layer = 12; + int32_t ftype = 1; + float eps = 1e-5f; +}; + +struct gpt2_layer { + // normalization + struct ggml_tensor * ln_1_g; + struct ggml_tensor * ln_1_b; + + struct ggml_tensor * ln_2_g; + struct ggml_tensor * ln_2_b; + + // attention + struct ggml_tensor * c_attn_attn_w; + struct ggml_tensor * c_attn_attn_b; + + struct ggml_tensor * c_attn_proj_w; + struct ggml_tensor * c_attn_proj_b; + + // mlp + struct ggml_tensor * c_mlp_fc_w; + struct ggml_tensor * c_mlp_fc_b; + + struct ggml_tensor * c_mlp_proj_w; + struct ggml_tensor * c_mlp_proj_b; +}; + +struct gpt2_model { + gpt2_hparams hparams; + + // normalization + struct ggml_tensor * ln_f_g; + struct ggml_tensor * ln_f_b; + + struct ggml_tensor * wte; // position embedding + struct ggml_tensor * wpe; // token embedding + struct ggml_tensor * lm_head; // language model head + + std::vector layers; + + // key + value memory + struct ggml_tensor * memory_k; + struct ggml_tensor * memory_v; + + // + struct ggml_context * ctx_w; + + std::vector backends; + std::vector buffers_w; + ggml_backend_buffer_t buffer_kv; + ggml_backend_buffer_t buffer_input; + + std::map tensors; + + // inputs/constants + struct ggml_tensor * embd; + struct ggml_tensor * position; +}; + +void init_backends(gpt2_model & model, const gpt_params & params) { + ggml_backend_t gpu_backend = NULL; + + // initialize the backends +#ifdef GGML_USE_CUBLAS + if (params.n_gpu_layers > 0) { + fprintf(stderr, "%s: using CUDA backend\n", __func__); + gpu_backend = ggml_backend_cuda_init(0); + if (!gpu_backend) { + fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); + } + } +#endif + +#ifdef GGML_USE_METAL + if (params.n_gpu_layers > 0) { + fprintf(stderr, "%s: using Metal backend\n", __func__); + 
ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr); + gpu_backend = ggml_backend_metal_init(); + if (!gpu_backend) { + fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); + } else { + ggml_backend_metal_set_n_cb(gpu_backend, params.n_threads); + } + } +#endif + if (gpu_backend) { + model.backends.push_back(gpu_backend); + } + + // always add the CPU backend as a fallback + ggml_backend_t cpu_backend = ggml_backend_cpu_init(); + ggml_backend_cpu_set_n_threads(cpu_backend, params.n_threads); + model.backends.push_back(cpu_backend); +} + +// load the model's weights from a file +bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, const gpt_params & params) { + printf("%s: loading model from '%s'\n", __func__, fname.c_str()); + + auto fin = std::ifstream(fname, std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); + return false; + } + + // verify magic + { + uint32_t magic; + fin.read((char *) &magic, sizeof(magic)); + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); + return false; + } + } + + // load hparams + { + auto & hparams = model.hparams; + + fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); + fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); + fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); + fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); + fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); + fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); + + const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; + + printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); + printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); + printf("%s: n_embd = %d\n", __func__, hparams.n_embd); + printf("%s: n_head = %d\n", __func__, hparams.n_head); + printf("%s: n_layer = %d\n", __func__, hparams.n_layer); + printf("%s: ftype = %d\n", __func__, hparams.ftype); + printf("%s: qntvr = %d\n", __func__, qntvr); + + hparams.ftype %= GGML_QNT_VERSION_FACTOR; + } + + // load vocab + { + int32_t n_vocab = 0; + fin.read((char *) &n_vocab, sizeof(n_vocab)); + + if (n_vocab != model.hparams.n_vocab) { + fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", + __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); + return false; + } + + std::string word; + std::vector buf(128); + + for (int i = 0; i < n_vocab; i++) { + uint32_t len; + fin.read((char *) &len, sizeof(len)); + + buf.resize(len); + fin.read((char *) buf.data(), len); + word.assign(buf.data(), len); + + vocab.token_to_id[word] = i; + vocab.id_to_token[i] = word; + } + } + + // for the big tensors, we have the option to store the data in 16-bit floats or quantized + // in order to save memory and also to speed up the computation + ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); + if (wtype == GGML_TYPE_COUNT) { + fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", + __func__, fname.c_str(), model.hparams.ftype); + return false; + } + + auto & ctx = model.ctx_w; + + // create the ggml context + { + size_t n_tensors = 3 /* input */ + 2 /* kv */ + 6 + 12*model.hparams.n_layer; + struct ggml_init_params params = { + /*.mem_size =*/ ggml_tensor_overhead() * n_tensors, + /*.mem_buffer =*/ NULL, + /*.no_alloc =*/ true, + }; + + model.ctx_w = ggml_init(params); + if (!model.ctx_w) { + fprintf(stderr, "%s: ggml_init() failed\n", 
__func__); + return false; + } + } + + // create tensors for the weights + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_vocab = hparams.n_vocab; + + model.layers.resize(n_layer); + + model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx); + model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); + + // map by name + model.tensors["model/ln_f/g"] = model.ln_f_g; + model.tensors["model/ln_f/b"] = model.ln_f_b; + + model.tensors["model/wte"] = model.wte; + model.tensors["model/wpe"] = model.wpe; + model.tensors["model/lm_head"] = model.lm_head; + + for (int i = 0; i < n_layer; ++i) { + auto & layer = model.layers[i]; + + layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); + layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); + + layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); + layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); + layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); + + layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); + layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); + + // map by name + model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g; + model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b; + + model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g; + model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b; + + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w; + model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; + + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w; + model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; + } + } + + // assign tensors to backends + init_backends(model, params); + ggml_backend_t backend_gpu = model.backends.front(); + ggml_backend_t backend_cpu = model.backends.back(); + std::map tensor_backends; + { + const int i_gpu_first_layer = model.hparams.n_layer - params.n_gpu_layers; + for (auto it : model.tensors) { + const std::string & name = it.first; + // input tensors + if (name == "model/wte" || name == "model/wpe") { + if (params.n_gpu_layers > model.hparams.n_layer) { + tensor_backends[name] = backend_gpu; + } else { + tensor_backends[name] = backend_cpu; + } + } + // output tensors + if (name == "model/ln_f/g" || name == "model/ln_f/b" || name == "model/lm_head") { + if (params.n_gpu_layers > 
0) { + tensor_backends[name] = backend_gpu; + } else { + tensor_backends[name] = backend_cpu; + } + } + // layer tensors + if (name.substr(0, 7) == "model/h") { + // parse layer number + int layer = std::stoi(name.substr(7, 2)); + if (layer >= i_gpu_first_layer) { + tensor_backends[name] = backend_gpu; + } else { + tensor_backends[name] = backend_cpu; + } + } + } + } + + // allocate buffers + std::map> backend_buffers; + for (auto backend : model.backends) { + // compute the size of the buffer + size_t size = 0; + for (auto it : model.tensors) { + if (tensor_backends[it.first] == backend) { + size += ggml_nbytes(it.second) + 512; + } + } + if (size > 0) { + printf("%s: %8s buffer size = %8.2f MB\n", __func__, ggml_backend_name(backend), size/1024.0/1024.0); + // allocate the buffer + ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size); + ggml_backend_buffer_set_usage(buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS); + model.buffers_w.push_back(buffer); + + // create an allocator for the buffer to allocate the tensors + auto alloc = std::unique_ptr(ggml_tallocr_new(buffer), ggml_tallocr_free); + backend_buffers.insert(std::make_pair(backend, std::move(alloc))); + } else { + model.buffers_w.push_back(NULL); + } + } + + // allocate key + value memory + { + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + + const int n_mem = n_layer*n_ctx; + const int n_elements = n_embd*n_mem; + + model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); + model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); + + ggml_set_name(model.memory_k, "model/memory_k"); + ggml_set_name(model.memory_v, "model/memory_v"); + + const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); + + printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); + + // create a backend buffer (can be in host or device memory) + ggml_backend_t backend_kv = params.n_gpu_layers >= hparams.n_layer/2 ? 
backend_gpu : backend_cpu; + printf("%s: backend_kv = %s\n", __func__, ggml_backend_name(backend_kv)); + model.buffer_kv = ggml_backend_alloc_buffer(backend_kv, memory_size + 512*2); + + // allocate the tensors into the backend buffer + { + ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_kv); + + // this updates the pointers in the tensors to point to the correct location in the buffer + // this is necessary since the ggml_context is .no_alloc == true + // note that the buffer can actually be a device buffer, depending on the backend + ggml_tallocr_alloc(alloc, model.memory_k); + ggml_tallocr_alloc(alloc, model.memory_v); + + ggml_tallocr_free(alloc); + } + } + + // load weights + { + size_t total_size = 0; + + bool has_lm_head = false; + + std::vector read_buf; + + while (true) { + int32_t n_dims; + int32_t length; + int32_t ttype; + + fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); + fin.read(reinterpret_cast(&length), sizeof(length)); + fin.read(reinterpret_cast(&ttype), sizeof(ttype)); + + if (fin.eof()) { + break; + } + + int32_t nelements = 1; + int32_t ne[2] = { 1, 1 }; + for (int i = 0; i < n_dims; ++i) { + fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); + nelements *= ne[i]; + } + + std::string name(length, 0); + fin.read(&name[0], length); + + if (model.tensors.find(name) == model.tensors.end()) { + fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); + return false; + } + + auto tensor = model.tensors[name]; + ggml_set_name(tensor, name.c_str()); + if (ggml_nelements(tensor) != nelements) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str()); + return false; + } + + if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { + fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", + __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); + return false; + } + + // for debugging + if (0) { + printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); + } + + const size_t bpe = ggml_type_size(ggml_type(ttype)); + + if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { + fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", + __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); + return false; + } + + // allocate the tensor + ggml_backend_t backend = tensor_backends[name]; + ggml_tallocr * alloc = backend_buffers.find(backend)->second.get(); + ggml_tallocr_alloc(alloc, tensor); + //printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str()); + + if (ggml_backend_is_cpu(backend) +#ifdef GGML_USE_METAL + || ggml_backend_is_metal(backend) +#endif + ) { + // for the CPU and Metal backend, we can read directly into the tensor + fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); + } else { + // read into a temporary buffer first, then copy to device memory + read_buf.resize(ggml_nbytes(tensor)); + fin.read(read_buf.data(), ggml_nbytes(tensor)); + ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor)); + } + + // GPT-2 models share the WTE tensor as the LM head + if (name == "model/wte" && has_lm_head == false) { + ggml_tallocr_alloc(backend_buffers.find(tensor_backends["model/lm_head"])->second.get(), model.lm_head); + //printf("%s: [%5.5s] %s (copied)\n", __func__, 
ggml_backend_name(tensor_backends["model/lm_head"]), "model/lm_head"); + ggml_backend_tensor_copy(tensor, model.lm_head); + total_size += ggml_nbytes(model.lm_head); + } + + if (name == "model/lm_head") { + has_lm_head = true; + } + + total_size += ggml_nbytes(tensor); + } + printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); + } + + fin.close(); + + // allocate input tensors + { + model.embd = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model.hparams.n_ctx); + model.position = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model.hparams.n_ctx); + + ggml_set_name(model.embd, "in/embd"); + ggml_set_name(model.position, "in/position"); + + // add input tensors to cpu backend + size_t input_size = ggml_nbytes(model.embd) + ggml_nbytes(model.position); + + // FIXME: use cpu backend after sched impl + ggml_backend_t backend_input = params.n_gpu_layers >= model.hparams.n_layer ? backend_gpu : backend_cpu; + model.buffer_input = ggml_backend_alloc_buffer(backend_input, input_size + 512*3); + printf("%s: backend_in = %s (%zu bytes)\n", __func__, ggml_backend_name(backend_input), input_size); + + // allocate the tensors into the backend buffer + ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_input); + ggml_tallocr_alloc(alloc, model.embd); + ggml_tallocr_alloc(alloc, model.position); + ggml_tallocr_free(alloc); + } + + return true; +} + +// build the computation graph +struct ggml_cgraph * gpt2_graph( + const gpt2_model & model, + const int n_past, + const std::vector & embd_inp) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_embd = hparams.n_embd; + const int n_layer = hparams.n_layer; + const int n_ctx = hparams.n_ctx; + const int n_head = hparams.n_head; + + // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data + static size_t buf_size = ggml_tensor_overhead()*GPT2_MAX_NODES + ggml_graph_overhead_custom(GPT2_MAX_NODES, false); + static std::vector buf(buf_size); + + struct ggml_init_params params = { + /*.mem_size =*/ buf_size, + /*.mem_buffer =*/ buf.data(), + /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph() + }; + + struct ggml_context * ctx0 = ggml_init(params); + + struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GPT2_MAX_NODES, false); + + struct ggml_tensor * embd = ggml_view_1d(ctx0, model.embd, N, 0); + + // set inputs + // TODO: move to gpt2_eval + ggml_backend_tensor_set(model.embd, embd_inp.data(), 0, N*ggml_element_size(embd)); + + struct ggml_tensor * position = ggml_view_1d(ctx0, model.position, N, 0); + for (int i = 0; i < N; ++i) { + int32_t v = n_past + i; + ggml_backend_tensor_set(model.position, &v, i*sizeof(int32_t), sizeof(v)); + } + + const float KQ_scale = 1.0f/sqrtf(float(model.hparams.n_embd)/model.hparams.n_head); + + // wte + wpe + struct ggml_tensor * inpL = + ggml_add(ctx0, + ggml_get_rows(ctx0, model.wte, embd), + ggml_get_rows(ctx0, model.wpe, position)); + ggml_set_name(inpL, "inpL"); + ggml_set_name(inpL->src[0], "wte"); + ggml_set_name(inpL->src[1], "wpe"); + + for (int il = 0; il < n_layer; ++il) { + struct ggml_tensor * cur; + + // norm + { + // [ 768, N] + cur = ggml_norm(ctx0, inpL, hparams.eps); + ggml_format_name(cur, "l%d.norm", il); + + // cur = ln_1_g*cur + ln_1_b + // [ 768, N] + cur = ggml_add(ctx0, + ggml_mul(ctx0, + cur, + model.layers[il].ln_1_g), + model.layers[il].ln_1_b); + ggml_format_name(cur, "l%d.ln_1_b", il); + ggml_format_name(cur->src[0], 
"l%d.ln_1_g", il); + } + + // attn + // [2304, 768] - model.layers[il].c_attn_attn_w + // [2304, 1] - model.layers[il].c_attn_attn_b + // [ 768, N] - cur (in) + // [2304, N] - cur (out) + // + // cur = attn_w*cur + attn_b + // [2304, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_attn_w, + cur); + ggml_format_name(cur, "l%d.attn_w", il); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_attn_attn_b); + ggml_format_name(cur, "l%d.attn_b", il); + } + + // self-attention + { + struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); + struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); + struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); + + ggml_format_name(Qcur, "l%d.Qcur", il); + ggml_format_name(Kcur, "l%d.Kcur", il); + ggml_format_name(Vcur, "l%d.Vcur", il); + + // store key and value to memory + if (N >= 1) { + struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); + struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); + + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); + ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); + } + + // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) + // [64, N, 12] + struct ggml_tensor * Q = + ggml_permute(ctx0, + ggml_cpy(ctx0, + Qcur, + ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), + 0, 2, 1, 3); + ggml_format_name(Q, "l%d.Q", il); + + // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) + // [64, n_past + N, 12] + struct ggml_tensor * K = + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), + n_embd/n_head, n_head, n_past + N), + 0, 2, 1, 3); + ggml_format_name(K, "l%d.K", il); + + // GG: flash attention + //struct ggml_tensor * V = + // ggml_cpy(ctx0, + // ggml_permute(ctx0, + // ggml_reshape_3d(ctx0, + // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + // n_embd/n_head, n_head, n_past + N), + // 1, 2, 0, 3), + // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); + + //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); + + // K * Q + // [n_past + N, N, 12] + struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); + ggml_format_name(KQ, "l%d.KQ", il); + + // KQ_scaled = KQ / sqrt(n_embd/n_head) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); + ggml_format_name(KQ_scaled, "l%d.KQ_scaled", il); + + // KQ_masked = mask_past(KQ_scaled) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); + ggml_format_name(KQ_masked, "l%d.KQ_masked", il); + + // KQ = soft_max(KQ_masked) + // [n_past + N, N, 12] + struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); + ggml_format_name(KQ_soft_max, "l%d.KQ_soft_max", il); + + // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() + // [n_past + N, 64, 12] + struct ggml_tensor * V_trans = + ggml_cpy(ctx0, + ggml_permute(ctx0, + ggml_reshape_3d(ctx0, + ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), + n_embd/n_head, n_head, n_past + N), + 
1, 2, 0, 3), + ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head)); + ggml_format_name(V_trans, "l%d.V_trans", il); + + // KQV = transpose(V) * KQ_soft_max + // [64, N, 12] + struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); + ggml_format_name(KQV, "l%d.KQV", il); + + // KQV_merged = KQV.permute(0, 2, 1, 3) + // [64, 12, N] + struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); + ggml_format_name(KQV_merged, "l%d.KQV_merged", il); + + // cur = KQV_merged.contiguous().view(n_embd, N) + // [768, N] + cur = ggml_cpy(ctx0, + KQV_merged, + ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); + ggml_format_name(cur, "l%d.KQV_merged_contiguous", il); + } + + // projection + // [ 768, 768] - model.layers[il].c_attn_proj_w + // [ 768, 1] - model.layers[il].c_attn_proj_b + // [ 768, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + { + cur = ggml_mul_mat(ctx0, + model.layers[il].c_attn_proj_w, + cur); + ggml_format_name(cur, "l%d.attn_proj_w", il); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_attn_proj_b); + ggml_format_name(cur, "l%d.attn_proj_b", il); + } + + // add the input + cur = ggml_add(ctx0, cur, inpL); + ggml_format_name(cur, "l%d.add", il); + + struct ggml_tensor * inpFF = cur; + + // feed-forward network + { + // norm + { + cur = ggml_norm(ctx0, inpFF, hparams.eps); + ggml_format_name(cur, "l%d.FFnorm", il); + + // cur = ln_2_g*cur + ln_2_b + // [ 768, N] + cur = ggml_add(ctx0, + ggml_mul(ctx0, + cur, + model.layers[il].ln_2_g), + model.layers[il].ln_2_b); + ggml_format_name(cur, "l%d.ln_2_b", il); + ggml_format_name(cur->src[0], "l%d.ln_2_g", il); + } + + // fully connected + // [3072, 768] - model.layers[il].c_mlp_fc_w + // [3072, 1] - model.layers[il].c_mlp_fc_b + // [ 768, N] - cur (in) + // [3072, N] - cur (out) + // + // cur = fc_w*cur + fc_b + // [3072, N] + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_fc_w, + cur); + ggml_format_name(cur, "l%d.mlp_fc_w", il); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_fc_b); + ggml_format_name(cur, "l%d.mlp_fc_b", il); + + // GELU activation + // [3072, N] + cur = ggml_gelu(ctx0, cur); + ggml_format_name(cur, "l%d.gelu", il); + + // projection + // [ 768, 3072] - model.layers[il].c_mlp_proj_w + // [ 768, 1] - model.layers[il].c_mlp_proj_b + // [3072, N] - cur (in) + // [ 768, N] - cur (out) + // + // cur = proj_w*cur + proj_b + // [768, N] + cur = ggml_mul_mat(ctx0, + model.layers[il].c_mlp_proj_w, + cur); + ggml_format_name(cur, "l%d.mlp_proj_w", il); + + cur = ggml_add(ctx0, + cur, + model.layers[il].c_mlp_proj_b); + ggml_format_name(cur, "l%d.mlp_proj_b", il); + } + + // input for next layer + inpL = ggml_add(ctx0, cur, inpFF); + ggml_format_name(inpL, "l%d.add2", il); + } + + // norm + { + // [ 768, N] + inpL = ggml_norm(ctx0, inpL, hparams.eps); + ggml_format_name(inpL, "out_norm"); + + // inpL = ln_f_g*inpL + ln_f_b + // [ 768, N] + inpL = ggml_add(ctx0, + ggml_mul(ctx0, + inpL, + model.ln_f_g), + model.ln_f_b); + ggml_format_name(inpL, "out_ln_f_b"); + ggml_format_name(inpL->src[0], "out_ln_f_g"); + } + + // inpL = WTE * inpL + // [ 768, 50257] - model.lm_head + // [ 768, N] - inpL + inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); + ggml_format_name(inpL, "out_lm_head"); + + // logits -> probs + //inpL = ggml_soft_max(ctx0, inpL); + + ggml_build_forward_expand(gf, inpL); + + ggml_free(ctx0); + + return gf; +} + +// evaluate the transformer +// +// - model: the model +// - sched: the backend scheduler +// 
- n_past: the context size so far +// - embd_inp: the embeddings of the tokens in the context +// - embd_w: the predicted logits for the next token +// +bool gpt2_eval( + const gpt2_model & model, + ggml_backend_sched_t sched, + const int n_past, + const std::vector & embd_inp, + std::vector & embd_w) { + const int N = embd_inp.size(); + + const auto & hparams = model.hparams; + + const int n_vocab = hparams.n_vocab; + + struct ggml_cgraph * gf = gpt2_graph(model, n_past, embd_inp); + + // run the computation + ggml_backend_sched_graph_compute(sched, gf); + + //if (n_past%100 == 0) { + // ggml_graph_print (&gf); + // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); + //} + + // in this case, the output tensor is the last one in the graph + struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; + + //embd_w.resize(n_vocab*N); + //ggml_backend_tensor_get(inpL, embd_w.data(), 0, sizeof(float)*n_vocab*N); + + // return result just for the last token + embd_w.resize(n_vocab); + ggml_backend_tensor_get(inpL, embd_w.data(), (n_vocab*(N-1))*sizeof(float), sizeof(float)*n_vocab); + + return true; +} + +int main(int argc, char ** argv) { + ggml_time_init(); + + const int64_t t_main_start_us = ggml_time_us(); + + gpt_params params; + params.model = "models/gpt-2-117M/ggml-model.bin"; + + if (gpt_params_parse(argc, argv, params) == false) { + return 1; + } + + if (params.seed < 0) { + params.seed = time(NULL); + } + + printf("%s: seed = %d\n", __func__, params.seed); + + std::mt19937 rng(params.seed); + if (params.prompt.empty()) { + params.prompt = gpt_random_prompt(rng); + } + + int64_t t_load_us = 0; + + gpt_vocab vocab; + gpt2_model model; + + // load the model + { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_model_load(params.model, model, vocab, params)) { + fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); + return 1; + } + + t_load_us = ggml_time_us() - t_start_us; + + test_gpt_tokenizer(vocab, params.token_test); + } + + // create the backend scheduler + // the scheduler handles the allocation of the compute buffers and the scheduling of the computation between the different backends + ggml_backend_sched_t sched; + { + // initialize the scheduler + sched = ggml_backend_sched_new(model.backends.data(), NULL, model.backends.size(), GPT2_MAX_NODES); + + // create the worst case graph for memory usage estimation + int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); + int n_past = model.hparams.n_ctx - n_tokens; + struct ggml_cgraph * gf = gpt2_graph(model, n_past, std::vector(n_tokens, 0)); + + ggml_backend_sched_reserve(sched, gf); + + + // compute the required memory + size_t mem_size = 0; + for (size_t i = 0; i < model.backends.size(); i++) { + size_t size = ggml_backend_sched_get_buffer_size(sched, model.backends[i]); + if (size > 0) { + mem_size += size; + printf("%s: %8s compute buffer size = %8.2f MB\n", __func__, ggml_backend_name(model.backends[i]), size/1024.0/1024.0); + //printf("%s: %8s compute buffer size = %zu bytes\n", __func__, ggml_backend_name(model.backends[i]), size); + } + } + + printf("%s: total compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); + } + + int n_past = 0; + + int64_t t_sample_us = 0; + int64_t t_predict_us = 0; + + std::vector logits; + + // tokenize the prompt + std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); + + params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); + + printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); + 
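The scheduler code above follows a reserve-then-compute pattern: ggml_backend_sched_new ties the backends together, a worst-case graph lets ggml_backend_sched_reserve size the compute buffers once, and each later evaluation only builds its graph and calls ggml_backend_sched_graph_compute. A minimal, self-contained sketch of that lifecycle, assuming a single CPU backend and a trivial placeholder graph rather than the example's actual GPT-2 graph:

    #include "ggml/ggml.h"
    #include "ggml/ggml-backend.h"

    #include <cstdint>
    #include <vector>

    // placeholder graph builder: a single ggml_add over n elements, metadata only (no_alloc context)
    static struct ggml_cgraph * build_graph(struct ggml_context * ctx, int n) {
        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n);
        ggml_set_input(a);
        ggml_set_input(b);
        struct ggml_tensor * c = ggml_add(ctx, a, b);
        ggml_set_output(c);
        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, c);
        return gf;
    }

    int main() {
        ggml_backend_t backend = ggml_backend_cpu_init();

        // one scheduler for all backends involved (here just the CPU backend)
        ggml_backend_sched_t sched = ggml_backend_sched_new(&backend, NULL, 1, GGML_DEFAULT_GRAPH_SIZE);

        // the context only holds tensor/graph metadata; the scheduler allocates the actual data
        std::vector<uint8_t> meta(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead());
        struct ggml_init_params params = {
            /*.mem_size   =*/ meta.size(),
            /*.mem_buffer =*/ meta.data(),
            /*.no_alloc   =*/ true,
        };

        // reserve the compute buffers once, using a worst-case graph
        struct ggml_context * ctx = ggml_init(params);
        ggml_backend_sched_reserve(sched, build_graph(ctx, 1024));
        ggml_free(ctx);

        // per evaluation: build the actual (smaller) graph and let the scheduler
        // allocate it and split it across the backends before computing
        ctx = ggml_init(params);
        struct ggml_cgraph * gf = build_graph(ctx, 8);
        ggml_backend_sched_graph_compute(sched, gf); // input data handling omitted for brevity
        ggml_free(ctx);

        ggml_backend_sched_free(sched);
        ggml_backend_free(backend);
        return 0;
    }

This mirrors the gpt-2-sched flow above: reserve once against the worst case, then build and compute a fresh graph per batch.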
printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); + for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { + printf("%d ", embd_inp[i]); + } + printf("\n\n"); + + // submit the input prompt token-by-token + // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning + std::vector embd; + + for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { + // predict + if (embd.size() > 0) { + const int64_t t_start_us = ggml_time_us(); + + if (!gpt2_eval(model, sched, n_past, embd, logits)) { + printf("Failed to predict\n"); + return 1; + } + + t_predict_us += ggml_time_us() - t_start_us; + } + + n_past += embd.size(); + embd.clear(); + + if (i >= embd_inp.size()) { + // sample next token + const int top_k = params.top_k; + const float top_p = params.top_p; + const float temp = params.temp; + + const int n_vocab = model.hparams.n_vocab; + + gpt_vocab::id id = 0; + + { + const int64_t t_start_sample_us = ggml_time_us(); + + id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); + + t_sample_us += ggml_time_us() - t_start_sample_us; + } + + // add it to the context + embd.push_back(id); + } else { + // if here, it means we are still processing the input prompt + for (size_t k = i; k < embd_inp.size(); k++) { + embd.push_back(embd_inp[k]); + if (int32_t(embd.size()) >= params.n_batch) { + break; + } + } + i += embd.size() - 1; + } + + // display text + for (auto id : embd) { + printf("%s", vocab.id_to_token[id].c_str()); + } + fflush(stdout); + + // end of text token + if (embd.back() == 50256) { + break; + } + } + + // report timing + { + const int64_t t_main_end_us = ggml_time_us(); + + printf("\n\n"); + printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); + printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); + printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); + printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); + } + + ggml_free(model.ctx_w); + + ggml_backend_sched_free(sched); + ggml_backend_buffer_free(model.buffer_kv); + for (auto buf : model.buffers_w) { + ggml_backend_buffer_free(buf); + } + for (auto backend : model.backends) { + ggml_backend_free(backend); + } + + return 0; +} diff --git a/examples/gpt-2/main.cpp b/examples/gpt-2/main.cpp deleted file mode 100644 index 05ce370e..00000000 --- a/examples/gpt-2/main.cpp +++ /dev/null @@ -1,1080 +0,0 @@ -#include "ggml/ggml.h" -#include "ggml/ggml-alloc.h" -#include "ggml/ggml-backend.h" - -#ifdef GGML_USE_CUBLAS -#include "ggml-cuda.h" -#endif - -#ifdef GGML_USE_METAL -#include "ggml-metal.h" -#endif - -#include "common.h" -#include "common-ggml.h" - -#include -#include -#include -#include -#include -#include -#include -#include - -#if defined(_MSC_VER) -#pragma warning(disable: 4244 4267) // possible loss of data -#endif - -#define GPT2_MAX_NODES 4096 - -static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) { - (void) level; - (void) user_data; - fputs(text, stderr); - fflush(stderr); -} - -// default hparams (GPT-2 117M) -struct gpt2_hparams { - int32_t n_vocab = 50257; - int32_t n_ctx = 1024; - int32_t n_embd = 768; - int32_t n_head = 12; - int32_t n_layer = 12; - int32_t ftype = 1; - float eps = 1e-5f; -}; - -struct gpt2_layer { - // normalization - struct ggml_tensor * ln_1_g; - struct ggml_tensor * 
ln_1_b; - - struct ggml_tensor * ln_2_g; - struct ggml_tensor * ln_2_b; - - // attention - struct ggml_tensor * c_attn_attn_w; - struct ggml_tensor * c_attn_attn_b; - - struct ggml_tensor * c_attn_proj_w; - struct ggml_tensor * c_attn_proj_b; - - // mlp - struct ggml_tensor * c_mlp_fc_w; - struct ggml_tensor * c_mlp_fc_b; - - struct ggml_tensor * c_mlp_proj_w; - struct ggml_tensor * c_mlp_proj_b; -}; - -struct gpt2_model { - gpt2_hparams hparams; - - // normalization - struct ggml_tensor * ln_f_g; - struct ggml_tensor * ln_f_b; - - struct ggml_tensor * wte; // position embedding - struct ggml_tensor * wpe; // token embedding - struct ggml_tensor * lm_head; // language model head - - std::vector layers; - - // key + value memory - struct ggml_tensor * memory_k; - struct ggml_tensor * memory_v; - - // - struct ggml_context * ctx; - - std::vector backends; - std::vector buffers_w; - ggml_backend_buffer_t buffer_kv; - ggml_backend_buffer_t buffer_input; - - std::map tensors; - - // inputs/constants - struct ggml_tensor * embd; - struct ggml_tensor * position; -}; - -void init_backends(gpt2_model & model, const gpt_params & params) { - ggml_backend_t gpu_backend = NULL; - - // initialize the backends -#ifdef GGML_USE_CUBLAS - if (params.n_gpu_layers > 0) { - fprintf(stderr, "%s: using CUDA backend\n", __func__); - gpu_backend = ggml_backend_cuda_init(0); - if (!gpu_backend) { - fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__); - } - } -#endif - -#ifdef GGML_USE_METAL - if (params.n_gpu_layers > 0) { - fprintf(stderr, "%s: using Metal backend\n", __func__); - ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr); - gpu_backend = ggml_backend_metal_init(); - if (!gpu_backend) { - fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__); - } else { - ggml_backend_metal_set_n_cb(gpu_backend, params.n_threads); - } - } -#endif - if (gpu_backend) { - model.backends.push_back(gpu_backend); - } - - // always add the CPU backend as a fallback - ggml_backend_t cpu_backend = ggml_backend_cpu_init(); - ggml_backend_cpu_set_n_threads(cpu_backend, params.n_threads); - model.backends.push_back(cpu_backend); -} - -// load the model's weights from a file -bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, const gpt_params & params) { - printf("%s: loading model from '%s'\n", __func__, fname.c_str()); - - auto fin = std::ifstream(fname, std::ios::binary); - if (!fin) { - fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str()); - return false; - } - - // verify magic - { - uint32_t magic; - fin.read((char *) &magic, sizeof(magic)); - if (magic != GGML_FILE_MAGIC) { - fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str()); - return false; - } - } - - // load hparams - { - auto & hparams = model.hparams; - - fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab)); - fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx)); - fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd)); - fin.read((char *) &hparams.n_head, sizeof(hparams.n_head)); - fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer)); - fin.read((char *) &hparams.ftype, sizeof(hparams.ftype)); - - const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR; - - printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab); - printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx); - printf("%s: n_embd = %d\n", __func__, hparams.n_embd); - printf("%s: n_head = %d\n", __func__, hparams.n_head); - printf("%s: n_layer = 
%d\n", __func__, hparams.n_layer); - printf("%s: ftype = %d\n", __func__, hparams.ftype); - printf("%s: qntvr = %d\n", __func__, qntvr); - - hparams.ftype %= GGML_QNT_VERSION_FACTOR; - } - - // load vocab - { - int32_t n_vocab = 0; - fin.read((char *) &n_vocab, sizeof(n_vocab)); - - if (n_vocab != model.hparams.n_vocab) { - fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n", - __func__, fname.c_str(), n_vocab, model.hparams.n_vocab); - return false; - } - - std::string word; - std::vector buf(128); - - for (int i = 0; i < n_vocab; i++) { - uint32_t len; - fin.read((char *) &len, sizeof(len)); - - buf.resize(len); - fin.read((char *) buf.data(), len); - word.assign(buf.data(), len); - - vocab.token_to_id[word] = i; - vocab.id_to_token[i] = word; - } - } - - // for the big tensors, we have the option to store the data in 16-bit floats or quantized - // in order to save memory and also to speed up the computation - ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype)); - if (wtype == GGML_TYPE_COUNT) { - fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n", - __func__, fname.c_str(), model.hparams.ftype); - return false; - } - - auto & ctx = model.ctx; - - // create the ggml context - { - size_t n_tensors = 3 /* input */ + 2 /* kv */ + 6 + 12*model.hparams.n_layer; - struct ggml_init_params params = { - /*.mem_size =*/ ggml_tensor_overhead() * n_tensors, - /*.mem_buffer =*/ NULL, - /*.no_alloc =*/ true, - }; - - model.ctx = ggml_init(params); - if (!model.ctx) { - fprintf(stderr, "%s: ggml_init() failed\n", __func__); - return false; - } - } - - // create tensors for the weights - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_vocab = hparams.n_vocab; - - model.layers.resize(n_layer); - - model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx); - model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab); - - // map by name - model.tensors["model/ln_f/g"] = model.ln_f_g; - model.tensors["model/ln_f/b"] = model.ln_f_b; - - model.tensors["model/wte"] = model.wte; - model.tensors["model/wpe"] = model.wpe; - model.tensors["model/lm_head"] = model.lm_head; - - for (int i = 0; i < n_layer; ++i) { - auto & layer = model.layers[i]; - - layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd); - layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd); - - layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd); - layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd); - layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd); - - layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd); - layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd); - - // map by name - model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g; - model.tensors["model/h" + std::to_string(i) + 
"/ln_1/b"] = layer.ln_1_b; - - model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g; - model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b; - - model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w; - model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b; - - model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w; - model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b; - - model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w; - model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b; - - model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w; - model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b; - } - } - - // assign tensors to backends - init_backends(model, params); - ggml_backend_t backend_gpu = model.backends.front(); - ggml_backend_t backend_cpu = model.backends.back(); - std::map tensor_backends; - { - const int i_gpu_first_layer = model.hparams.n_layer - params.n_gpu_layers; - for (auto it : model.tensors) { - const std::string & name = it.first; - // input tensors - if (name == "model/wte" || name == "model/wpe") { - if (params.n_gpu_layers > model.hparams.n_layer) { - tensor_backends[name] = backend_gpu; - } else { - tensor_backends[name] = backend_cpu; - } - } - // output tensors - if (name == "model/ln_f/g" || name == "model/ln_f/b" || name == "model/lm_head") { - if (params.n_gpu_layers > 0) { - tensor_backends[name] = backend_gpu; - } else { - tensor_backends[name] = backend_cpu; - } - } - // layer tensors - if (name.substr(0, 7) == "model/h") { - // parse layer number - int layer = std::stoi(name.substr(7, 2)); - if (layer >= i_gpu_first_layer) { - tensor_backends[name] = backend_gpu; - } else { - tensor_backends[name] = backend_cpu; - } - } - } - } - - // allocate buffers - std::map> backend_buffers; - for (auto backend : model.backends) { - // compute the size of the buffer - size_t size = 0; - for (auto it : model.tensors) { - if (tensor_backends[it.first] == backend) { - size += ggml_nbytes(it.second) + 512; - } - } - if (size > 0) { - printf("%s: %8s buffer size = %8.2f MB\n", __func__, ggml_backend_name(backend), size/1024.0/1024.0); - // allocate the buffer - ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size); - model.buffers_w.push_back(buffer); - - // create an allocator for the buffer to allocate the tensors - auto alloc = std::unique_ptr(ggml_allocr_new_from_buffer(buffer), ggml_allocr_free); - backend_buffers.insert(std::make_pair(backend, std::move(alloc))); - } else { - model.buffers_w.push_back(NULL); - } - } - - // allocate key + value memory - { - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - - const int n_mem = n_layer*n_ctx; - const int n_elements = n_embd*n_mem; - - model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); - model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements); - - ggml_set_name(model.memory_k, "model/memory_k"); - ggml_set_name(model.memory_v, "model/memory_v"); - - const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v); - - printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem); - - // create a backend buffer (can be in host or device memory) 
- ggml_backend_t backend_kv = params.n_gpu_layers >= hparams.n_layer/2 ? backend_gpu : backend_cpu; - printf("%s: backend_kv = %s\n", __func__, ggml_backend_name(backend_kv)); - model.buffer_kv = ggml_backend_alloc_buffer(backend_kv, memory_size + 512*2); - - // allocate the tensors into the backend buffer - { - ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_kv); - - // this updates the pointers in the tensors to point to the correct location in the buffer - // this is necessary since the ggml_context is .no_alloc == true - // note that the buffer can actually be a device buffer, depending on the backend - ggml_allocr_alloc(alloc, model.memory_k); - ggml_allocr_alloc(alloc, model.memory_v); - - ggml_allocr_free(alloc); - } - } - - // load weights - { - size_t total_size = 0; - - bool has_lm_head = false; - - std::vector read_buf; - - while (true) { - int32_t n_dims; - int32_t length; - int32_t ttype; - - fin.read(reinterpret_cast(&n_dims), sizeof(n_dims)); - fin.read(reinterpret_cast(&length), sizeof(length)); - fin.read(reinterpret_cast(&ttype), sizeof(ttype)); - - if (fin.eof()) { - break; - } - - int32_t nelements = 1; - int32_t ne[2] = { 1, 1 }; - for (int i = 0; i < n_dims; ++i) { - fin.read(reinterpret_cast(&ne[i]), sizeof(ne[i])); - nelements *= ne[i]; - } - - std::string name(length, 0); - fin.read(&name[0], length); - - if (model.tensors.find(name) == model.tensors.end()) { - fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str()); - return false; - } - - auto tensor = model.tensors[name]; - ggml_set_name(tensor, name.c_str()); - if (ggml_nelements(tensor) != nelements) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str()); - return false; - } - - if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) { - fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n", - __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]); - return false; - } - - // for debugging - if (0) { - printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor)); - } - - const size_t bpe = ggml_type_size(ggml_type(ttype)); - - if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) { - fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n", - __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe); - return false; - } - - // allocate the tensor - ggml_backend_t backend = tensor_backends[name]; - ggml_allocr * alloc = backend_buffers.find(backend)->second.get(); - ggml_allocr_alloc(alloc, tensor); - //printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str()); - - if (ggml_backend_is_cpu(backend) -#ifdef GGML_USE_METAL - || ggml_backend_is_metal(backend) -#endif - ) { - // for the CPU and Metal backend, we can read directly into the tensor - fin.read(reinterpret_cast(tensor->data), ggml_nbytes(tensor)); - } else { - // read into a temporary buffer first, then copy to device memory - read_buf.resize(ggml_nbytes(tensor)); - fin.read(read_buf.data(), ggml_nbytes(tensor)); - ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor)); - } - - // GPT-2 models share the WTE tensor as the LM head - if (name == "model/wte" && has_lm_head == false) { - ggml_allocr_alloc(backend_buffers.find(tensor_backends["model/lm_head"])->second.get(), model.lm_head); - //printf("%s: 
[%5.5s] %s (copied)\n", __func__, ggml_backend_name(tensor_backends["model/lm_head"]), "model/lm_head"); - ggml_backend_tensor_copy(tensor, model.lm_head); - total_size += ggml_nbytes(model.lm_head); - } - - if (name == "model/lm_head") { - has_lm_head = true; - } - - total_size += ggml_nbytes(tensor); - } - printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0); - } - - fin.close(); - - // allocate input tensors - { - model.embd = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model.hparams.n_ctx); - model.position = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model.hparams.n_ctx); - - ggml_set_name(model.embd, "in/embd"); - ggml_set_name(model.position, "in/position"); - - // add input tensors to cpu backend - size_t input_size = ggml_nbytes(model.embd) + ggml_nbytes(model.position); - - // FIXME: use cpu backend after sched impl - ggml_backend_t backend_input = params.n_gpu_layers >= model.hparams.n_layer ? backend_gpu : backend_cpu; - model.buffer_input = ggml_backend_alloc_buffer(backend_input, input_size + 512*3); - printf("%s: backend_in = %s (%zu bytes)\n", __func__, ggml_backend_name(backend_input), input_size); - - // allocate the tensors into the backend buffer - ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_input); - ggml_allocr_alloc(alloc, model.embd); - ggml_allocr_alloc(alloc, model.position); - ggml_allocr_free(alloc); - } - - return true; -} - -// build the computation graph -struct ggml_cgraph * gpt2_graph( - const gpt2_model & model, - const int n_past, - const std::vector & embd_inp) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_embd = hparams.n_embd; - const int n_layer = hparams.n_layer; - const int n_ctx = hparams.n_ctx; - const int n_head = hparams.n_head; - - // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data - static size_t buf_size = ggml_tensor_overhead()*GPT2_MAX_NODES + ggml_graph_overhead_custom(GPT2_MAX_NODES, false); - static std::vector buf(buf_size); - - struct ggml_init_params params = { - /*.mem_size =*/ buf_size, - /*.mem_buffer =*/ buf.data(), - /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph() - }; - - struct ggml_context * ctx0 = ggml_init(params); - - struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GPT2_MAX_NODES, false); - - struct ggml_tensor * embd = ggml_view_1d(ctx0, model.embd, N, 0); - - // TODO: avoid writing to tensors if we are only measuring the memory usage - // not critical, just a minor optimization - - //if (!ggml_allocr_is_measure(allocr)) { - //ggml_backend_tensor_set(embd, embd_inp.data(), 0, N*ggml_element_size(embd)); - ggml_backend_tensor_set(model.embd, embd_inp.data(), 0, N*ggml_element_size(embd)); // FIXME: cannot use the view here because it's not initialized yet (buffer not set), but we should - //} - //memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd)); - - struct ggml_tensor * position = ggml_view_1d(ctx0, model.position, N, 0); - //if (!ggml_allocr_is_measure(allocr)) { - for (int i = 0; i < N; ++i) { - int32_t v = n_past + i; - ggml_backend_tensor_set(model.position, &v, i*sizeof(int32_t), sizeof(v)); // FIXME: same - //((int32_t *) position->data)[i] = n_past + i; - } - //} - - const float KQ_scale = 1.0f/sqrtf(float(model.hparams.n_embd)/model.hparams.n_head); - - // wte + wpe - struct ggml_tensor * inpL = - ggml_add(ctx0, - ggml_get_rows(ctx0, model.wte, embd), - ggml_get_rows(ctx0, model.wpe, 
position)); - ggml_set_name(inpL, "inpL"); - ggml_set_name(inpL->src[0], "wte"); - ggml_set_name(inpL->src[1], "wpe"); - - for (int il = 0; il < n_layer; ++il) { - struct ggml_tensor * cur; - - // norm - { - // [ 768, N] - cur = ggml_norm(ctx0, inpL, hparams.eps); - ggml_format_name(cur, "l%d.norm", il); - - // cur = ln_1_g*cur + ln_1_b - // [ 768, N] - cur = ggml_add(ctx0, - ggml_mul(ctx0, - cur, - model.layers[il].ln_1_g), - model.layers[il].ln_1_b); - ggml_format_name(cur, "l%d.ln_1_b", il); - ggml_format_name(cur->src[0], "l%d.ln_1_g", il); - } - - // attn - // [2304, 768] - model.layers[il].c_attn_attn_w - // [2304, 1] - model.layers[il].c_attn_attn_b - // [ 768, N] - cur (in) - // [2304, N] - cur (out) - // - // cur = attn_w*cur + attn_b - // [2304, N] - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_attn_attn_w, - cur); - ggml_format_name(cur, "l%d.attn_w", il); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_attn_attn_b); - ggml_format_name(cur, "l%d.attn_b", il); - } - - // self-attention - { - struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd); - struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd); - struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd); - - ggml_format_name(Qcur, "l%d.Qcur", il); - ggml_format_name(Kcur, "l%d.Kcur", il); - ggml_format_name(Vcur, "l%d.Vcur", il); - - // store key and value to memory - if (N >= 1) { - struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past)); - struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past)); - - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k)); - ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v)); - } - - // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3) - // [64, N, 12] - struct ggml_tensor * Q = - ggml_permute(ctx0, - ggml_cpy(ctx0, - Qcur, - ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)), - 0, 2, 1, 3); - ggml_format_name(Q, "l%d.Q", il); - - // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3) - // [64, n_past + N, 12] - struct ggml_tensor * K = - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd), - n_embd/n_head, n_head, n_past + N), - 0, 2, 1, 3); - ggml_format_name(K, "l%d.K", il); - - // GG: flash attention - //struct ggml_tensor * V = - // ggml_cpy(ctx0, - // ggml_permute(ctx0, - // ggml_reshape_3d(ctx0, - // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), - // n_embd/n_head, n_head, n_past + N), - // 1, 2, 0, 3), - // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head)); - - //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true); - - // K * Q - // [n_past + N, N, 12] - struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q); - ggml_format_name(KQ, "l%d.KQ", il); - - // KQ_scaled = KQ / sqrt(n_embd/n_head) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale); - ggml_format_name(KQ_scaled, "l%d.KQ_scaled", il); - - // KQ_masked = mask_past(KQ_scaled) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past); - ggml_format_name(KQ_masked, "l%d.KQ_masked", il); - - // KQ = 
soft_max(KQ_masked) - // [n_past + N, N, 12] - struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked); - ggml_format_name(KQ_soft_max, "l%d.KQ_soft_max", il); - - // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous() - // [n_past + N, 64, 12] - struct ggml_tensor * V_trans = - ggml_cpy(ctx0, - ggml_permute(ctx0, - ggml_reshape_3d(ctx0, - ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd), - n_embd/n_head, n_head, n_past + N), - 1, 2, 0, 3), - ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head)); - ggml_format_name(V_trans, "l%d.V_trans", il); - - // KQV = transpose(V) * KQ_soft_max - // [64, N, 12] - struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max); - ggml_format_name(KQV, "l%d.KQV", il); - - // KQV_merged = KQV.permute(0, 2, 1, 3) - // [64, 12, N] - struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3); - ggml_format_name(KQV_merged, "l%d.KQV_merged", il); - - // cur = KQV_merged.contiguous().view(n_embd, N) - // [768, N] - cur = ggml_cpy(ctx0, - KQV_merged, - ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N)); - ggml_format_name(cur, "l%d.KQV_merged_contiguous", il); - } - - // projection - // [ 768, 768] - model.layers[il].c_attn_proj_w - // [ 768, 1] - model.layers[il].c_attn_proj_b - // [ 768, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - { - cur = ggml_mul_mat(ctx0, - model.layers[il].c_attn_proj_w, - cur); - ggml_format_name(cur, "l%d.attn_proj_w", il); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_attn_proj_b); - ggml_format_name(cur, "l%d.attn_proj_b", il); - } - - // add the input - cur = ggml_add(ctx0, cur, inpL); - ggml_format_name(cur, "l%d.add", il); - - struct ggml_tensor * inpFF = cur; - - // feed-forward network - { - // norm - { - cur = ggml_norm(ctx0, inpFF, hparams.eps); - ggml_format_name(cur, "l%d.FFnorm", il); - - // cur = ln_2_g*cur + ln_2_b - // [ 768, N] - cur = ggml_add(ctx0, - ggml_mul(ctx0, - cur, - model.layers[il].ln_2_g), - model.layers[il].ln_2_b); - ggml_format_name(cur, "l%d.ln_2_b", il); - ggml_format_name(cur->src[0], "l%d.ln_2_g", il); - } - - // fully connected - // [3072, 768] - model.layers[il].c_mlp_fc_w - // [3072, 1] - model.layers[il].c_mlp_fc_b - // [ 768, N] - cur (in) - // [3072, N] - cur (out) - // - // cur = fc_w*cur + fc_b - // [3072, N] - cur = ggml_mul_mat(ctx0, - model.layers[il].c_mlp_fc_w, - cur); - ggml_format_name(cur, "l%d.mlp_fc_w", il); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_fc_b); - ggml_format_name(cur, "l%d.mlp_fc_b", il); - - // GELU activation - // [3072, N] - cur = ggml_gelu(ctx0, cur); - ggml_format_name(cur, "l%d.gelu", il); - - // projection - // [ 768, 3072] - model.layers[il].c_mlp_proj_w - // [ 768, 1] - model.layers[il].c_mlp_proj_b - // [3072, N] - cur (in) - // [ 768, N] - cur (out) - // - // cur = proj_w*cur + proj_b - // [768, N] - cur = ggml_mul_mat(ctx0, - model.layers[il].c_mlp_proj_w, - cur); - ggml_format_name(cur, "l%d.mlp_proj_w", il); - - cur = ggml_add(ctx0, - cur, - model.layers[il].c_mlp_proj_b); - ggml_format_name(cur, "l%d.mlp_proj_b", il); - } - - // input for next layer - inpL = ggml_add(ctx0, cur, inpFF); - ggml_format_name(inpL, "l%d.add2", il); - } - - // norm - { - // [ 768, N] - inpL = ggml_norm(ctx0, inpL, hparams.eps); - ggml_format_name(inpL, "out_norm"); - - // inpL = ln_f_g*inpL + ln_f_b - // [ 768, N] - inpL = ggml_add(ctx0, - ggml_mul(ctx0, - inpL, 
- model.ln_f_g), - model.ln_f_b); - ggml_format_name(inpL, "out_ln_f_b"); - ggml_format_name(inpL->src[0], "out_ln_f_g"); - } - - // inpL = WTE * inpL - // [ 768, 50257] - model.lm_head - // [ 768, N] - inpL - inpL = ggml_mul_mat(ctx0, model.lm_head, inpL); - ggml_format_name(inpL, "out_lm_head"); - - // logits -> probs - //inpL = ggml_soft_max(ctx0, inpL); - - ggml_build_forward_expand(gf, inpL); - - ggml_free(ctx0); - - return gf; -} - -// evaluate the transformer -// -// - model: the model -// - allocr: ggml_allocr to use to allocate the compute buffer -// - n_threads: number of threads to use -// - n_past: the context size so far -// - embd_inp: the embeddings of the tokens in the context -// - embd_w: the predicted logits for the next token -// -bool gpt2_eval( - const gpt2_model & model, - ggml_backend_sched_t sched, - const int n_past, - const std::vector & embd_inp, - std::vector & embd_w) { - const int N = embd_inp.size(); - - const auto & hparams = model.hparams; - - const int n_vocab = hparams.n_vocab; - - struct ggml_cgraph * gf = gpt2_graph(model, n_past, embd_inp); - - // run the computation - ggml_backend_sched_graph_compute(sched, gf); - - //if (n_past%100 == 0) { - // ggml_graph_print (&gf); - // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot"); - //} - - // in this case, the output tensor is the last one in the graph - struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1]; - - //embd_w.resize(n_vocab*N); - //ggml_backend_tensor_get(inpL, embd_w.data(), 0, sizeof(float)*n_vocab*N); - - // return result just for the last token - embd_w.resize(n_vocab); - ggml_backend_tensor_get(inpL, embd_w.data(), (n_vocab*(N-1))*sizeof(float), sizeof(float)*n_vocab); - - return true; -} - -int main(int argc, char ** argv) { - ggml_time_init(); - - const int64_t t_main_start_us = ggml_time_us(); - - gpt_params params; - params.model = "models/gpt-2-117M/ggml-model.bin"; - - if (gpt_params_parse(argc, argv, params) == false) { - return 1; - } - - if (params.seed < 0) { - params.seed = time(NULL); - } - - printf("%s: seed = %d\n", __func__, params.seed); - - std::mt19937 rng(params.seed); - if (params.prompt.empty()) { - params.prompt = gpt_random_prompt(rng); - } - - int64_t t_load_us = 0; - - gpt_vocab vocab; - gpt2_model model; - - // load the model - { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_model_load(params.model, model, vocab, params)) { - fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str()); - return 1; - } - - t_load_us = ggml_time_us() - t_start_us; - - test_gpt_tokenizer(vocab, params.token_test); - } - - // create the backend scheduler - // the scheduler handles the allocation of the compute buffers and the scheduling of the computation between the different backends - ggml_backend_sched_t sched; - { - // initialize the scheduler - sched = ggml_backend_sched_new(model.backends.data(), NULL, model.backends.size(), GPT2_MAX_NODES); - - // create the worst case graph for memory usage estimation - int n_tokens = std::min(model.hparams.n_ctx, params.n_batch); - int n_past = model.hparams.n_ctx - n_tokens; - struct ggml_cgraph * gf = gpt2_graph(model, n_past, std::vector(n_tokens, 0)); - - ggml_backend_sched_init_measure(sched, gf); - - - // compute the required memory - size_t mem_size = 0; - for (size_t i = 0; i < model.backends.size(); i++) { - ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(sched, model.backends[i]); - size_t size = ggml_backend_buffer_get_size(buf); - if (size > 0) { - mem_size += size; - printf("%s: %8s 
compute buffer size = %8.2f MB\n", __func__, ggml_backend_name(model.backends[i]), size/1024.0/1024.0); - //printf("%s: %8s compute buffer size = %zu bytes\n", __func__, ggml_backend_name(model.backends[i]), size); - } - } - - printf("%s: total compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0); - } - - int n_past = 0; - - int64_t t_sample_us = 0; - int64_t t_predict_us = 0; - - std::vector logits; - - // tokenize the prompt - std::vector embd_inp = ::gpt_tokenize(vocab, params.prompt); - - params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size()); - - printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str()); - printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size()); - for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) { - printf("%d ", embd_inp[i]); - } - printf("\n\n"); - - // submit the input prompt token-by-token - // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning - std::vector embd; - - for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) { - // predict - if (embd.size() > 0) { - const int64_t t_start_us = ggml_time_us(); - - if (!gpt2_eval(model, sched, n_past, embd, logits)) { - printf("Failed to predict\n"); - return 1; - } - - t_predict_us += ggml_time_us() - t_start_us; - } - - n_past += embd.size(); - embd.clear(); - - if (i >= embd_inp.size()) { - // sample next token - const int top_k = params.top_k; - const float top_p = params.top_p; - const float temp = params.temp; - - const int n_vocab = model.hparams.n_vocab; - - gpt_vocab::id id = 0; - - { - const int64_t t_start_sample_us = ggml_time_us(); - - id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng); - - t_sample_us += ggml_time_us() - t_start_sample_us; - } - - // add it to the context - embd.push_back(id); - } else { - // if here, it means we are still processing the input prompt - for (size_t k = i; k < embd_inp.size(); k++) { - embd.push_back(embd_inp[k]); - if (int32_t(embd.size()) >= params.n_batch) { - break; - } - } - i += embd.size() - 1; - } - - // display text - for (auto id : embd) { - printf("%s", vocab.id_to_token[id].c_str()); - } - fflush(stdout); - - // end of text token - if (embd.back() == 50256) { - break; - } - } - - // report timing - { - const int64_t t_main_end_us = ggml_time_us(); - - printf("\n\n"); - printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f); - printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f); - printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past); - printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f); - } - - ggml_free(model.ctx); - - ggml_backend_sched_free(sched); - ggml_backend_buffer_free(model.buffer_kv); - for (auto & buf : model.buffers_w) { - ggml_backend_buffer_free(buf); - } - for (auto backend : model.backends) { - ggml_backend_free(backend); - } - - return 0; -} diff --git a/examples/sam/main.cpp b/examples/sam/main.cpp index d8dedf85..13d27b54 100644 --- a/examples/sam/main.cpp +++ b/examples/sam/main.cpp @@ -3,6 +3,7 @@ #include "ggml.h" #include "ggml-alloc.h" +#include "ggml-backend.h" #define STB_IMAGE_IMPLEMENTATION #include "stb_image.h" #define STB_IMAGE_WRITE_IMPLEMENTATION @@ -245,13 +246,11 @@ struct sam_state { // buffer for `ggml_graph_plan.work_data` std::vector work_buffer; // buffers to evaluate the 
model - std::vector buf_alloc_img_enc; std::vector buf_compute_img_enc; - std::vector buf_alloc_fast; std::vector buf_compute_fast; - struct ggml_allocr * allocr = {}; + ggml_gallocr_t allocr = {}; }; // void save_tensor(sam_state& state, struct ggml_tensor * t, struct ggml_cgraph * gf) { @@ -1116,24 +1115,11 @@ struct ggml_tensor * sam_fill_dense_pe( const auto & hparams = model.hparams; const auto & enc = model.enc_prompt; - const int32_t n_img_embd = hparams.n_img_embd(); - const float n_img_embd_inv = 1.0f / n_img_embd; + const int32_t n_img_embd = hparams.n_img_embd(); struct ggml_tensor * xy_embed_stacked = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 2, n_img_embd, n_img_embd); - ggml_allocr_alloc(state.allocr, xy_embed_stacked); - - if (!ggml_allocr_is_measure(state.allocr)) { - float * data = (float *) ggml_get_data(xy_embed_stacked); - for (int i = 0; i < n_img_embd; ++i) { - const int row = 2*i*n_img_embd; - const float y_val = 2 * (i + 0.5f) * n_img_embd_inv - 1; - for (int j = 0; j < n_img_embd; ++j) { - const float x_val = 2 * (j + 0.5f) * n_img_embd_inv - 1; - data[row + 2*j + 0] = x_val; - data[row + 2*j + 1] = y_val; - } - } - } + ggml_set_name(xy_embed_stacked, "xy_embed_stacked"); + ggml_set_input(xy_embed_stacked); struct ggml_tensor * cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, enc.pe)), xy_embed_stacked); @@ -1206,24 +1192,8 @@ struct ggml_cgraph * sam_encode_image( struct ggml_cgraph * gf = ggml_new_graph(ctx0); struct ggml_tensor * inp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_img_size, n_img_size, 3, 1); - ggml_allocr_alloc(state.allocr, inp); - if (!ggml_allocr_is_measure(state.allocr)) { - float * data = (float *) ggml_get_data(inp); - - const int nx = img.nx; - const int ny = img.ny; - const int n = nx*ny; - - GGML_ASSERT(nx == n_img_size && ny == n_img_size); - - for (int k = 0; k < 3; k++) { - for (int y = 0; y < ny; y++) { - for (int x = 0; x < nx; x++) { - data[k*n + y*nx + x] = img.data[3*(y*nx + x) + k]; - } - } - } - } + ggml_set_name(inp, "inp"); + ggml_set_input(inp); // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L392 struct ggml_tensor * cur = ggml_conv_2d_sk_p0(ctx0, enc.proj_w, inp); @@ -1393,6 +1363,27 @@ struct ggml_cgraph * sam_encode_image( ggml_free(ctx0); + ggml_gallocr_alloc_graph(state.allocr, gf); + + { + struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "inp"); + float * data = (float *) ggml_get_data(inp); + + const int nx = img.nx; + const int ny = img.ny; + const int n = nx*ny; + + GGML_ASSERT(nx == n_img_size && ny == n_img_size); + + for (int k = 0; k < 3; k++) { + for (int y = 0; y < ny; y++) { + for (int x = 0; x < nx; x++) { + data[k*n + y*nx + x] = img.data[3*(y*nx + x) + k]; + } + } + } + } + return gf; } @@ -1414,43 +1405,15 @@ prompt_encoder_result sam_encode_prompt( const sam_model & model, struct ggml_context * ctx0, struct ggml_cgraph * gf, - sam_state & state, - int nx, - int ny, - sam_point point) { + sam_state & state) { const auto & hparams = model.hparams; const auto & enc = model.enc_prompt; - // transform points - // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/automatic_mask_generator.py#L276 - { - const int nmax = std::max(nx, ny); - - const float scale = hparams.n_img_size() / (float) nmax; - - const int nx_new = int(nx*scale + 0.5f); - const int ny_new = int(ny*scale + 0.5f); - - point.x = point.x*(float(nx_new)/nx) + 0.5f; - point.y = point.y*(float(ny_new)/ny) + 0.5f; - } - struct 
ggml_tensor * inp = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2, 2); + ggml_set_name(inp, "prompt_input"); + ggml_set_input(inp); - ggml_allocr_alloc(state.allocr, inp); - if (!ggml_allocr_is_measure(state.allocr)) { - // set the input by converting the [0, 1] coordinates to [-1, 1] - float * data = (float *) inp->data; - - data[0] = 2.0f*(point.x / hparams.n_img_size()) - 1.0f; - data[1] = 2.0f*(point.y / hparams.n_img_size()) - 1.0f; - - // padding - // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L81-L85 - data[2] = 2.0f*(0.0f) - 1.0f; - data[3] = 2.0f*(0.0f) - 1.0f; - } struct ggml_tensor * cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, enc.pe)), inp); @@ -1757,7 +1720,6 @@ bool sam_decode_mask( { // ConvTranspose2d keys = ggml_conv_transpose_2d_p0(ctx0, dec.output_upscaling_0_w, keys, 2); - ggml_allocr_alloc(state.allocr, keys); // TODO: This alloc shouldn't be needed keys = ggml_add_inplace(ctx0, keys, ggml_repeat(ctx0, ggml_reshape_3d(ctx0, dec.output_upscaling_0_b, 1, 1, dec.output_upscaling_0_b->ne[0]), keys)); @@ -1769,7 +1731,6 @@ bool sam_decode_mask( // ConvTranspose2d keys = ggml_conv_transpose_2d_p0(ctx0, dec.output_upscaling_3_w, keys, 2); - ggml_allocr_alloc(state.allocr, keys); // TODO: This alloc shouldn't be needed keys = ggml_add_inplace(ctx0, ggml_repeat(ctx0, ggml_reshape_3d(ctx0, dec.output_upscaling_3_b, 1, 1, dec.output_upscaling_3_b->ne[0]), keys), keys); @@ -1986,7 +1947,7 @@ struct ggml_cgraph * sam_build_fast_graph( struct ggml_context * ctx0 = ggml_init(ggml_params); struct ggml_cgraph * gf = ggml_new_graph(ctx0); - prompt_encoder_result enc_res = sam_encode_prompt(model, ctx0, gf, state, nx, ny, point); + prompt_encoder_result enc_res = sam_encode_prompt(model, ctx0, gf, state); if (!enc_res.embd_prompt_sparse || !enc_res.embd_prompt_dense) { fprintf(stderr, "%s: failed to encode prompt (%f, %f)\n", __func__, point.x, point.y); return {}; @@ -2005,6 +1966,54 @@ struct ggml_cgraph * sam_build_fast_graph( ggml_free(ctx0); + ggml_gallocr_alloc_graph(state.allocr, gf); + + // from sam_encode_prompt + { + // transform points + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/automatic_mask_generator.py#L276 + { + const int nmax = std::max(nx, ny); + + const float scale = model.hparams.n_img_size() / (float) nmax; + + const int nx_new = int(nx*scale + 0.5f); + const int ny_new = int(ny*scale + 0.5f); + + point.x = point.x*(float(nx_new)/nx) + 0.5f; + point.y = point.y*(float(ny_new)/ny) + 0.5f; + } + + struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "prompt_input"); + // set the input by converting the [0, 1] coordinates to [-1, 1] + float * data = (float *) inp->data; + + data[0] = 2.0f*(point.x / model.hparams.n_img_size()) - 1.0f; + data[1] = 2.0f*(point.y / model.hparams.n_img_size()) - 1.0f; + + // padding + // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L81-L85 + data[2] = 2.0f*(0.0f) - 1.0f; + data[3] = 2.0f*(0.0f) - 1.0f; + } + + // from sam_fill_dense_pe + { + struct ggml_tensor * xy_embed_stacked = ggml_graph_get_tensor(gf, "xy_embed_stacked"); + const int32_t n_img_embd = model.hparams.n_img_embd(); + const float n_img_embd_inv = 1.0f / n_img_embd; + float * data = (float *) ggml_get_data(xy_embed_stacked); + for (int i = 0; i < n_img_embd; ++i) { + const int row = 2*i*n_img_embd; + const float y_val = 2 * (i + 0.5f) * n_img_embd_inv - 1; + for (int j = 0; j < 
n_img_embd; ++j) { + const float x_val = 2 * (j + 0.5f) * n_img_embd_inv - 1; + data[row + 2*j + 0] = x_val; + data[row + 2*j + 1] = y_val; + } + } + } + return gf; } @@ -2164,25 +2173,9 @@ int main(int argc, char ** argv) { } - static const size_t tensor_alignment = 32; { state.buf_compute_img_enc.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead()); - state.allocr = ggml_allocr_new_measure(tensor_alignment); - struct ggml_cgraph * gf_measure = sam_encode_image(model, state, img1); - if (!gf_measure) { - fprintf(stderr, "%s: failed to encode image\n", __func__); - return 1; - } - - size_t alloc_size = ggml_allocr_alloc_graph(state.allocr, gf_measure) + tensor_alignment; - ggml_allocr_free(state.allocr); - - // recreate allocator with exact memory requirements - state.buf_alloc_img_enc.resize(alloc_size); - state.allocr = ggml_allocr_new(state.buf_alloc_img_enc.data(), state.buf_alloc_img_enc.size(), tensor_alignment); - - // compute the graph with the measured exact memory requirements from above - ggml_allocr_reset(state.allocr); + state.allocr = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); struct ggml_cgraph * gf = sam_encode_image(model, state, img1); if (!gf) { @@ -2190,53 +2183,32 @@ int main(int argc, char ** argv) { return 1; } - ggml_allocr_alloc_graph(state.allocr, gf); - ggml_graph_compute_helper(state.work_buffer, gf, params.n_threads); print_t_f32("embd_img", state.embd_img); - ggml_allocr_free(state.allocr); + ggml_gallocr_free(state.allocr); state.allocr = NULL; state.work_buffer.clear(); } { state.buf_compute_fast.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead()); - state.allocr = ggml_allocr_new_measure(tensor_alignment); + state.allocr = ggml_gallocr_new(ggml_backend_cpu_buffer_type()); // TODO: more varied prompts fprintf(stderr, "prompt: (%f, %f)\n", params.pt.x, params.pt.y); - // measure memory requirements for the graph - struct ggml_cgraph * gf_measure = sam_build_fast_graph(model, state, img0.nx, img0.ny, params.pt); - if (!gf_measure) { - fprintf(stderr, "%s: failed to build fast graph to measure\n", __func__); - return 1; - } - - size_t alloc_size = ggml_allocr_alloc_graph(state.allocr, gf_measure) + tensor_alignment; - ggml_allocr_free(state.allocr); - - // recreate allocator with exact memory requirements - state.buf_alloc_fast.resize(alloc_size); - state.allocr = ggml_allocr_new(state.buf_alloc_fast.data(), state.buf_alloc_fast.size(), tensor_alignment); - - // compute the graph with the measured exact memory requirements from above - ggml_allocr_reset(state.allocr); - struct ggml_cgraph * gf = sam_build_fast_graph(model, state, img0.nx, img0.ny, params.pt); if (!gf) { fprintf(stderr, "%s: failed to build fast graph\n", __func__); return 1; } - ggml_allocr_alloc_graph(state.allocr, gf); - ggml_graph_compute_helper(state.work_buffer, gf, params.n_threads); //print_t_f32("iou_predictions", state.iou_predictions); //print_t_f32("low_res_masks", state.low_res_masks); - ggml_allocr_free(state.allocr); + ggml_gallocr_free(state.allocr); state.allocr = NULL; } diff --git a/examples/whisper/whisper.cpp b/examples/whisper/whisper.cpp index 28e3804f..dec99570 100644 --- a/examples/whisper/whisper.cpp +++ b/examples/whisper/whisper.cpp @@ -471,52 +471,32 @@ struct whisper_pair { // ggml_allocr wrapper for whisper usage struct whisper_allocr { - ggml_allocr * alloc = nullptr; + ggml_gallocr_t alloc = nullptr; std::vector meta; - - ggml_backend_buffer_t buffer; }; static size_t whisper_allocr_size(struct 
whisper_allocr & allocr) { - return allocr.meta.size() + ggml_allocr_max_size(allocr.alloc); + return allocr.meta.size() + ggml_gallocr_get_buffer_size(allocr.alloc, 0); } // measure the memory usage of a graph and prepare the allocr's internal data buffer -static void whisper_allocr_graph_init(struct whisper_allocr & allocr, ggml_backend_t backend, std::function && get_graph) { +static bool whisper_allocr_graph_init(struct whisper_allocr & allocr, ggml_backend_t backend, std::function && get_graph) { auto & alloc = allocr.alloc; auto & meta = allocr.meta; - alloc = ggml_allocr_new_measure_from_backend(backend); + alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend)); meta.resize(ggml_tensor_overhead()*WHISPER_MAX_NODES + ggml_graph_overhead()); - ggml_allocr_alloc_graph(alloc, get_graph()); -} - -static void whisper_allocr_graph_realloc(struct whisper_allocr & allocr, ggml_backend_t backend) { - if (allocr.alloc == nullptr) { - // this can be null if we use external encoder like CoreML or OpenVINO - return; - } - - auto & alloc = allocr.alloc; - auto & buffer = allocr.buffer; - - size_t size = ggml_allocr_max_size(alloc); - - ggml_allocr_free(alloc); - - buffer = ggml_backend_alloc_buffer(backend, size); - alloc = ggml_allocr_new_from_buffer(buffer); -} - -static void whisper_allocr_free(struct whisper_allocr & allocr) { - if (allocr.alloc) { - ggml_allocr_free(allocr.alloc); - ggml_backend_buffer_free(allocr.buffer); - allocr.alloc = nullptr; + // since there are dependencies between the different graphs, + // we need to allocate them instead of only reserving to get the correct compute buffer size + if (!ggml_gallocr_alloc_graph(alloc, get_graph())) { + // failed to allocate the compute buffer + WHISPER_LOG_ERROR("%s: failed to allocate the compute buffer\n", __func__); + return false; } + return true; } // medium @@ -658,9 +638,9 @@ struct whisper_kv_cache { struct ggml_tensor * k; struct ggml_tensor * v; - struct ggml_context * ctx; + struct ggml_context * ctx = nullptr; - ggml_backend_buffer_t buffer; + ggml_backend_buffer_t buffer = nullptr; }; struct whisper_model { @@ -698,10 +678,10 @@ struct whisper_model { std::vector layers_decoder; // ggml context that contains all the meta information about the model tensors - struct ggml_context * ctx; + struct ggml_context * ctx = nullptr; // the model backend data is read-only and can be shared between processors - std::vector buffers; + ggml_backend_buffer_t buffer = nullptr; // tensors int n_loaded; @@ -903,36 +883,26 @@ static bool kv_cache_init( cache.ctx = ggml_init(params); if (!cache.ctx) { - WHISPER_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__); + WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache context\n", __func__); return false; } cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements); - const size_t mem_bytes = ggml_nbytes(cache.k) + ggml_nbytes(cache.v); - - cache.buffer = ggml_backend_alloc_buffer(backend, mem_bytes); - - // allocate the tensors into the backend buffer - { - ggml_allocr * alloc = ggml_allocr_new_from_buffer(cache.buffer); - - ggml_allocr_alloc(alloc, cache.k); - ggml_allocr_alloc(alloc, cache.v); - - ggml_allocr_free(alloc); + cache.buffer = ggml_backend_alloc_ctx_tensors(cache.ctx, backend); + if (!cache.buffer) { + WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache\n", __func__); + return false; } return true; } static void kv_cache_free(struct whisper_kv_cache & cache) { 
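The whisper_allocr changes above reduce the allocator to a single ggml_gallocr_t: it is created from a backend buffer type, ggml_gallocr_alloc_graph both sizes its internal buffer and allocates the graph tensors (hence the new failure check), and ggml_gallocr_get_buffer_size reports the resulting compute buffer. A minimal sketch of that flow with the CPU backend and a trivial one-op placeholder graph; the final ggml_backend_graph_compute call is just one way to run the allocated graph and is not part of this hunk:

    #include "ggml/ggml.h"
    #include "ggml/ggml-alloc.h"
    #include "ggml/ggml-backend.h"

    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
        ggml_backend_t backend = ggml_backend_cpu_init();
        ggml_gallocr_t galloc  = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));

        // the context only holds tensor/graph metadata; the gallocr owns the tensor data
        std::vector<uint8_t> meta(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead());
        struct ggml_init_params params = {
            /*.mem_size   =*/ meta.size(),
            /*.mem_buffer =*/ meta.data(),
            /*.no_alloc   =*/ true,
        };
        struct ggml_context * ctx = ggml_init(params);

        // trivial one-op graph: y = 2*x
        struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
        ggml_set_name(x, "x");
        ggml_set_input(x);
        struct ggml_tensor * y = ggml_scale(ctx, x, 2.0f);
        ggml_set_output(y);

        struct ggml_cgraph * gf = ggml_new_graph(ctx);
        ggml_build_forward_expand(gf, y);

        // allocates the graph tensors and, on first use, sizes the gallocr's buffer
        if (!ggml_gallocr_alloc_graph(galloc, gf)) {
            fprintf(stderr, "failed to allocate the compute buffer\n");
            return 1;
        }
        printf("compute buffer size = %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));

        // inputs are set only after allocation, then the graph can be computed on the backend
        std::vector<float> x_data(16, 1.0f);
        ggml_backend_tensor_set(ggml_graph_get_tensor(gf, "x"), x_data.data(), 0, x_data.size()*sizeof(float));
        ggml_backend_graph_compute(backend, gf);

        ggml_free(ctx);
        ggml_gallocr_free(galloc);
        ggml_backend_free(backend);
        return 0;
    }

Compared to the old measure/realloc sequence, the same ggml_gallocr_t is created once and re-used for every graph, which is what lets the patch drop whisper_allocr_graph_realloc entirely.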
- if (cache.ctx) { - ggml_free(cache.ctx); - ggml_backend_buffer_free(cache.buffer); - cache.ctx = nullptr; - } + ggml_free(cache.ctx); + ggml_backend_buffer_free(cache.buffer); + cache.ctx = nullptr; } static bool whisper_kv_cache_find_slot( @@ -1513,68 +1483,21 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con } wctx.backend = whisper_backend_init(wctx.params); - - // some devices have a limit on the maximum size of single memory buffer - // for example, iPhones are limited to 1GB per buffer - // to workaround this, we will allocate multiple buffers of smaller size and will split the tensors with the - // model weights between them - // - // the map_t2b maps tensor names to buffer indices - // as we iterate over the tensors, we will allocate new buffers when the current one is full - // - // finally, we create a separate allocator for each buffer and use it to allocate the tensors - // we keep the allocators alive until all the tensors are loaded - - GGML_ASSERT(model.buffers.empty()); - - std::map map_t2b; - - { - size_t size_main = 0; - size_t size_cur = 0; - - static const size_t GB = 1024ull*1024ull*1024ull; - - for (const auto & t : model.tensors) { - const size_t cur = ggml_nbytes(t.second) + ggml_tensor_overhead(); - - // adding the tensor to the current buffer will exceed the limit, so we need to allocate a new buffer - if (size_cur + cur > GB) { - GGML_ASSERT(size_cur > 0 && "A tensor is too large to fit in a single buffer"); - - model.buffers.emplace_back(ggml_backend_alloc_buffer(wctx.backend, size_cur)); - - size_cur = cur; - } - - map_t2b[t.first] = model.buffers.size(); - - size_cur += cur; - size_main += cur; - } - - // allocate the last buffer if needed - if (size_cur > 0) { - model.buffers.emplace_back(ggml_backend_alloc_buffer(wctx.backend, size_cur)); - } - - GGML_ASSERT(model.buffers.size() > 0); - - WHISPER_LOG_INFO("%s: %8s total size = %8.2f MB (%d buffers)\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6, (int) model.buffers.size()); - } - - std::vector allocs(model.buffers.size()); - for (size_t i = 0; i < allocs.size(); ++i) { - allocs[i] = ggml_allocr_new_from_buffer(model.buffers[i]); + if (!wctx.backend) { + WHISPER_LOG_ERROR("%s: failed to initialize the backend\n", __func__); + return false; } // allocate tensors in the backend buffers - { - for (const auto & t : model.tensors) { - ggml_allocr_alloc(allocs[map_t2b[t.first]], t.second); - } + model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, wctx.backend); + if (!model.buffer) { + WHISPER_LOG_ERROR("%s: failed to allocate memory for the model\n", __func__); + return false; } + size_t size_main = ggml_backend_buffer_get_size(model.buffer); + WHISPER_LOG_INFO("%s: %8s total size = %8.2f MB\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6); + // load weights { size_t total_size = 0; @@ -1636,15 +1559,11 @@ static bool whisper_model_load(struct whisper_model_loader * loader, whisper_con return false; } - ggml_backend_t backend = wctx.backend; + //ggml_backend_t backend = wctx.backend; //printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str()); - if ((ggml_backend_is_cpu(backend) -#ifdef GGML_USE_METAL - || ggml_backend_is_metal(backend) -#endif - )) { + if (ggml_backend_buffer_is_host(model.buffer)) { // for the CPU and Metal backend, we can read directly into the tensor loader->read(loader->context, tensor->data, ggml_nbytes(tensor)); BYTESWAP_TENSOR(tensor); @@ -1672,10 +1591,6 @@ static bool whisper_model_load(struct 
whisper_model_loader * loader, whisper_con } } - for (auto & alloc : allocs) { - ggml_allocr_free(alloc); - } - wctx.t_load_us = ggml_time_us() - t_start_us; return true; @@ -1704,7 +1619,6 @@ static struct ggml_cgraph * whisper_build_graph_conv( whisper_state & wstate, const int mel_offset) { const auto & model = wctx.model; - const auto & mel_inp = wstate.mel; const auto & hparams = model.hparams; const int n_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx; @@ -1722,31 +1636,9 @@ static struct ggml_cgraph * whisper_build_graph_conv( ggml_cgraph * gf = ggml_new_graph(ctx0); - ggml_allocr * alloc = wstate.alloc_conv.alloc; - struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels); - ggml_allocr_alloc(alloc, mel); - - assert(mel->type == GGML_TYPE_F32); - if (!ggml_allocr_is_measure(alloc)) { - assert(mel_inp.n_mel == n_mels); - - wstate.inp_mel.resize(ggml_nelements(mel)); - - float * dst = wstate.inp_mel.data(); - memset(dst, 0, ggml_nbytes(mel)); - - const int i0 = std::min(mel_offset, mel_inp.n_len); - const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len); - - for (int j = 0; j < mel_inp.n_mel; ++j) { - for (int i = i0; i < i1; ++i) { - dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i]; - } - } - - ggml_backend_tensor_set(mel, wstate.inp_mel.data(), 0, ggml_nelements(mel)*sizeof(float)); - } + ggml_set_name(mel, "mel"); + ggml_set_input(mel); struct ggml_tensor * cur = nullptr; @@ -2138,11 +2030,39 @@ static bool whisper_encode_internal( { auto & alloc = wstate.alloc_conv.alloc; - ggml_allocr_reset(alloc); - ggml_cgraph * gf = whisper_build_graph_conv(wctx, wstate, mel_offset); - ggml_allocr_alloc_graph(alloc, gf); + if (!ggml_gallocr_alloc_graph(alloc, gf)) { + // should never happen as we pre-allocate the memory + return false; + } + + // set the input + { + const auto & mel_inp = wstate.mel; + const int n_ctx = wstate.exp_n_audio_ctx > 0 ? 
wstate.exp_n_audio_ctx : wctx.model.hparams.n_audio_ctx; + + struct ggml_tensor * mel = ggml_graph_get_tensor(gf, "mel"); + + assert(mel->type == GGML_TYPE_F32); + assert(mel_inp.n_mel == wctx.model.hparams.n_mels); + + wstate.inp_mel.resize(ggml_nelements(mel)); + + float * dst = wstate.inp_mel.data(); + memset(dst, 0, ggml_nbytes(mel)); + + const int i0 = std::min(mel_offset, mel_inp.n_len); + const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len); + + for (int j = 0; j < mel_inp.n_mel; ++j) { + for (int i = i0; i < i1; ++i) { + dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i]; + } + } + + ggml_backend_tensor_set(mel, wstate.inp_mel.data(), 0, ggml_nelements(mel)*sizeof(float)); + } if (!whisper_encode_external(wstate)) { if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) { @@ -2155,11 +2075,12 @@ static bool whisper_encode_internal( if (!whisper_encode_external(wstate)) { auto & alloc = wstate.alloc_encode.alloc; - ggml_allocr_reset(alloc); - ggml_cgraph * gf = whisper_build_graph_encoder(wctx, wstate); - ggml_allocr_alloc_graph(alloc, gf); + if (!ggml_gallocr_alloc_graph(alloc, gf)) { + // should never happen as we pre-allocate the memory + return false; + } if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) { return false; @@ -2170,11 +2091,12 @@ static bool whisper_encode_internal( { auto & alloc = wstate.alloc_cross.alloc; - ggml_allocr_reset(alloc); - ggml_cgraph * gf = whisper_build_graph_cross(wctx, wstate); - ggml_allocr_alloc_graph(alloc, gf); + if (!ggml_gallocr_alloc_graph(alloc, gf)) { + // should never happen as we pre-allocate the memory + return false; + } if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) { return false; @@ -2190,7 +2112,8 @@ static bool whisper_encode_internal( static struct ggml_cgraph * whisper_build_graph_decoder( whisper_context & wctx, whisper_state & wstate, - const whisper_batch & batch) { + const whisper_batch & batch, + bool worst_case) { const auto & model = wctx.model; const auto & hparams = model.hparams; @@ -2198,8 +2121,6 @@ static struct ggml_cgraph * whisper_build_graph_decoder( WHISPER_ASSERT(!!kv_self.ctx); - ggml_allocr * alloc = wstate.alloc_decode.alloc; - const int n_ctx = kv_self.size; const int n_state = hparams.n_text_state; const int n_head = hparams.n_text_head; @@ -2208,8 +2129,8 @@ static struct ggml_cgraph * whisper_build_graph_decoder( const int n_tokens = batch.n_tokens; const int n_audio_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx; - const int32_t n_kv = ggml_allocr_is_measure(alloc) ? n_ctx : kv_self.n; - const int32_t kv_head = ggml_allocr_is_measure(alloc) ? n_ctx - n_tokens : kv_self.head; + const int32_t n_kv = worst_case ? n_ctx : kv_self.n; + const int32_t kv_head = worst_case ? 
n_ctx - n_tokens : kv_self.head; //WHISPER_LOG_DEBUG("%s: n_past = %d, n_tokens = %d, n_audio_ctx = %d, n_ctx = %d\n", __func__, n_past, n_tokens, n_audio_ctx, n_ctx); @@ -2224,48 +2145,18 @@ static struct ggml_cgraph * whisper_build_graph_decoder( ggml_cgraph * gf = ggml_new_graph_custom(ctx0, WHISPER_MAX_NODES, false); struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_allocr_alloc(alloc, embd); - - if (!ggml_allocr_is_measure(alloc)) { - ggml_backend_tensor_set(embd, batch.token, 0, n_tokens*ggml_element_size(embd)); - } + ggml_set_name(embd, "embd"); + ggml_set_input(embd); struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens); - ggml_allocr_alloc(alloc, position); - - if (!ggml_allocr_is_measure(alloc)) { - for (int i = 0; i < n_tokens; ++i) { - const int32_t val = batch.pos[i]; - ggml_backend_tensor_set(position, &val, i*sizeof(int32_t), sizeof(int32_t)); - } - } + ggml_set_name(position, "position"); + ggml_set_input(position); const float KQscale = pow(float(n_state)/n_head, -0.25); struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1); - ggml_allocr_alloc(alloc, KQ_mask); - - if (!ggml_allocr_is_measure(alloc)) { - wstate.inp_mask.resize(n_kv*n_tokens); - - float * data = wstate.inp_mask.data(); - memset(data, 0, ggml_nbytes(KQ_mask)); - - for (int h = 0; h < 1; ++h) { - for (int j = 0; j < n_tokens; ++j) { - const whisper_pos pos = batch.pos[j]; - const whisper_seq_id seq_id = batch.seq_id[j][0]; - - for (int i = 0; i < n_kv; ++i) { - if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { - data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; - } - } - } - } - - ggml_backend_tensor_set(KQ_mask, wstate.inp_mask.data(), 0, ggml_nelements(KQ_mask)*sizeof(float)); - } + ggml_set_name(KQ_mask, "KQ_mask"); + ggml_set_input(KQ_mask); // token encoding + position encoding struct ggml_tensor * cur = @@ -2592,11 +2483,53 @@ static bool whisper_decode_internal( { auto & alloc = wstate.alloc_decode.alloc; - ggml_allocr_reset(alloc); + ggml_cgraph * gf = whisper_build_graph_decoder(wctx, wstate, batch, false); - ggml_cgraph * gf = whisper_build_graph_decoder(wctx, wstate, batch); + if (!ggml_gallocr_alloc_graph(alloc, gf)) { + // should never happen as we pre-allocate the memory + return false; + } - ggml_allocr_alloc_graph(alloc, gf); + // set the inputs + { + struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd"); + ggml_backend_tensor_set(embd, batch.token, 0, n_tokens*ggml_element_size(embd)); + } + + { + struct ggml_tensor * position = ggml_graph_get_tensor(gf, "position"); + for (int i = 0; i < n_tokens; ++i) { + const int32_t val = batch.pos[i]; + ggml_backend_tensor_set(position, &val, i*sizeof(int32_t), sizeof(int32_t)); + } + } + + { + struct ggml_tensor * KQ_mask = ggml_graph_get_tensor(gf, "KQ_mask"); + + auto & kv_self = wstate.kv_self; + const int32_t n_kv = kv_self.n; + + wstate.inp_mask.resize(n_kv*n_tokens); + + float * data = wstate.inp_mask.data(); + memset(data, 0, ggml_nbytes(KQ_mask)); + + for (int h = 0; h < 1; ++h) { + for (int j = 0; j < n_tokens; ++j) { + const whisper_pos pos = batch.pos[j]; + const whisper_seq_id seq_id = batch.seq_id[j][0]; + + for (int i = 0; i < n_kv; ++i) { + if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) { + data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY; + } + } + } + } + + ggml_backend_tensor_set(KQ_mask, wstate.inp_mask.data(), 0, ggml_nelements(KQ_mask)*sizeof(float)); + } logits = 
gf->nodes[gf->n_nodes - 1]; @@ -3046,6 +2979,11 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { whisper_state * state = new whisper_state; state->backend = whisper_backend_init(ctx->params); + if (!state->backend) { + WHISPER_LOG_ERROR("%s: whisper_backend_init() failed\n", __func__); + whisper_free_state(state); + return nullptr; + } // at this point, we don't know yet how many decoders will be used, so we overallocate 3x ctx // in theory, there can be a case where this is not enough, but in practice it should always be enough @@ -3053,7 +2991,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { if (!kv_cache_init(ctx->model.hparams, state->kv_self, ctx->backend, ctx->itype, factor*ctx->model.hparams.n_text_ctx)) { WHISPER_LOG_ERROR("%s: kv_cache_init() failed for self-attention cache\n", __func__); - delete state; + whisper_free_state(state); return nullptr; } @@ -3064,7 +3002,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { if (!kv_cache_init(ctx->model.hparams, state->kv_cross, ctx->backend, ctx->itype, ctx->model.hparams.n_audio_ctx)) { WHISPER_LOG_ERROR("%s: kv_cache_init() failed for cross-attention cache\n", __func__); - delete state; + whisper_free_state(state); return nullptr; } @@ -3083,7 +3021,7 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { if (!state->ctx_coreml) { WHISPER_LOG_ERROR("%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str()); #ifndef WHISPER_COREML_ALLOW_FALLBACK - delete state; + whisper_free_state(state); return nullptr; #endif } else { @@ -3107,37 +3045,55 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { // conv allocator { - whisper_allocr_graph_init(state->alloc_conv, ctx->backend, + bool ok = whisper_allocr_graph_init(state->alloc_conv, ctx->backend, [&]() { return whisper_build_graph_conv(*ctx, *state, 0); }); + if (!ok) { + WHISPER_LOG_ERROR("%s: failed to init conv allocator\n", __func__); + whisper_free_state(state); + return nullptr; + } + WHISPER_LOG_INFO("%s: compute buffer (conv) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_conv) / 1e6); } // encoder allocator if (!whisper_encode_external(*state)) { - whisper_allocr_graph_init(state->alloc_encode, ctx->backend, + bool ok = whisper_allocr_graph_init(state->alloc_encode, ctx->backend, [&]() { return whisper_build_graph_encoder(*ctx, *state); }); + if (!ok) { + WHISPER_LOG_ERROR("%s: failed to init encoder allocator\n", __func__); + whisper_free_state(state); + return nullptr; + } + WHISPER_LOG_INFO("%s: compute buffer (encode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_encode) / 1e6); } // cross allocator { - whisper_allocr_graph_init(state->alloc_cross, ctx->backend, + bool ok = whisper_allocr_graph_init(state->alloc_cross, ctx->backend, [&]() { return whisper_build_graph_cross(*ctx, *state); }); + if (!ok) { + WHISPER_LOG_ERROR("%s: failed to init cross allocator\n", __func__); + whisper_free_state(state); + return nullptr; + } + WHISPER_LOG_INFO("%s: compute buffer (cross) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_cross) / 1e6); } // decoder allocator { - whisper_allocr_graph_init(state->alloc_decode, ctx->backend, + bool ok = whisper_allocr_graph_init(state->alloc_decode, ctx->backend, [&]() { const auto & hparams = ctx->model.hparams; @@ -3147,17 +3103,18 @@ struct whisper_state * whisper_init_state(whisper_context * ctx) { whisper_batch_prep_legacy(state->batch, nullptr, n_tokens, n_past, 0); - return 
whisper_build_graph_decoder(*ctx, *state, state->batch); + return whisper_build_graph_decoder(*ctx, *state, state->batch, true); }); + if (!ok) { + WHISPER_LOG_ERROR("%s: failed to init decoder allocator\n", __func__); + whisper_free_state(state); + return nullptr; + } + WHISPER_LOG_INFO("%s: compute buffer (decode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_decode) / 1e6); } - whisper_allocr_graph_realloc(state->alloc_conv, ctx->backend); - whisper_allocr_graph_realloc(state->alloc_encode, ctx->backend); - whisper_allocr_graph_realloc(state->alloc_cross, ctx->backend); - whisper_allocr_graph_realloc(state->alloc_decode, ctx->backend); - return state; } @@ -3380,8 +3337,7 @@ struct whisper_context * whisper_init_no_state(struct whisper_model_loader * loa return whisper_init_with_params_no_state(loader, whisper_context_default_params()); } -void whisper_free_state(struct whisper_state * state) -{ +void whisper_free_state(struct whisper_state * state) { if (state) { kv_cache_free(state->kv_self); kv_cache_free(state->kv_cross); @@ -3402,10 +3358,10 @@ void whisper_free_state(struct whisper_state * state) whisper_batch_free(state->batch); - whisper_allocr_free(state->alloc_conv); - whisper_allocr_free(state->alloc_encode); - whisper_allocr_free(state->alloc_cross); - whisper_allocr_free(state->alloc_decode); + ggml_gallocr_free(state->alloc_conv.alloc); + ggml_gallocr_free(state->alloc_encode.alloc); + ggml_gallocr_free(state->alloc_cross.alloc); + ggml_gallocr_free(state->alloc_decode.alloc); ggml_backend_free(state->backend); @@ -3415,15 +3371,9 @@ void whisper_free_state(struct whisper_state * state) void whisper_free(struct whisper_context * ctx) { if (ctx) { - if (ctx->model.ctx) { - ggml_free(ctx->model.ctx); - } + ggml_free(ctx->model.ctx); - for (auto & buffer : ctx->model.buffers) { - if (buffer) { - ggml_backend_buffer_free(buffer); - } - } + ggml_backend_buffer_free(ctx->model.buffer); whisper_free_state(ctx->state); diff --git a/include/ggml/ggml-alloc.h b/include/ggml/ggml-alloc.h index 4e599752..1d9085d1 100644 --- a/include/ggml/ggml-alloc.h +++ b/include/ggml/ggml-alloc.h @@ -6,88 +6,62 @@ extern "C" { #endif -struct ggml_backend; -struct ggml_backend_buffer; -struct ggml_backend_buffer_type; +typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t; +typedef struct ggml_backend_buffer * ggml_backend_buffer_t; +typedef struct ggml_backend * ggml_backend_t; -// -// Legacy API -// - -typedef struct ggml_allocr * ggml_allocr_t; - -// initialize allocator for use with CPU backend only -GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment); -GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment); - -// initialize allocator for use with ggml-backend -GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer); -GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer -GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend); - -GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc); - -// tell the allocator to parse nodes following the order described in the list -// you should call this if your graph are optimized to execute out-of-order -GGML_API void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n); - -GGML_API void ggml_allocr_free (ggml_allocr_t alloc); -GGML_API bool ggml_allocr_is_measure (ggml_allocr_t alloc); -GGML_API void 
ggml_allocr_reset (ggml_allocr_t alloc);
-GGML_API void ggml_allocr_alloc (ggml_allocr_t alloc, struct ggml_tensor * tensor);
-GGML_API size_t ggml_allocr_max_size (ggml_allocr_t alloc);
-
-GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph);
+// Tensor allocator
+typedef struct ggml_tallocr * ggml_tallocr_t;
-//
-// ggml-backend v2 API
-//
+GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer);
+GGML_API void ggml_tallocr_free(ggml_tallocr_t talloc);
+GGML_API void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
-// Separate tensor and graph allocator objects
-// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
-// The original API is kept as a wrapper around the new API
+// Graph allocator
+/*
+  Example usage:
+    ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-// Tensor allocator
-typedef struct ggml_tallocr * ggml_tallocr_t;
+    // optional: create a worst-case graph and reserve the buffers to avoid reallocations
+    ggml_gallocr_reserve(galloc, build_graph(max_batch));
-GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size);
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
+    // allocate the graph
+    struct ggml_cgraph * graph = build_graph(batch);
+    ggml_gallocr_alloc_graph(galloc, graph);
-GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
+    printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
-GGML_API void ggml_tallocr_free (ggml_tallocr_t talloc);
-GGML_API bool ggml_tallocr_is_measure (ggml_tallocr_t talloc);
-GGML_API void ggml_tallocr_reset (ggml_tallocr_t talloc);
-GGML_API void ggml_tallocr_alloc (ggml_tallocr_t talloc, struct ggml_tensor * tensor);
-GGML_API size_t ggml_tallocr_max_size (ggml_tallocr_t talloc);
+    // evaluate the graph
+    ggml_backend_graph_compute(backend, graph);
+*/
+// special tensor flags for use with the graph allocator:
+// ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
+// ggml_set_output(): output tensors are never freed and never overwritten
-// Graph allocator
 typedef struct ggml_gallocr * ggml_gallocr_t;
-GGML_API ggml_gallocr_t ggml_gallocr_new(void);
-GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
+GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
+GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
+GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
-GGML_API void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n);
-GGML_API size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph);
+// pre-allocate buffers from a measure graph - does not allocate or modify the graph
+// call with a worst-case graph to avoid buffer reallocations
+// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
+// returns false if the buffer allocation failed
+GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids);
-// Allocate tensors from the allocators given by the hash table
-GGML_API void ggml_gallocr_alloc_graph_n(
-    ggml_gallocr_t galloc,
-    struct ggml_cgraph * graph,
-    struct ggml_hash_set hash_set,
-    ggml_tallocr_t * hash_node_talloc);
+// automatic reallocation if the topology changes when using a single buffer
+// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
+GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
 // Utils
 // Create a buffer and allocate all the tensors in a ggml_context
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
 #ifdef __cplusplus
 }
diff --git a/include/ggml/ggml-backend.h b/include/ggml/ggml-backend.h
index 282b3a9b..f13c69bf 100644
--- a/include/ggml/ggml-backend.h
+++ b/include/ggml/ggml-backend.h
@@ -130,11 +130,7 @@ extern "C" {
        // in build_graph:
        build_graph(...) {
-           // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
-           alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
-           ggml_allocr_alloc(alloc_cpu, tensor);
-
-           // manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
+           // manually assign nodes to a backend (optional, should not be needed in most cases)
            struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
            ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
        }
@@ -164,20 +160,19 @@ extern "C" {
     GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
     GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
     // Initialize backend buffers from a measure graph
-    GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+    GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
     // Get the number of splits of the last graph
     GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
-    GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
-    GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
+    GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
     GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
     GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
     // Allocate and compute graph on the backend scheduler
-    GGML_API void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+    GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
-    // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
+    // Reset all assignments and allocators - must be called before changing the node backends
     GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
     // Set a callback to be called for each resulting node during graph compute
diff --git a/include/ggml/ggml.h b/include/ggml/ggml.h
index 1360cd8e..51309947 100644
--- a/include/ggml/ggml.h
+++ b/include/ggml/ggml.h
@@ -505,11 +505,17 @@ extern "C" {
     enum ggml_log_level {
         GGML_LOG_LEVEL_ERROR = 2,
-        GGML_LOG_LEVEL_WARN = 3,
-        GGML_LOG_LEVEL_INFO = 4,
+        GGML_LOG_LEVEL_WARN  = 3,
+        GGML_LOG_LEVEL_INFO  = 4,
         GGML_LOG_LEVEL_DEBUG = 5
     };
+    enum ggml_tensor_flag {
+        GGML_TENSOR_FLAG_INPUT = 1,
+        GGML_TENSOR_FLAG_OUTPUT = 2,
+        GGML_TENSOR_FLAG_PARAM = 4,
+    };
+
     // ggml object
     struct ggml_object {
         size_t offs;
@@ -543,7 +549,7 @@ extern "C" {
         // op params - allocated as int32_t for alignment
         int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
-        bool is_param;
+        int32_t flags;
         struct ggml_tensor * grad;
         struct ggml_tensor * src[GGML_MAX_SRC];
@@ -2092,6 +2098,12 @@ extern "C" {
             ggml_opt_callback callback,
             void * callback_data);
+    //
+    // tensor flags
+    //
+    GGML_API void ggml_set_input(struct ggml_tensor * tensor);
+    GGML_API void ggml_set_output(struct ggml_tensor * tensor);
+
     //
     // quantization
     //
diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c
index f9be6e1c..c28c37c4 100644
--- a/src/ggml-alloc.c
+++ b/src/ggml-alloc.c
@@ -17,6 +17,50 @@ //#define
AT_PRINTF(...) fprintf(stderr, __VA_ARGS__) #define AT_PRINTF(...) + +static bool ggml_is_view(const struct ggml_tensor * t) { + return t->view_src != NULL; +} + +static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { + if (a->type != b->type) { + return false; + } + for (int i = 0; i < GGML_MAX_DIMS; i++) { + if (a->ne[i] != b->ne[i]) { + return false; + } + if (a->nb[i] != b->nb[i]) { + return false; + } + } + return true; +} + +static bool ggml_op_can_inplace(enum ggml_op op) { + switch (op) { + case GGML_OP_SCALE: + case GGML_OP_DIAG_MASK_ZERO: + case GGML_OP_DIAG_MASK_INF: + case GGML_OP_ADD: + case GGML_OP_ADD1: + case GGML_OP_SUB: + case GGML_OP_MUL: + case GGML_OP_DIV: + case GGML_OP_SQR: + case GGML_OP_SQRT: + case GGML_OP_LOG: + case GGML_OP_UNARY: + case GGML_OP_ROPE: + case GGML_OP_RMS_NORM: + case GGML_OP_SOFT_MAX: + return true; + + default: + return false; + } +} + // TODO: GGML_PAD ? static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) { assert(alignment && !(alignment & (alignment - 1))); // power of 2 @@ -24,66 +68,102 @@ static size_t aligned_offset(const void * buffer, size_t offset, size_t alignmen return offset + align; } +// tallocr +struct ggml_tallocr { + ggml_backend_buffer_t buffer; + void * base; + size_t alignment; + size_t offset; +}; + +ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) { + ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr)); + if (talloc == NULL) { + return NULL; + } + + void * base = ggml_backend_buffer_get_base(buffer); + size_t align = ggml_backend_buffer_get_alignment(buffer); + + assert(align && !(align & (align - 1))); // power of 2 + + *talloc = (struct ggml_tallocr) { + /*.buffer = */ buffer, + /*.base = */ base, + /*.alignment = */ align, + /*.offset = */ aligned_offset(base, 0, align), + }; + return talloc; +} + +void ggml_tallocr_free(ggml_tallocr_t talloc) { + free(talloc); +} + +void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) { + size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor); + size = GGML_PAD(size, talloc->alignment); + + if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) { + fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n", + __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset); + GGML_ASSERT(!"not enough space in the buffer"); + return; + } + + void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset; + talloc->offset += size; + + assert(((uintptr_t)addr % talloc->alignment) == 0); + + ggml_backend_tensor_alloc(talloc->buffer, tensor, addr); +} + +// dynamic tensor allocator + struct free_block { - void * addr; + size_t offset; size_t size; }; -struct ggml_tallocr { - struct ggml_backend_buffer * buffer; - bool buffer_owned; - void * base; +struct ggml_dyn_tallocr { size_t alignment; - int n_free_blocks; struct free_block free_blocks[MAX_FREE_BLOCKS]; - size_t max_size; - bool measure; - #ifdef GGML_ALLOCATOR_DEBUG - struct ggml_tensor * allocated_tensors[1024]; + struct { + const struct ggml_tensor * tensor; + size_t offset; + } allocated_tensors[1024]; #endif }; #ifdef GGML_ALLOCATOR_DEBUG -static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { +static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) { for (int i = 0; i < 1024; i++) { - if 
(alloc->allocated_tensors[i] == NULL) { - alloc->allocated_tensors[i] = tensor; + if (alloc->allocated_tensors[i].tensor == NULL) { + alloc->allocated_tensors[i].tensor = tensor; + alloc->allocated_tensors[i].offset = offset; return; } } GGML_ASSERT(!"out of allocated_tensors"); } -static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { +static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) { for (int i = 0; i < 1024; i++) { - if (alloc->allocated_tensors[i] == tensor || - (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) { - alloc->allocated_tensors[i] = NULL; + if (alloc->allocated_tensors[i].offset == offset) { + alloc->allocated_tensors[i].tensor = NULL; return; } } - printf("tried to free tensor %s not found\n", tensor->name); + fprintf(stderr, "tried to free tensor %s not found\n", tensor->name); GGML_ASSERT(!"tensor not found"); } #endif -// check if a tensor is allocated by this buffer -static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) { - return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer); -} - -static bool ggml_is_view(struct ggml_tensor * t) { - return t->view_src != NULL; -} - -void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { - GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources - GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated - - size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor); +static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) { size = aligned_offset(NULL, size, alloc->alignment); AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size); @@ -109,16 +189,17 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { if (block->size >= size) { best_fit_block = alloc->n_free_blocks - 1; } else { - fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n", - __func__, tensor->name, size, max_avail); + // this should never happen + fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n", + __func__, size, max_avail); GGML_ASSERT(!"not enough space in the buffer"); - return; + GGML_UNREACHABLE(); } } struct free_block * block = &alloc->free_blocks[best_fit_block]; - void * addr = block->addr; - block->addr = (char*)block->addr + size; + size_t offset = block->offset; + block->offset = offset + size; block->size -= size; if (block->size == 0) { // remove block if empty @@ -128,59 +209,63 @@ void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { } } - AT_PRINTF("block %d, addr %p\n", best_fit_block, addr); - - tensor->data = addr; - tensor->buffer = alloc->buffer; - if (!alloc->measure) { - ggml_backend_buffer_init_tensor(alloc->buffer, tensor); - } + AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset); #ifdef GGML_ALLOCATOR_DEBUG - add_allocated_tensor(alloc, tensor); - size_t cur_max = (char*)addr - (char*)alloc->base + size; + add_allocated_tensor(alloc, offset, tensor); + size_t cur_max = offset + size; if (cur_max > alloc->max_size) { - printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); + // sort allocated_tensors by offset + for (int i = 0; i 
< 1024; i++) { + for (int j = i + 1; j < 1024; j++) { + if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) { + const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor; + size_t tmp_offset = alloc->allocated_tensors[i].offset; + alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor; + alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset; + alloc->allocated_tensors[j].tensor = tmp_tensor; + alloc->allocated_tensors[j].offset = tmp_offset; + } + } + } + fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0); for (int i = 0; i < 1024; i++) { - if (alloc->allocated_tensors[i]) { - printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0); + if (alloc->allocated_tensors[i].tensor) { + fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name, + alloc->allocated_tensors[i].offset, + alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor), + ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0); } } - printf("\n"); + fprintf(stderr, "\n"); } #endif - alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->base + size); -} + alloc->max_size = MAX(alloc->max_size, offset + size); -// this is a very naive implementation, but for our case the number of free blocks should be very small -static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) { - if (ggml_tallocr_is_own(alloc, tensor) == false) { - // the tensor was not allocated in this buffer - // this can happen because the graph allocator will try to free weights and other tensors from different buffers - // the easiest way to deal with this is just to ignore it - // AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer); - return; - } + return offset; - void * ptr = tensor->data; + GGML_UNUSED(tensor); +} - size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor); +// this is a very naive implementation, but for our case the number of free blocks should be very small +static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) { size = aligned_offset(NULL, size, alloc->alignment); - AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks); + + AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks); #ifdef GGML_ALLOCATOR_DEBUG - remove_allocated_tensor(alloc, tensor); + remove_allocated_tensor(alloc, offset, tensor); #endif // see if we can merge with an existing block for (int i = 0; i < alloc->n_free_blocks; i++) { struct free_block * block = &alloc->free_blocks[i]; // check if ptr is at the end of the block - if ((char*)block->addr + block->size == ptr) { + if (block->offset + block->size == offset) { block->size += size; // check if we can merge with the next block - if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) { + if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) { block->size += alloc->free_blocks[i+1].size; alloc->n_free_blocks--; for (int j = i+1; j < alloc->n_free_blocks; j++) { @@ -190,11 +275,11 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct 
ggml_tensor * return; } // check if ptr is at the beginning of the block - if ((char*)ptr + size == block->addr) { - block->addr = ptr; + if (offset + size == block->offset) { + block->offset = offset; block->size += size; // check if we can merge with the previous block - if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) { + if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) { alloc->free_blocks[i-1].size += block->size; alloc->n_free_blocks--; for (int j = i; j < alloc->n_free_blocks; j++) { @@ -208,7 +293,7 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks"); // insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster) int insert_pos = 0; - while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) { + while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) { insert_pos++; } // shift all blocks from insert_pos onward to make room for the new block @@ -216,337 +301,271 @@ static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * alloc->free_blocks[i] = alloc->free_blocks[i-1]; } // insert the new block - alloc->free_blocks[insert_pos].addr = ptr; + alloc->free_blocks[insert_pos].offset = offset; alloc->free_blocks[insert_pos].size = size; alloc->n_free_blocks++; + + GGML_UNUSED(tensor); } -void ggml_tallocr_reset(ggml_tallocr_t alloc) { +static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) { alloc->n_free_blocks = 1; - size_t align_offset = aligned_offset(alloc->base, 0, alloc->alignment); - alloc->free_blocks[0].addr = (char *)alloc->base + align_offset; - - if (alloc->measure) { - alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows - } else { - alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset; - ggml_backend_buffer_reset(alloc->buffer); - } + alloc->free_blocks[0].offset = 0; + alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows + alloc->max_size = 0; } -ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) { - struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size); - - ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr)); +static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) { + struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr)); - *alloc = (struct ggml_tallocr) { - /*.buffer = */ buffer, - /*.buffer_owned = */ true, - /*.base = */ ggml_backend_buffer_get_base(buffer), + *alloc = (struct ggml_dyn_tallocr) { /*.alignment = */ alignment, /*.n_free_blocks = */ 0, /*.free_blocks = */ {{0}}, /*.max_size = */ 0, - /*.measure = */ false, #ifdef GGML_ALLOCATOR_DEBUG - /*.allocated_tensors = */ {0}, + /*.allocated_tensors = */ {{0}}, #endif }; - ggml_tallocr_reset(alloc); - - return alloc; -} - -ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) { - ggml_tallocr_t alloc = ggml_tallocr_new((void *)0x1000, SIZE_MAX/2, alignment); - alloc->measure = true; + ggml_dyn_tallocr_reset(alloc); return alloc; } -ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) { - // create a backend 
buffer to get the correct tensor allocation sizes - ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1); - - // TODO: move alloc initialization to a common ggml_tallocr_new_impl function - ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer); - alloc->buffer_owned = true; - alloc->measure = true; - ggml_tallocr_reset(alloc); - return alloc; -} - -ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) { - return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend)); -} - -ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) { - // create a backend buffer to get the correct tensor allocation sizes - ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size); - ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer); - alloc->buffer_owned = true; - return alloc; -} - -ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) { - return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size); -} - -ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) { - ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr)); - - *alloc = (struct ggml_tallocr) { - /*.buffer = */ buffer, - /*.buffer_owned = */ false, - /*.base = */ ggml_backend_buffer_get_base(buffer), - /*.alignment = */ ggml_backend_buffer_get_alignment(buffer), - /*.n_free_blocks = */ 0, - /*.free_blocks = */ {{0}}, - /*.max_size = */ 0, - /*.measure = */ false, -#ifdef GGML_ALLOCATOR_DEBUG - /*.allocated_tensors = */ {0}, -#endif - }; - - ggml_tallocr_reset(alloc); - - return alloc; -} - -struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) { - return alloc->buffer; -} - -void ggml_tallocr_free(ggml_tallocr_t alloc) { - if (alloc == NULL) { - return; - } - - if (alloc->buffer_owned) { - ggml_backend_buffer_free(alloc->buffer); - } +static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) { free(alloc); } -bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) { - return alloc->measure; +static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) { + return alloc->max_size; } -size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) { - // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail - // to avoid this, we add a 10% margin to the buffer size - return alloc->max_size + alloc->max_size/10; -} + +///////////////////////////////////// // graph allocator struct hash_node { int n_children; int n_views; + int buffer_id; + size_t offset; // offset within the buffer + bool allocated; +}; + +// +struct tensor_alloc { + size_t offset; + size_t size_max; // 0 = pre-allocated, unused, or view +}; + +struct node_alloc { + int buffer_id; + struct tensor_alloc dst; + struct tensor_alloc src[GGML_MAX_SRC]; }; struct ggml_gallocr { - ggml_tallocr_t talloc; + ggml_backend_buffer_type_t * bufts; // [n_buffers] + ggml_backend_buffer_t * buffers; // [n_buffers] + struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers] + int n_buffers; + struct ggml_hash_set hash_set; - struct hash_node * hash_values; - size_t hash_values_size; - ggml_tallocr_t * hash_allocs; - int * parse_seq; - int parse_seq_len; + struct hash_node * hash_values; // [hash_set.size] + + struct node_alloc * node_allocs; // [n_nodes] + int n_nodes; }; -ggml_gallocr_t ggml_gallocr_new(void) { - ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr)); - - 
*galloc = (struct ggml_gallocr) { - /*.talloc = */ NULL, - /*.hash_set = */ {0}, - /*.hash_values = */ NULL, - /*.hash_values_size = */ 0, - /*.hash_allocs = */ NULL, - /*.parse_seq = */ NULL, - /*.parse_seq_len = */ 0, - }; +ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) { + ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1); + GGML_ASSERT(galloc != NULL); + + galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1); + GGML_ASSERT(galloc->bufts != NULL); + + galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1); + GGML_ASSERT(galloc->buffers != NULL); + + galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1); + GGML_ASSERT(galloc->buf_tallocs != NULL); + + for (int i = 0; i < n_bufs; i++) { + galloc->bufts[i] = bufts[i]; + galloc->buffers[i] = NULL; + size_t alignment = ggml_backend_buft_get_alignment(bufts[i]); + galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment); + } + galloc->n_buffers = n_bufs; return galloc; } +ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) { + return ggml_gallocr_new_n(&buft, 1); +} + void ggml_gallocr_free(ggml_gallocr_t galloc) { if (galloc == NULL) { return; } - if (galloc->hash_set.keys != NULL) { - free(galloc->hash_set.keys); - } - if (galloc->hash_values != NULL) { - free(galloc->hash_values); - } - if (galloc->hash_allocs != NULL) { - free(galloc->hash_allocs); - } - if (galloc->parse_seq != NULL) { - free(galloc->parse_seq); + for (int i = 0; i < galloc->n_buffers; i++) { + if (galloc->buffers != NULL) { + ggml_backend_buffer_free(galloc->buffers[i]); + } + if (galloc->buf_tallocs != NULL) { + ggml_dyn_tallocr_free(galloc->buf_tallocs[i]); + } } + + free(galloc->hash_set.keys); + free(galloc->hash_values); + free(galloc->bufts); + free(galloc->buffers); + free(galloc->buf_tallocs); + free(galloc->node_allocs); free(galloc); } -void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) { - free(galloc->parse_seq); - galloc->parse_seq = malloc(sizeof(int) * n); +typedef struct ggml_gallocr * ggml_gallocr_t; - for (int i = 0; i < n; i++) { - galloc->parse_seq[i] = list[i]; - } - galloc->parse_seq_len = n; -} - -static struct hash_node * hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) { +static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) { size_t i = ggml_hash_find_or_insert(galloc->hash_set, t); return &galloc->hash_values[i]; } -static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) { - if (a->type != b->type) { - return false; - } - for (int i = 0; i < GGML_MAX_DIMS; i++) { - if (a->ne[i] != b->ne[i]) { - return false; - } - if (a->nb[i] != b->nb[i]) { - return false; - } - } - return true; +static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) { + return ggml_gallocr_hash_get(galloc, t)->allocated; } -static bool ggml_op_can_inplace(enum ggml_op op) { - switch (op) { - case GGML_OP_SCALE: - case GGML_OP_DIAG_MASK_ZERO: - case GGML_OP_DIAG_MASK_INF: - case GGML_OP_ADD: - case GGML_OP_ADD1: - case GGML_OP_SUB: - case GGML_OP_MUL: - case GGML_OP_DIV: - case GGML_OP_SQR: - case GGML_OP_SQRT: - case GGML_OP_LOG: - case GGML_OP_UNARY: - case GGML_OP_ROPE: - case GGML_OP_RMS_NORM: - case GGML_OP_SOFT_MAX: - return true; - - default: - return false; - } +static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) { + struct hash_node * 
hn = ggml_gallocr_hash_get(galloc, node); + hn->buffer_id = buffer_id; + hn->offset = offset; + hn->allocated = true; } -static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * node) { - if (galloc->talloc != NULL) { - return galloc->talloc; - } - - return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)]; +static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) { + return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated; } -static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) { - ggml_tallocr_t alloc = node_tallocr(galloc, view); - - GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL); - if (update_backend) { - view->backend = view->view_src->backend; - } - // views are initialized in the alloc buffer rather than the view_src buffer - view->buffer = alloc->buffer; - view->data = (char *)view->view_src->data + view->view_offs; +static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) { + struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); - assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft); + if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) { + hn->allocated = true; + assert(hn->offset == 0); - if (!alloc->measure) { - ggml_backend_buffer_init_tensor(alloc->buffer, view); - } -} + // try to reuse a parent's buffer (inplace) + if (ggml_op_can_inplace(node->op)) { + for (int i = 0; i < GGML_MAX_SRC; i++) { + struct ggml_tensor * parent = node->src[i]; + if (parent == NULL) { + break; + } -static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) { - ggml_tallocr_t alloc = node_tallocr(galloc, node); + // if the node's data is external, then we cannot re-use it + if (!ggml_gallocr_is_own(galloc, parent)) { + AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); + continue; + } - if (node->data == NULL) { - if (ggml_is_view(node)) { - init_view(galloc, node, true); - } else { - // see if we can reuse a parent's buffer (inplace) - if (ggml_op_can_inplace(node->op)) { - for (int i = 0; i < GGML_MAX_SRC; i++) { - struct ggml_tensor * parent = node->src[i]; - if (parent == NULL) { - break; - } + // outputs cannot be reused + if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) { + AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name); + continue; + } - // if the node's data is external, then we cannot re-use it - if (ggml_tallocr_is_own(alloc, parent) == false) { - AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data); - continue; - } + if (!ggml_are_same_layout(node, parent)) { + AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name); + continue; + } - struct hash_node * p_hn = hash_get(galloc, parent); - if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) { - if (ggml_is_view(parent)) { - struct ggml_tensor * view_src = parent->view_src; - struct hash_node * view_src_hn = hash_get(galloc, view_src); - if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { - // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite - // the parent's data 
that it will need later (same layout requirement). the problem is that then - // we cannot free the tensor because the original address of the allocation is lost. - // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views - // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data) - AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); - node->view_src = view_src; - view_src_hn->n_views += 1; - init_view(galloc, node, false); - return; - } - } else { - AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); - node->view_src = parent; - p_hn->n_views += 1; - init_view(galloc, node, false); + struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent); + if (p_hn->n_children == 1 && p_hn->n_views == 0) { + if (ggml_is_view(parent)) { + struct ggml_tensor * view_src = parent->view_src; + struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src); + if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) { + AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name); + assert(view_src_hn->offset == p_hn->offset); + hn->buffer_id = p_hn->buffer_id; + hn->offset = p_hn->offset; + p_hn->allocated = false; // avoid freeing the parent + view_src_hn->allocated = false; return; } + } else { + AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name); + hn->buffer_id = p_hn->buffer_id; + hn->offset = p_hn->offset; + p_hn->allocated = false; // avoid freeing the parent + return; } } } - ggml_tallocr_alloc(alloc, node); } + // allocate tensor from the buffer + struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; + ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; + size_t size = ggml_backend_buft_get_alloc_size(buft, node); + size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node); + hn->buffer_id = buffer_id; + hn->offset = offset; + return; } } -static void free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) { - ggml_tallocr_t alloc = node_tallocr(galloc, node); +static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) { + // graph outputs are never freed + if (node->flags & GGML_TENSOR_FLAG_OUTPUT) { + AT_PRINTF("not freeing output %s\n", node->name); + return; + } - ggml_tallocr_free_tensor(alloc, node); + struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id]; + ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id]; + struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); + size_t offset = hn->offset; + size_t size = ggml_backend_buft_get_alloc_size(buft, node); + ggml_dyn_tallocr_free_tensor(alloc, offset, size, node); + hn->allocated = false; } -static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * gf) { - const int * parse_seq = galloc->parse_seq; - int parse_seq_len = galloc->parse_seq_len; +static int get_node_buffer_id(const int * node_buffer_ids, int i) { + return node_buffer_ids ? 
node_buffer_ids[i] : 0; +} + +static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) { + // clear hash tables + memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *)); + memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node)); + + // allocate all graph inputs first to avoid overwriting them + for (int i = 0; i < graph->n_nodes; i++) { + if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) { + ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i)); + } + for (int j = 0; j < GGML_MAX_SRC; j++) { + if (graph->nodes[i]->src[j] == NULL) { + break; + } + if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) { + ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i)); + } + } + } // count number of children and views - for (int i = 0; i < gf->n_nodes; i++) { - struct ggml_tensor * node = gf->nodes[i]; + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view(node)) { struct ggml_tensor * view_src = node->view_src; - hash_get(galloc, view_src)->n_views += 1; - if (node->buffer == NULL && node->data != NULL) { - // view of a pre-allocated tensor, didn't call init_view() yet - init_view(galloc, node, true); - } + ggml_gallocr_hash_get(galloc, view_src)->n_views += 1; } for (int j = 0; j < GGML_MAX_SRC; j++) { @@ -554,227 +573,283 @@ static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr if (parent == NULL) { break; } - hash_get(galloc, parent)->n_children += 1; - if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) { - init_view(galloc, parent, true); - } + ggml_gallocr_hash_get(galloc, parent)->n_children += 1; } } // allocate tensors - // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers - int last_barrier_pos = 0; - int n_nodes = parse_seq_len ? parse_seq_len : gf->n_nodes; - - for (int ind = 0; ind < n_nodes; ind++) { - // allocate a node if there is no parse_seq or this is not a barrier - if (parse_seq_len == 0 || parse_seq[ind] != -1) { - int i = parse_seq_len ? 
parse_seq[ind] : ind; - struct ggml_tensor * node = gf->nodes[i]; - - // allocate parents (leafs) - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - allocate_node(galloc, parent); + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + int buffer_id = get_node_buffer_id(node_buffer_ids, i); + + // allocate parents (only leafs need to be allocated at this point) + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; } + ggml_gallocr_allocate_node(galloc, parent, buffer_id); + } - // allocate node - allocate_node(galloc, node); + // allocate node + ggml_gallocr_allocate_node(galloc, node, buffer_id); - AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name); - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - AT_PRINTF("%s", parent->name); - if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { - AT_PRINTF(", "); - } + AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name); + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + AT_PRINTF("%s", parent->name); + if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) { + AT_PRINTF(", "); } - AT_PRINTF("\n"); } + AT_PRINTF("\n"); // update parents - // update immediately if there is no parse_seq - // update only at barriers if there is parse_seq - if ((parse_seq_len == 0) || parse_seq[ind] == -1) { - int update_start = parse_seq_len ? last_barrier_pos : ind; - int update_end = parse_seq_len ? ind : ind + 1; - for (int i = update_start; i < update_end; i++) { - int node_i = parse_seq_len ? 
parse_seq[i] : i; - struct ggml_tensor * node = gf->nodes[node_i]; - - for (int j = 0; j < GGML_MAX_SRC; j++) { - struct ggml_tensor * parent = node->src[j]; - if (parent == NULL) { - break; - } - struct hash_node * p_hn = hash_get(galloc, parent); - p_hn->n_children -= 1; - - //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views); - - if (p_hn->n_children == 0 && p_hn->n_views == 0) { - if (ggml_is_view(parent)) { - struct ggml_tensor * view_src = parent->view_src; - struct hash_node * view_src_hn = hash_get(galloc, view_src); - view_src_hn->n_views -= 1; - AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views); - if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0) { - free_node(galloc, view_src); - } - } - else { - free_node(galloc, parent); - } + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * parent = node->src[j]; + if (parent == NULL) { + break; + } + struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent); + p_hn->n_children -= 1; + + AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n", + parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated); + + if (p_hn->n_children == 0 && p_hn->n_views == 0) { + if (ggml_is_view(parent)) { + struct ggml_tensor * view_src = parent->view_src; + struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src); + view_src_hn->n_views -= 1; + AT_PRINTF("view_src %s: %d children, %d views\n", + view_src->name, view_src_hn->n_children, view_src_hn->n_views); + if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) { + ggml_gallocr_free_node(galloc, view_src, buffer_id); } } + else if (p_hn->allocated) { + ggml_gallocr_free_node(galloc, parent, buffer_id); + } } AT_PRINTF("\n"); - if (parse_seq_len) { - last_barrier_pos = ind + 1; - } } } } -size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph) { +bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) { size_t hash_size = graph->visited_hash_table.size; - // check if the hash table is initialized and large enough + // initialize hash table if (galloc->hash_set.size < hash_size) { - if (galloc->hash_set.keys != NULL) { - free(galloc->hash_set.keys); - } - if (galloc->hash_values != NULL) { - free(galloc->hash_values); - } - galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size); + free(galloc->hash_set.keys); + free(galloc->hash_values); galloc->hash_set.size = hash_size; - galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size); + galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size); + galloc->hash_values = calloc(sizeof(struct hash_node), hash_size); + GGML_ASSERT(galloc->hash_set.keys != NULL); + GGML_ASSERT(galloc->hash_values != NULL); + } else { + // reset hash table + memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size); + memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size); } - // reset hash table - memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * hash_size); - memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size); - - galloc->talloc = talloc; - ggml_tallocr_alloc_graph_impl(galloc, graph); - galloc->talloc = NULL; - - size_t max_size = ggml_tallocr_max_size(talloc); - - return max_size; -} - -void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct 
ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) { - const size_t hash_size = hash_set.size; - - GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs)); + // reset allocators + for (int i = 0; i < galloc->n_buffers; i++) { + ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]); + } - galloc->talloc = NULL; + // allocate in hash table + ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids); - // alloc hash_values if needed - if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) { - free(galloc->hash_values); - galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size); - galloc->hash_values_size = hash_size; + // set the node_allocs from the hash table + if (galloc->n_nodes < graph->n_nodes) { + free(galloc->node_allocs); + galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes); + GGML_ASSERT(galloc->node_allocs != NULL); } - - // free hash_set.keys if needed - if (galloc->hash_set.keys != NULL) { - free(galloc->hash_set.keys); + galloc->n_nodes = graph->n_nodes; + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + struct node_alloc * node_alloc = &galloc->node_allocs[i]; + node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i); + if (node->view_src || node->data) { + node_alloc->dst.offset = SIZE_MAX; + node_alloc->dst.size_max = 0; + } else { + struct hash_node * hn = ggml_gallocr_hash_get(galloc, node); + node_alloc->dst.offset = hn->offset; + node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node); + } + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (!src || src->view_src || src->data) { + node_alloc->src[j].offset = SIZE_MAX; + node_alloc->src[j].size_max = 0; + } else { + struct hash_node * hn = ggml_gallocr_hash_get(galloc, src); + node_alloc->src[j].offset = hn->offset; + node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src); + } + } } - galloc->hash_set = hash_set; - // reset hash values - memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size); + // reallocate buffers if needed + for (int i = 0; i < galloc->n_buffers; i++) { + size_t cur_size = galloc->buffers[i] ? 
ggml_backend_buffer_get_size(galloc->buffers[i]) : 0; + size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]); - galloc->hash_allocs = hash_node_talloc; - - ggml_tallocr_alloc_graph_impl(galloc, graph); + if (new_size > cur_size) { +#ifndef NDEBUG + fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0); +#endif + ggml_backend_buffer_free(galloc->buffers[i]); + galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size); + if (galloc->buffers[i] == NULL) { + fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size); + return false; + } + } + } - // remove unowned resources - galloc->hash_set.keys = NULL; - galloc->hash_allocs = NULL; + return true; } -// legacy API wrapper - -struct ggml_allocr { - ggml_tallocr_t talloc; - ggml_gallocr_t galloc; -}; - -static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) { - ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr)); - *alloc = (struct ggml_allocr) { - /*.talloc = */ talloc, - /*.galloc = */ ggml_gallocr_new(), - }; - return alloc; +bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) { + return ggml_gallocr_reserve_n(galloc, graph, NULL); } -ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment) { - return ggml_allocr_new_impl(ggml_tallocr_new(data, size, alignment)); -} +static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) { + assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max); -ggml_allocr_t ggml_allocr_new_measure(size_t alignment) { - return ggml_allocr_new_impl(ggml_tallocr_new_measure(alignment)); -} + if (node->view_src != NULL) { + if (node->buffer == NULL) { + assert(tensor_alloc->offset == SIZE_MAX); + if (node->view_src->buffer == NULL) { + // this tensor was allocated without ggml-backend + return; + } + ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node); + } + } else { + if (node->data == NULL) { + assert(tensor_alloc->offset != SIZE_MAX); + assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max); + void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]); + void * addr = (char *)base + tensor_alloc->offset; + ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr); + } else { + if (node->buffer == NULL) { + // this tensor was allocated without ggml-backend + return; + } -ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) { - return ggml_allocr_new_impl(ggml_tallocr_new_from_buffer(buffer)); +#ifndef NDEBUG + size_t offset = + (char *)node->data - + (char *)ggml_backend_buffer_get_base(node->buffer); + size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node); + assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset); + assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max); +#endif + } + } } -ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size) { - return ggml_allocr_new_impl(ggml_tallocr_new_from_backend(backend, size)); +static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct 
ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) { + ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id]; + size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node); + return talloc->size_max >= node_size; } -ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend) { - return ggml_allocr_new_impl(ggml_tallocr_new_measure_from_backend(backend)); -} +static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) { + if (galloc->n_nodes != graph->n_nodes) { +#ifndef NDEBUG + fprintf(stderr, "%s: graph has different number of nodes\n", __func__); +#endif + return true; + } -struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc) { - return ggml_tallocr_get_buffer(alloc->talloc); -} + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + struct node_alloc * node_alloc = &galloc->node_allocs[i]; -void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) { - ggml_gallocr_set_parse_seq(alloc->galloc, list, n); -} + if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) { +#ifndef NDEBUG + fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name); +#endif + return true; + } -void ggml_allocr_free(ggml_allocr_t alloc) { - if (alloc == NULL) { - return; + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) { +#ifndef NDEBUG + fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name); +#endif + return true; + } + } } - ggml_gallocr_free(alloc->galloc); - ggml_tallocr_free(alloc->talloc); - free(alloc); + return false; } -bool ggml_allocr_is_measure(ggml_allocr_t alloc) { - return ggml_tallocr_is_measure(alloc->talloc); -} +bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) { + if (ggml_gallocr_needs_realloc(galloc, graph)) { + if (galloc->n_buffers == 1) { +#ifndef NDEBUG + fprintf(stderr, "%s: reallocating buffers automatically\n", __func__); +#endif + if (!ggml_gallocr_reserve(galloc, graph)) { + return false; + } + } else { +#ifndef NDEBUG + fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__); +#endif + return false; + } + } -void ggml_allocr_reset(ggml_allocr_t alloc) { - ggml_tallocr_reset(alloc->talloc); -} + // reset buffers + for (int i = 0; i < galloc->n_buffers; i++) { + // zero size buffers are not allocated + if (galloc->buffers[i] != NULL) { + ggml_backend_buffer_reset(galloc->buffers[i]); + } + } -void ggml_allocr_alloc(ggml_allocr_t alloc, struct ggml_tensor * tensor) { - ggml_tallocr_alloc(alloc->talloc, tensor); -} + // allocate the graph tensors from the previous assignments + for (int i = 0; i < graph->n_nodes; i++) { + struct ggml_tensor * node = graph->nodes[i]; + struct node_alloc * node_alloc = &galloc->node_allocs[i]; + for (int j = 0; j < GGML_MAX_SRC; j++) { + struct ggml_tensor * src = node->src[j]; + if (src == NULL) { + break; + } + ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]); + } + ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst); + } -size_t ggml_allocr_max_size(ggml_allocr_t alloc) { - return ggml_tallocr_max_size(alloc->talloc); + return true; } -size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct 
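// ---------------------------------------------------------------------------
// Editor's note, not part of the patch: a minimal usage sketch of the ggml_gallocr
// API that replaces the legacy ggml_allocr measure workflow being removed here.
// `build_graph` stands for any user function that builds a graph in a no_alloc
// context, and `create_compute_allocr` is an illustrative helper; include paths
// may differ between projects.
#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"
#include <stdio.h>

static ggml_gallocr_t create_compute_allocr(ggml_backend_t backend, struct ggml_cgraph * worst_case_graph) {
    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
    // reserve the compute buffer for the worst-case graph;
    // the buffer is reallocated later if a graph turns out to need more memory
    if (!ggml_gallocr_reserve(allocr, worst_case_graph)) {
        ggml_gallocr_free(allocr);
        return NULL;
    }
    fprintf(stderr, "compute buffer size: %.2f MB\n", ggml_gallocr_get_buffer_size(allocr, 0)/1024.0/1024.0);
    return allocr;
}

// per evaluation: rebuild the graph, then allocate its tensors before computing
//   struct ggml_cgraph * gf = build_graph(...);
//   ggml_gallocr_alloc_graph(allocr, gf);
// ---------------------------------------------------------------------------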
ggml_cgraph * graph) { - return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph); +size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) { + GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers); + + if (galloc->buffers[buffer_id] == NULL) { + return 0; + } + return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]); } // utils @@ -795,17 +870,17 @@ static bool alloc_tensor_range(struct ggml_context * ctx, return false; } - ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer); + struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer); for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) { if (t->data == NULL) { if (t->view_src == NULL) { ggml_tallocr_alloc(tallocr, t); - } else { + } else if (t->buffer == NULL) { ggml_backend_view_init(buffer, t); } } else { - if (t->view_src != NULL) { + if (t->view_src != NULL && t->buffer == NULL) { // view of a pre-allocated tensor ggml_backend_view_init(buffer, t); } @@ -838,7 +913,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } if (this_size > max_size) { - // tensor is too large to fit in a single buffer fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n", __func__, t->name, ggml_backend_buft_name(buft), @@ -870,7 +944,6 @@ ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_conte } if (n_buffers == 0) { - // all the tensors in the context are already allocated #ifndef NDEBUG fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__); #endif diff --git a/src/ggml-backend.c b/src/ggml-backend.c index 532da8ed..c0d89d31 100644 --- a/src/ggml-backend.c +++ b/src/ggml-backend.c @@ -475,6 +475,8 @@ ggml_backend_buffer_t ggml_backend_reg_alloc_buffer(size_t i, size_t size) { // backend CPU +static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 + GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) { return "CPU"; @@ -482,7 +484,14 @@ GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t } GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) { - return (void *)buffer->context; + uintptr_t data = (uintptr_t)buffer->context; + + // align the buffer + if (data % TENSOR_ALIGNMENT != 0) { + data = GGML_PAD(data, TENSOR_ALIGNMENT); + } + + return (void *)data; } GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) { @@ -540,8 +549,6 @@ static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = { /* .reset = */ NULL, }; -static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512 - GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) { return "CPU"; @@ -550,9 +557,11 @@ GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) { size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned - void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC? 
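// ---------------------------------------------------------------------------
// Editor's note, not part of the patch: the hunks around this point turn
// allocation failures from hard asserts into NULL returns, so callers are now
// expected to check the results. A minimal sketch assuming the usual ggml-backend
// headers; the helper name is illustrative only.
static ggml_backend_t example_init_cpu_backend(size_t buf_size) {
    ggml_backend_t backend = ggml_backend_cpu_init(); // now returns NULL on failure
    if (backend == NULL) {
        fprintf(stderr, "failed to initialize CPU backend\n");
        return NULL;
    }
    // buffer allocation failures are reported the same way
    ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(ggml_backend_get_default_buffer_type(backend), buf_size);
    if (buf == NULL) {
        fprintf(stderr, "failed to allocate a buffer of %zu bytes\n", buf_size);
        ggml_backend_free(backend);
        return NULL;
    }
    ggml_backend_buffer_free(buf); // allocated only to demonstrate the check
    return backend;
}
// ---------------------------------------------------------------------------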
- - GGML_ASSERT(data != NULL && "failed to allocate buffer"); + void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h) + if (data == NULL) { + fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size); + return NULL; + } return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size); } @@ -766,6 +775,9 @@ static struct ggml_backend_i cpu_backend_i = { ggml_backend_t ggml_backend_cpu_init(void) { struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context)); + if (ctx == NULL) { + return NULL; + } ctx->n_threads = GGML_DEFAULT_N_THREADS; ctx->work_data = NULL; @@ -774,6 +786,10 @@ ggml_backend_t ggml_backend_cpu_init(void) { ctx->abort_callback_data = NULL; ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend)); + if (cpu_backend == NULL) { + free(ctx); + return NULL; + } *cpu_backend = (struct ggml_backend) { /* .interface = */ cpu_backend_i, @@ -865,6 +881,8 @@ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_back ctx->n_buffers = n_buffers; ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t)); + GGML_ASSERT(ctx->buffers != NULL); + size_t total_size = 0; for (size_t i = 0; i < n_buffers; i++) { ctx->buffers[i] = buffers[i]; @@ -886,6 +904,18 @@ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, } } +// creates a copy of the tensor with the same memory layout +static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) { + struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor); + for (int i = 0; i < GGML_MAX_DIMS; i++) { + dup->nb[i] = tensor->nb[i]; + } + return dup; +} + +static bool ggml_is_view_op(enum ggml_op op) { + return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE; +} // scheduler @@ -894,7 +924,7 @@ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, #define GGML_MAX_SPLIT_INPUTS 16 struct ggml_backend_sched_split { - ggml_tallocr_t tallocr; + int backend_id; int i_start; int i_end; struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS]; @@ -909,15 +939,17 @@ struct ggml_backend_sched { int n_backends; ggml_backend_t backends[GGML_MAX_BACKENDS]; ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS]; - ggml_tallocr_t tallocs[GGML_MAX_BACKENDS]; ggml_gallocr_t galloc; // hash keys of the nodes in the graph struct ggml_hash_set hash_set; - // hash values (arrays of [hash_set.size]) - ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend) - struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend + // hash values + int * tensor_backend_id; + struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS]; + + int * node_backend_ids; // [n_nodes] + int n_nodes; // copy of the graph with modified inputs struct ggml_cgraph * graph; @@ -927,77 +959,46 @@ struct ggml_backend_sched { struct ggml_context * ctx; + ggml_backend_sched_eval_callback callback_eval; + void * callback_eval_user_data; + // align context_buffer to GGML_MEM_ALIGN #ifdef _MSC_VER __declspec(align(GGML_MEM_ALIGN)) #else __attribute__((aligned(GGML_MEM_ALIGN))) #endif - char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)]; - - ggml_backend_sched_eval_callback callback_eval; - void * callback_eval_user_data; + char 
context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)]; }; #define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node) -#define node_allocr(node) sched->node_talloc[hash_id(node)] - -static bool ggml_is_view_op(enum ggml_op op) { - return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE; -} +#define tensor_backend_id(node) sched->tensor_backend_id[hash_id(node)] +#define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)]) -// returns the priority of the backend, lower is better -static int sched_backend_prio(ggml_backend_sched_t sched, ggml_backend_t backend) { +// returns the priority of the backend, lower id is higher priority +static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) { for (int i = 0; i < sched->n_backends; i++) { if (sched->backends[i] == backend) { return i; } } - return INT_MAX; + return -1; } -static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr) { - for (int i = 0; i < sched->n_backends; i++) { - if (sched->tallocs[i] == allocr) { - return i; - } - } - return INT_MAX; -} - -static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) { +static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) { if (buffer == NULL) { - return NULL; - } - - // check if this is already allocate in a allocr buffer (from user manual allocations) - for (int i = 0; i < sched->n_backends; i++) { - if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) { - return sched->tallocs[i]; - } + return -1; } // find highest prio backend that supports the buffer type for (int i = 0; i < sched->n_backends; i++) { if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) { - return sched->tallocs[i]; + return i; } } GGML_ASSERT(false && "tensor buffer type not supported by any backend"); } -static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_tallocr_t allocr) { - if (allocr == NULL) { - return NULL; - } - for (int i = 0; i < sched->n_backends; i++) { - if (sched->tallocs[i] == allocr) { - return sched->backends[i]; - } - } - GGML_UNREACHABLE(); -} - #if 0 static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only #define SET_CAUSE(node, ...) 
sprintf(causes[hash_id(node)], __VA_ARGS__) @@ -1008,37 +1009,39 @@ static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_I #endif // returns the backend that should be used for the node based on the current locations -static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) { +static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) { + // TODO: use supports_op to check if the backend supports the op + // assign pre-allocated nodes to their backend // dst - ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer); - if (cur_allocr != NULL) { + int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->buffer); + if (cur_backend != -1) { SET_CAUSE(node, "1.dst"); - return cur_allocr; + return cur_backend; } // view_src - if (node->view_src != NULL) { - cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer); - if (cur_allocr != NULL) { + if (tensor->view_src != NULL) { + cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src->buffer); + if (cur_backend != -1) { SET_CAUSE(node, "1.vsrc"); - return cur_allocr; + return cur_backend; } } // assign nodes that use weights to the backend of the weights for (int i = 0; i < GGML_MAX_SRC; i++) { - const struct ggml_tensor * src = node->src[i]; + const struct ggml_tensor * src = tensor->src[i]; if (src == NULL) { break; } if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) { - ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer); + int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer); // operations with weights are always run on the same backend as the weights SET_CAUSE(node, "1.wgt%d", i); - return src_allocr; + return src_backend; } } - return NULL; + return -1; } static char * fmt_size(size_t size) { @@ -1051,11 +1054,11 @@ static char * fmt_size(size_t size) { return buffer; } -static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { +static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { int cur_split = 0; for (int i = 0; i < graph->n_nodes; i++) { if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) { - ggml_backend_t split_backend = get_allocr_backend(sched, sched->splits[cur_split].tallocr); + ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id]; fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend), sched->splits[cur_split].n_inputs); for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) { @@ -1069,17 +1072,15 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra if (ggml_is_view_op(node->op)) { continue; } - ggml_tallocr_t node_allocr = node_allocr(node); - ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME: + ggml_backend_t tensor_backend = tensor_backend(node); fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name, - fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node)); + fmt_size(ggml_nbytes(node)), tensor_backend ? 
ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node)); for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { break; } - ggml_tallocr_t src_allocr = node_allocr(src); - ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL; + ggml_backend_t src_backend = tensor_backend(src); fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name, fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src)); } @@ -1087,23 +1088,13 @@ static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgra } } -// creates a copy of the tensor with the same memory layout -static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) { - struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor); - for (int i = 0; i < GGML_MAX_DIMS; i++) { - dup->nb[i] = tensor->nb[i]; - } - return dup; -} - - //#define DEBUG_PASS1 //#define DEBUG_PASS2 //#define DEBUG_PASS3 //#define DEBUG_PASS4 // assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend -static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { +static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { // reset splits sched->n_splits = 0; sched->is_reset = false; @@ -1125,28 +1116,28 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g // pass 1: assign backends to ops with pre-allocated inputs for (int i = 0; i < graph->n_leafs; i++) { struct ggml_tensor * leaf = graph->leafs[i]; - if (node_allocr(leaf) != NULL) { + if (tensor_backend_id(leaf) != -1) { // do not overwrite user assignments continue; } - node_allocr(leaf) = sched_allocr_from_cur(sched, leaf); + tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf); } for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - if (node_allocr(node) != NULL) { + if (tensor_backend_id(node) != -1) { // do not overwrite user assignments continue; } - node_allocr(node) = sched_allocr_from_cur(sched, node); + tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node); // src for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { break; } - if (node_allocr(src) == NULL) { - node_allocr(src) = sched_allocr_from_cur(sched, src); + if (tensor_backend_id(src) == -1) { + tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src); } } } @@ -1161,22 +1152,22 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g // pass 2.1 expand gpu up { - ggml_tallocr_t cur_allocr = NULL; + int cur_backend_id = -1; for (int i = graph->n_nodes - 1; i >= 0; i--) { struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { continue; } - ggml_tallocr_t node_allocr = node_allocr(node); - if (node_allocr != NULL) { - if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) { + int tensor_backend_id = tensor_backend_id(node); + if (tensor_backend_id != -1) { + if (tensor_backend_id == sched->n_backends - 1) { // skip cpu (lowest prio backend) - cur_allocr = NULL; + cur_backend_id = -1; } else { - cur_allocr = node_allocr; + cur_backend_id = tensor_backend_id; } } else { - node_allocr(node) = cur_allocr; + tensor_backend_id(node) = cur_backend_id; SET_CAUSE(node, "2.1"); } } @@ -1184,22 +1175,22 @@ static void 
sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g // pass 2.2 expand gpu down { - ggml_tallocr_t cur_allocr = NULL; + int cur_backend_id = -1; for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { continue; } - ggml_tallocr_t node_allocr = node_allocr(node); - if (node_allocr != NULL) { - if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) { + int tensor_backend_id = tensor_backend_id(node); + if (tensor_backend_id != -1) { + if (tensor_backend_id == sched->n_backends - 1) { // skip cpu (lowest prio backend) - cur_allocr = NULL; + cur_backend_id = -1; } else { - cur_allocr = node_allocr; + cur_backend_id = tensor_backend_id; } } else { - node_allocr(node) = cur_allocr; + tensor_backend_id(node) = cur_backend_id; SET_CAUSE(node, "2.2"); } } @@ -1207,17 +1198,17 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g // pass 2.3 expand rest up { - ggml_tallocr_t cur_allocr = NULL; + int cur_backend_id = -1; for (int i = graph->n_nodes - 1; i >= 0; i--) { struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { continue; } - ggml_tallocr_t node_allocr = node_allocr(node); - if (node_allocr != NULL) { - cur_allocr = node_allocr; + int tensor_backend_id = tensor_backend_id(node); + if (tensor_backend_id != -1) { + cur_backend_id = tensor_backend_id; } else { - node_allocr(node) = cur_allocr; + tensor_backend_id(node) = cur_backend_id; SET_CAUSE(node, "2.3"); } } @@ -1225,17 +1216,17 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g // pass 2.4 expand rest down { - ggml_tallocr_t cur_allocr = NULL; + int cur_backend_id = -1; for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (ggml_is_view_op(node->op)) { continue; } - ggml_tallocr_t node_allocr = node_allocr(node); - if (node_allocr != NULL) { - cur_allocr = node_allocr; + int tensor_backend_id = tensor_backend_id(node); + if (tensor_backend_id != -1) { + cur_backend_id = tensor_backend_id; } else { - node_allocr(node) = cur_allocr; + tensor_backend_id(node) = cur_backend_id; SET_CAUSE(node, "2.4"); } } @@ -1247,9 +1238,9 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g // pass 3: assign backends to remaining src from dst and view_src for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - ggml_tallocr_t cur_allocr = node_allocr(node); - if (node->view_src != NULL && cur_allocr == NULL) { - cur_allocr = node_allocr(node) = node_allocr(node->view_src); + int cur_backend_id = tensor_backend_id(node); + if (node->view_src != NULL && cur_backend_id == -1) { + cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src); SET_CAUSE(node, "3.vsrc"); } for (int j = 0; j < GGML_MAX_SRC; j++) { @@ -1257,14 +1248,14 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g if (src == NULL) { break; } - ggml_tallocr_t src_allocr = node_allocr(src); - if (src_allocr == NULL) { + int src_backend_id = tensor_backend_id(src); + if (src_backend_id == -1) { if (src->view_src != NULL) { // views are always on the same backend as the source - node_allocr(src) = node_allocr(src->view_src); + tensor_backend_id(src) = tensor_backend_id(src->view_src); SET_CAUSE(src, "3.vsrc"); } else { - node_allocr(src) = cur_allocr; + tensor_backend_id(src) = cur_backend_id; SET_CAUSE(src, "3.cur"); } } @@ -1281,15 +1272,14 @@ static void 
sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; if (!ggml_is_view_op(node->op)) { - sched->splits[0].tallocr = node_allocr(node); + sched->splits[0].backend_id = tensor_backend_id(node); break; } } sched->splits[0].i_start = 0; sched->splits[0].n_inputs = 0; memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK - ggml_tallocr_t cur_allocr = sched->splits[0].tallocr; - size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr); + int cur_backend_id = sched->splits[0].backend_id; for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; @@ -1297,19 +1287,18 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g continue; } - ggml_tallocr_t node_allocr = node_allocr(node); + int tensor_backend_id = tensor_backend_id(node); - GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now + GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now - if (node_allocr != cur_allocr) { + if (tensor_backend_id != cur_backend_id) { sched->splits[cur_split].i_end = i; cur_split++; GGML_ASSERT(cur_split < GGML_MAX_SPLITS); - sched->splits[cur_split].tallocr = node_allocr; + sched->splits[cur_split].backend_id = tensor_backend_id; sched->splits[cur_split].i_start = i; sched->splits[cur_split].n_inputs = 0; - cur_allocr = node_allocr; - cur_backend_id = sched_allocr_prio(sched, cur_allocr); + cur_backend_id = tensor_backend_id; } // find inputs that are not on the same backend @@ -1318,43 +1307,25 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g if (src == NULL) { break; } - ggml_tallocr_t src_allocr = node_allocr(src); - GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now - if (src_allocr != node_allocr) { + int src_backend_id = tensor_backend_id(src); + assert(src_backend_id != -1); // all inputs should be assigned by now + if (src_backend_id != tensor_backend_id) { // create a copy of the input in the split's backend size_t id = hash_id(src); - if (sched->node_copies[id][cur_backend_id] == NULL) { - ggml_backend_t backend = get_allocr_backend(sched, cur_allocr); + if (sched->tensor_copies[id][cur_backend_id] == NULL) { + ggml_backend_t backend = sched->backends[cur_backend_id]; struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src); ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name); - sched->node_copies[id][cur_backend_id] = tensor_copy; - node_allocr(tensor_copy) = cur_allocr; + sched->tensor_copies[id][cur_backend_id] = tensor_copy; + tensor_backend_id(tensor_copy) = cur_backend_id; SET_CAUSE(tensor_copy, "4.cpy"); int n_inputs = sched->splits[cur_split].n_inputs++; GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS); sched->splits[cur_split].inputs[n_inputs] = src; } - node->src[j] = sched->node_copies[id][cur_backend_id]; - -#if 0 - // check if the input is already in the split - bool found = false; - for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) { - if (sched->splits[cur_split].inputs[k] == src) { - found = true; - break; - } - } - - if (!found) { - int n_inputs = sched->splits[cur_split].n_inputs++; - //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr))); - GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS); - sched->splits[cur_split].inputs[n_inputs] = src; - } -#endif + node->src[j] = 
sched->tensor_copies[id][cur_backend_id]; } } } @@ -1369,30 +1340,30 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g // sanity check: all sources should have the same backend as the node for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - ggml_tallocr_t node_allocr = node_allocr(node); - if (node_allocr == NULL) { + ggml_backend_t tensor_backend = tensor_backend(node); + if (tensor_backend == NULL) { fprintf(stderr, "!!!!!!! %s has no backend\n", node->name); } - if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) { + if (node->view_src != NULL && tensor_backend != tensor_backend(node->view_src)) { fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n", - node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL", - node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL"); + node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", + node->view_src->name, tensor_backend(node->view_src) ? ggml_backend_name(tensor_backend(node->view_src)) : "NULL"); } for (int j = 0; j < GGML_MAX_SRC; j++) { struct ggml_tensor * src = node->src[j]; if (src == NULL) { break; } - ggml_tallocr_t src_allocr = node_allocr(src); - if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now + ggml_backend_t src_backend = tensor_backend(src); + if (src_backend != tensor_backend /* && src_backend != NULL */) { fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n", - node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL", - j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL"); + node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", + j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL"); } - if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) { + if (src->view_src != NULL && src_backend != tensor_backend(src->view_src)) { fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n", - src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL", - src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL"); + src->name, src_backend ? ggml_backend_name(src_backend) : "NULL", + src->view_src->name, tensor_backend(src->view_src) ? 
ggml_backend_name(tensor_backend(src->view_src)) : "NULL"); } } } @@ -1406,32 +1377,43 @@ static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * g struct ggml_backend_sched_split * split = &sched->splits[i]; split->graph = ggml_graph_view(graph, split->i_start, split->i_end); - // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split for (int j = 0; j < split->n_inputs; j++) { struct ggml_tensor * input = split->inputs[j]; - struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)]; + struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id]; + // add a dependency to the input source so that it is not freed before the copy is done - GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input); - input_cpy->src[0] = input; + struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input); + sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input); + graph_copy->nodes[graph_copy->n_nodes++] = input_dep; + + // add a dependency to the input copy so that it is allocated at the start of the split + sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id; graph_copy->nodes[graph_copy->n_nodes++] = input_cpy; } for (int j = split->i_start; j < split->i_end; j++) { + sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]); graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j]; } } sched->graph = graph_copy; } -static void sched_alloc_splits(ggml_backend_sched_t sched) { - ggml_gallocr_alloc_graph_n( - sched->galloc, - sched->graph, - sched->hash_set, - sched->node_talloc); +static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) { + // ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids); + if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { +#ifndef NDEBUG + fprintf(stderr, "ggml_backend_sched: failed to allocate graph, reserving\n"); +#endif + ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids); + if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) { + fprintf(stderr, "ggml_backend_sched: failed to allocate graph\n"); + return false; + } + } } -static void sched_compute_splits(ggml_backend_sched_t sched) { +static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) { uint64_t copy_us[GGML_MAX_BACKENDS] = {0}; uint64_t compute_us[GGML_MAX_BACKENDS] = {0}; @@ -1439,20 +1421,18 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { for (int i = 0; i < sched->n_splits; i++) { struct ggml_backend_sched_split * split = &splits[i]; - ggml_backend_t split_backend = get_allocr_backend(sched, split->tallocr); - int split_backend_id = sched_backend_prio(sched, split_backend); + int split_backend_id = split->backend_id; + ggml_backend_t split_backend = sched->backends[split_backend_id]; // copy the input tensors to the split backend uint64_t copy_start_us = ggml_time_us(); for (int j = 0; j < split->n_inputs; j++) { struct ggml_tensor * input = split->inputs[j]; - struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id]; + struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id]; GGML_ASSERT(input->buffer != NULL); GGML_ASSERT(input_cpy->buffer != NULL); - // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change - // this is important to avoid copying constants such as KQ_mask and 
inp_pos multiple times ggml_backend_tensor_copy_async(split_backend, input, input_cpy); } //ggml_backend_synchronize(split_backend); // necessary to measure copy time @@ -1468,7 +1448,9 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { uint64_t compute_start_us = ggml_time_us(); if (!sched->callback_eval) { - ggml_backend_graph_compute(split_backend, &split->graph); + if (!ggml_backend_graph_compute(split_backend, &split->graph)) { + return false; + } //ggml_backend_synchronize(split_backend); // necessary to measure compute time } else { // similar to ggml_backend_compare_graph_backend @@ -1488,7 +1470,9 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1); - ggml_backend_graph_compute(split_backend, &gv); + if (!ggml_backend_graph_compute(split_backend, &gv)) { + return false; + } if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) { break; @@ -1510,19 +1494,8 @@ static void sched_compute_splits(ggml_backend_sched_t sched) { } } #endif -} - -static void sched_reset(ggml_backend_sched_t sched) { - for (int i = 0; i < sched->n_backends; i++) { - ggml_tallocr_reset(sched->tallocs[i]); - } - // reset state for the next run - size_t hash_size = sched->hash_set.size; - memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); - memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size); - memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size); - sched->is_reset = true; + return true; } ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) { @@ -1532,9 +1505,10 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1); // initialize hash table - sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); - sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1); - sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1); + sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); + sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size); + sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size); + sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size); sched->n_backends = n_backends; for (int i = 0; i < n_backends; i++) { @@ -1542,14 +1516,9 @@ ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_back sched->bufts[i] = bufts ? 
bufts[i] : ggml_backend_get_default_buffer_type(backends[i]); } - sched->galloc = ggml_gallocr_new(); + sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends); - // init measure allocs for each backend - for (int i = 0; i < n_backends; i++) { - sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]); - } - - sched_reset(sched); + ggml_backend_sched_reset(sched); return sched; } @@ -1558,49 +1527,54 @@ void ggml_backend_sched_free(ggml_backend_sched_t sched) { if (sched == NULL) { return; } - for (int i = 0; i < sched->n_backends; i++) { - ggml_tallocr_free(sched->tallocs[i]); - } ggml_gallocr_free(sched->galloc); ggml_free(sched->ctx); free(sched->hash_set.keys); - free(sched->node_talloc); - free(sched->node_copies); + free(sched->tensor_backend_id); + free(sched->tensor_copies); + free(sched->node_backend_ids); free(sched); } -void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { - GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once +void ggml_backend_sched_reset(ggml_backend_sched_t sched) { + // reset state for the next run + size_t hash_size = sched->hash_set.size; + memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT + memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size); + memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size); - sched_split_graph(sched, measure_graph); - sched_alloc_splits(sched); + sched->is_reset = true; +} - // allocate buffers and reset allocators - for (int i = 0; i < sched->n_backends; i++) { - size_t size = ggml_tallocr_max_size(sched->tallocs[i]); - ggml_tallocr_free(sched->tallocs[i]); - sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size); +bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) { + ggml_backend_sched_split_graph(sched, measure_graph); + + if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids)) { + return false; } - sched_reset(sched); + ggml_backend_sched_reset(sched); + return true; } -void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { +bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) { GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS); if (!sched->is_reset) { - sched_reset(sched); + ggml_backend_sched_reset(sched); } - sched_split_graph(sched, graph); - sched_alloc_splits(sched); - sched_compute_splits(sched); -} + ggml_backend_sched_split_graph(sched, graph); + if (!ggml_backend_sched_alloc_splits(sched)) { + return false; + } -void ggml_backend_sched_reset(ggml_backend_sched_t sched) { - sched_reset(sched); -} + if (!ggml_backend_sched_compute_splits(sched)) { + return false; + } + return true; +} void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) { sched->callback_eval = callback; @@ -1611,37 +1585,30 @@ int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) { return sched->n_splits; } -ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) { - int backend_index = sched_backend_prio(sched, backend); - GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); - return sched->tallocs[backend_index]; -} - -ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t 
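// ---------------------------------------------------------------------------
// Editor's note, not part of the patch: a sketch of the updated scheduler flow.
// ggml_backend_sched_init_measure is replaced by ggml_backend_sched_reserve, and
// graph computation now reports failure. `backends`, `n_backends` and the graphs
// are assumed to come from the caller; passing NULL for bufts selects each
// backend's default buffer type.
static bool example_sched_run(ggml_backend_t * backends, int n_backends,
                              struct ggml_cgraph * measure_graph, struct ggml_cgraph * graph) {
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, n_backends, GGML_DEFAULT_GRAPH_SIZE);

    // optional: pre-allocate the compute buffers for the worst case;
    // ggml_backend_sched_graph_compute can also reserve on demand for a single buffer
    if (!ggml_backend_sched_reserve(sched, measure_graph)) {
        ggml_backend_sched_free(sched);
        return false;
    }

    bool ok = ggml_backend_sched_graph_compute(sched, graph);

    ggml_backend_sched_free(sched);
    return ok;
}
// ---------------------------------------------------------------------------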
sched, ggml_backend_t backend) { - int backend_index = sched_backend_prio(sched, backend); +size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) { + int backend_index = ggml_backend_sched_backend_id(sched, backend); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); - return ggml_tallocr_get_buffer(sched->tallocs[backend_index]); + return ggml_gallocr_get_buffer_size(sched->galloc, backend_index); } void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) { - int backend_index = sched_backend_prio(sched, backend); + int backend_index = ggml_backend_sched_backend_id(sched, backend); GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends); - node_allocr(node) = sched->tallocs[backend_index]; + tensor_backend_id(node) = backend_index; } ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) { - ggml_tallocr_t allocr = node_allocr(node); - if (allocr == NULL) { + int backend_index = tensor_backend_id(node); + if (backend_index == -1) { return NULL; } - return get_allocr_backend(sched, allocr); + return sched->backends[backend_index]; } // utils void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) { GGML_ASSERT(tensor->buffer == NULL); - //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend GGML_ASSERT(tensor->view_src != NULL); GGML_ASSERT(tensor->view_src->buffer != NULL); GGML_ASSERT(tensor->view_src->data != NULL); @@ -1665,7 +1632,7 @@ void ggml_backend_tensor_alloc(ggml_backend_buffer_t buffer, struct ggml_tensor ggml_backend_buffer_init_tensor(buffer, tensor); } -static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, +static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) { GGML_ASSERT(src != NULL); @@ -1678,7 +1645,7 @@ static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, stru struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? 
ctx_allocated : ctx_unallocated, src); if (src->view_src != NULL) { - dst->view_src = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src); + dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src); dst->view_offs = src->view_offs; } dst->op = src->op; @@ -1691,14 +1658,14 @@ static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, stru if (s == NULL) { break; } - dst->src[i] = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s); + dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s); } node_copies[id] = dst; return dst; } -static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) { +static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) { size_t id = ggml_hash_find(hash_set, src); if (node_init[id]) { return; @@ -1707,7 +1674,7 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor struct ggml_tensor * dst = node_copies[id]; if (dst->view_src != NULL) { - graph_init_tensor(hash_set, node_copies, node_init, src->view_src); + graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src); ggml_backend_view_init(dst->view_src->buffer, dst); } else { @@ -1720,17 +1687,17 @@ static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor if (s == NULL) { break; } - graph_init_tensor(hash_set, node_copies, node_init, s); + graph_copy_init_tensor(hash_set, node_copies, node_init, s); } } struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) { struct ggml_hash_set hash_set = { /* .size = */ graph->visited_hash_table.size, - /* .keys = */ calloc(sizeof(hash_set.keys[0]) * graph->visited_hash_table.size, 1) + /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT }; - struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]) * hash_set.size, 1); - bool * node_init = calloc(sizeof(node_init[0]) * hash_set.size, 1); + struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT + bool * node_init = calloc(sizeof(node_init[0]), hash_set.size); struct ggml_init_params params = { /* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false), @@ -1759,7 +1726,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s // dup nodes for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node); + graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node); } // allocate nodes @@ -1784,7 +1751,7 @@ struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, s // copy data and init views for (int i = 0; i < graph->n_nodes; i++) { struct ggml_tensor * node = graph->nodes[i]; - graph_init_tensor(hash_set, node_copies, node_init, node); + graph_copy_init_tensor(hash_set, node_copies, node_init, node); } // build graph copy diff --git a/src/ggml.c b/src/ggml.c index 86cd6586..99605733 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -2607,7 +2607,7 @@ static struct ggml_tensor * ggml_new_tensor_impl( /*.nb =*/ { 0, 0, 0, 0 }, /*.op =*/ GGML_OP_NONE, /*.op_params =*/ { 0 }, - /*.is_param =*/ 
false, + /*.flags =*/ 0, /*.grad =*/ NULL, /*.src =*/ { NULL }, /*.perf_runs =*/ 0, @@ -6509,7 +6509,7 @@ struct ggml_tensor * ggml_cross_entropy_loss_back( void ggml_set_param( struct ggml_context * ctx, struct ggml_tensor * tensor) { - tensor->is_param = true; + tensor->flags |= GGML_TENSOR_FLAG_PARAM; GGML_ASSERT(tensor->grad == NULL); tensor->grad = ggml_dup_tensor(ctx, tensor); @@ -15311,7 +15311,7 @@ static struct ggml_tensor * ggml_recompute_graph_node( return NULL; } - if (node->is_param) { + if (node->flags & GGML_TENSOR_FLAG_PARAM) { return node; } @@ -15345,7 +15345,7 @@ static struct ggml_tensor * ggml_recompute_graph_node( clone->op = node->op; clone->grad = node->grad; - clone->is_param = node->is_param; + clone->flags = node->flags; clone->extra = node->extra; for (int k = 0; k < GGML_MAX_DIMS; ++k) { clone->nb[k] = node->nb[k]; @@ -16377,7 +16377,7 @@ void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * for (int i = 0; i < gf->n_nodes; i++) { struct ggml_tensor * node = gf->nodes[i]; - if (node->is_param) { + if (node->flags & GGML_TENSOR_FLAG_PARAM) { GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node); ggml_build_forward_expand(gb, node->grad); } @@ -17862,7 +17862,7 @@ void ggml_graph_print(const struct ggml_cgraph * cgraph) { GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n", i, node->ne[0], node->ne[1], node->ne[2], - ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs, + ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ", node->perf_runs, (double) node->perf_cycles / (double) ggml_cycles_per_ms(), (double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs, (double) node->perf_time_us / 1000.0, @@ -17955,7 +17955,7 @@ void ggml_graph_dump_dot(const struct ggml_cgraph * gb, const struct ggml_cgraph continue; } - if (node->is_param) { + if (node->flags & GGML_TENSOR_FLAG_PARAM) { snprintf(color, sizeof(color), "yellow"); } else if (node->grad) { if (ggml_graph_find(gf, node)) { @@ -18129,7 +18129,7 @@ static enum ggml_opt_result ggml_opt_adam( int np = 0; int64_t nx = 0; for (int i = 0; i < gf->n_nodes; ++i) { - if (gf->nodes[i]->is_param) { + if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) { GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); GGML_ASSERT(np < GGML_MAX_PARAMS); @@ -18492,7 +18492,7 @@ static enum ggml_opt_result ggml_opt_lbfgs( int np = 0; int nx = 0; for (int i = 0; i < gf->n_nodes; ++i) { - if (gf->nodes[i]->is_param) { + if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) { GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op); GGML_ASSERT(np < GGML_MAX_PARAMS); @@ -18967,6 +18967,16 @@ enum ggml_opt_result ggml_opt_resume_g( //////////////////////////////////////////////////////////////////////////////// +void ggml_set_input(struct ggml_tensor * tensor) { + tensor->flags |= GGML_TENSOR_FLAG_INPUT; +} + +void ggml_set_output(struct ggml_tensor * tensor) { + tensor->flags |= GGML_TENSOR_FLAG_OUTPUT; +} + +//////////////////////////////////////////////////////////////////////////////// + void ggml_quantize_init(enum ggml_type type) { ggml_critical_section_start(); diff --git a/tests/test-backend-buffer.cpp b/tests/test-backend-buffer.cpp index 0110144f..d3c9fe0f 100644 --- a/tests/test-backend-buffer.cpp +++ b/tests/test-backend-buffer.cpp @@ -40,7 +40,7 @@ static void 
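// ---------------------------------------------------------------------------
// Editor's note, not part of the patch: ggml_set_input/ggml_set_output let the
// graph allocator place inputs at the start of the graph (so they are not
// overwritten before use) and keep outputs from being freed or reused. A minimal
// sketch of tagging an input in a no_alloc context; ggml_graph_get_tensor and
// ggml_backend_tensor_set are assumed available for setting the data after
// ggml_gallocr_alloc_graph has run, and the tensor name is illustrative.
static struct ggml_tensor * example_add_input(struct ggml_context * ctx, int n_tokens) {
    struct ggml_tensor * inp = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
    ggml_set_name(inp, "inp_tokens");
    ggml_set_input(inp); // sets GGML_TENSOR_FLAG_INPUT so the tensor is allocated first and not reused early
    return inp;
}

// after allocation, e.g.:
//   struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "inp_tokens");
//   ggml_backend_tensor_set(inp, tokens, 0, n_tokens*sizeof(int32_t));
// and mark the final result with ggml_set_output() so it survives until it is read back
// ---------------------------------------------------------------------------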
diff --git a/tests/test-backend-buffer.cpp b/tests/test-backend-buffer.cpp
index 0110144f..d3c9fe0f 100644
--- a/tests/test-backend-buffer.cpp
+++ b/tests/test-backend-buffer.cpp
@@ -40,7 +40,7 @@ static void test_buffer(ggml_backend_t backend, ggml_backend_buffer_type_t buft)
 
     GGML_ASSERT(ggml_backend_buffer_get_alloc_size(buffer, tensor) >= n * sizeof(float));
 
-    ggml_tallocr_t allocr = ggml_tallocr_new_from_buffer(buffer);
+    ggml_tallocr_t allocr = ggml_tallocr_new(buffer);
     ggml_tallocr_alloc(allocr, tensor);
 
     GGML_ASSERT(tensor->data != NULL);
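The tensor allocator is now constructed directly from a backend buffer: ggml_tallocr_new() replaces both ggml_tallocr_new_from_buffer() and the old ggml_allocr_new_from_buffer(). A standalone sketch of the weight-placement pattern the updated tests follow; the CPU backend and the padded buffer size are placeholders for the example, not values taken from the tests:

    ggml_backend_t backend = ggml_backend_cpu_init();

    struct ggml_init_params params = {
        /*.mem_size   =*/ 2*ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,   // the context holds only tensor metadata
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024);

    // buffer size padded generously here; a real loader computes the exact requirement
    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, ggml_nbytes(a) + 256);

    ggml_tallocr_t alloc = ggml_tallocr_new(buffer);   // v3: construct from the buffer
    ggml_tallocr_alloc(alloc, a);                      // a->data now points into the buffer
    ggml_tallocr_free(alloc);                          // the buffer itself stays alive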
diff --git a/tests/test-conv1d.cpp b/tests/test-conv1d.cpp
index af0dd667..79d18b87 100644
--- a/tests/test-conv1d.cpp
+++ b/tests/test-conv1d.cpp
@@ -111,10 +111,10 @@ void load_model(test_model & model, bool use_gpu = false) {
     model.b = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, IL, IC, N);
 
     // create a allocator
-    ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer);
+    ggml_tallocr_t alloc = ggml_tallocr_new(model.buffer);
 
     // alloc memory
-    ggml_allocr_alloc(alloc, model.a);
+    ggml_tallocr_alloc(alloc, model.a);
 
     // load data to buffer
     if(ggml_backend_is_cpu(model.backend)) {
@@ -124,7 +124,7 @@ void load_model(test_model & model, bool use_gpu = false) {
     }
 
     // alloc memory
-    ggml_allocr_alloc(alloc, model.b);
+    ggml_tallocr_alloc(alloc, model.b);
 
     if(ggml_backend_is_cpu(model.backend)
 #ifdef GGML_USE_METAL
@@ -136,17 +136,17 @@ void load_model(test_model & model, bool use_gpu = false) {
         ggml_backend_tensor_set(model.b, bdata, 0, ggml_nbytes(model.b));
     }
 
-    ggml_allocr_free(alloc);
+    ggml_tallocr_free(alloc);
 }
 
-struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * allocr) {
+struct ggml_cgraph * build_graph(const test_model& model) {
     static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
     static std::vector buf(buf_size);
 
     struct ggml_init_params params0 = {
         /*.mem_size =*/ buf_size,
         /*.mem_buffer =*/ buf.data(),
-        /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
+        /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
     };
 
     // create a temporally context to build the graph
@@ -172,14 +172,11 @@ struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * a
     return gf;
 }
 
-struct ggml_cgraph* compute_graph(const test_model & model, struct ggml_allocr * allocr) {
-    // reset the allocator to free all the memory allocated during the previous inference
-    ggml_allocr_reset(allocr);
-
-    struct ggml_cgraph * gf = build_graph(model, allocr);
+struct ggml_cgraph* compute_graph(const test_model & model, ggml_gallocr_t allocr) {
+    struct ggml_cgraph * gf = build_graph(model);
 
     // allocate tensors
-    ggml_allocr_alloc_graph(allocr, gf);
+    ggml_gallocr_alloc_graph(allocr, gf);
 
     int n_threads = 1;
     if (ggml_backend_is_cpu(model.backend)) {
@@ -206,20 +203,17 @@ int main(void)
     test_model model;
     load_model(model, true);
 
-    ggml_backend_buffer_t buf_compute; // for compute
-    struct ggml_allocr * allocr = NULL;
+    ggml_gallocr_t allocr = NULL;
 
     {
-        allocr = ggml_allocr_new_measure_from_backend(model.backend);
+        allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
 
         //create the worst case graph for memory usage estimation
-        struct ggml_cgraph * gf = build_graph(model, allocr);
-        size_t mem_size = ggml_allocr_alloc_graph(allocr, gf);
-        ggml_allocr_free(allocr);
+        struct ggml_cgraph * gf = build_graph(model);
 
         // compute the required memory
-        buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size);
-        allocr = ggml_allocr_new_from_buffer(buf_compute);
+        ggml_gallocr_reserve(allocr, gf);
+        size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);
         fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f);
     }
 
@@ -297,7 +291,7 @@ int main(void)
 
     ggml_free(model.ctx);
     ggml_backend_buffer_free(model.buffer);
-    ggml_backend_buffer_free(buf_compute);
     ggml_backend_free(model.backend);
+    ggml_gallocr_free(allocr);
     return 0;
 }
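The compute-buffer flow in these tests is now: create a graph allocator for the backend's default buffer type, reserve once against a worst-case graph, then allocate each graph right before computing it. A condensed sketch, not part of the patch; build_graph and model stand in for the helpers defined in the test, and ggml_backend_graph_compute() is the assumed way the resulting graph gets run:

    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));

    // size the compute buffer up front from a worst-case graph
    ggml_gallocr_reserve(allocr, build_graph(model));
    size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0); // 0: first (and here only) buffer, as in the tests
    fprintf(stderr, "compute buffer size: %.2f MB\n", mem_size/1024.0f/1024.0f);

    // per evaluation: rebuild the graph, allocate its tensors, run it
    struct ggml_cgraph * gf = build_graph(model);
    ggml_gallocr_alloc_graph(allocr, gf);
    ggml_backend_graph_compute(model.backend, gf);

    ggml_gallocr_free(allocr);

Note that there is no explicit reset between evaluations anymore; the tests simply call ggml_gallocr_alloc_graph() again on the next graph.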
diff --git a/tests/test-conv2d.cpp b/tests/test-conv2d.cpp
index b9e9e262..1aea5eef 100644
--- a/tests/test-conv2d.cpp
+++ b/tests/test-conv2d.cpp
@@ -111,10 +111,10 @@ void load_model(test_model & model, bool use_gpu = false) {
     model.b = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, IW, IH, IC, N);
 
     // create a allocator
-    ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer);
+    ggml_tallocr_t alloc = ggml_tallocr_new(model.buffer);
 
     // alloc memory
-    ggml_allocr_alloc(alloc, model.a);
+    ggml_tallocr_alloc(alloc, model.a);
 
     // load data to buffer
     if(ggml_backend_is_cpu(model.backend)) {
@@ -124,7 +124,7 @@ void load_model(test_model & model, bool use_gpu = false) {
     }
 
     // alloc memory
-    ggml_allocr_alloc(alloc, model.b);
+    ggml_tallocr_alloc(alloc, model.b);
 
     if(ggml_backend_is_cpu(model.backend)
 #ifdef GGML_USE_METAL
@@ -136,17 +136,17 @@ void load_model(test_model & model, bool use_gpu = false) {
         ggml_backend_tensor_set(model.b, bdata, 0, ggml_nbytes(model.b));
     }
 
-    ggml_allocr_free(alloc);
+    ggml_tallocr_free(alloc);
 }
 
-struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * allocr) {
+struct ggml_cgraph * build_graph(const test_model& model) {
     static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
     static std::vector buf(buf_size);
 
     struct ggml_init_params params0 = {
         /*.mem_size =*/ buf_size,
         /*.mem_buffer =*/ buf.data(),
-        /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
+        /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
     };
 
     // create a temporally context to build the graph
@@ -175,14 +175,11 @@ struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * a
     return gf;
 }
 
-struct ggml_cgraph * compute_graph(const test_model & model, struct ggml_allocr * allocr) {
-    // reset the allocator to free all the memory allocated during the previous inference
-    ggml_allocr_reset(allocr);
-
-    struct ggml_cgraph * gf = build_graph(model, allocr);
+struct ggml_cgraph * compute_graph(const test_model & model, ggml_gallocr_t allocr) {
+    struct ggml_cgraph * gf = build_graph(model);
 
     // allocate tensors
-    ggml_allocr_alloc_graph(allocr, gf);
+    ggml_gallocr_alloc_graph(allocr, gf);
 
     int n_threads = 1;
     if (ggml_backend_is_cpu(model.backend)) {
@@ -209,20 +206,17 @@ int main(void)
     test_model model;
     load_model(model, true);
 
-    ggml_backend_buffer_t buf_compute; // for compute
-    struct ggml_allocr * allocr = NULL;
+    ggml_gallocr_t allocr = NULL;
 
     {
-        allocr = ggml_allocr_new_measure_from_backend(model.backend);
+        allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
 
         //create the worst case graph for memory usage estimation
-        struct ggml_cgraph * gf = build_graph(model, allocr);
-        size_t mem_size = ggml_allocr_alloc_graph(allocr, gf);
-        ggml_allocr_free(allocr);
+        struct ggml_cgraph * gf = build_graph(model);
 
         // compute the required memory
-        buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size);
-        allocr = ggml_allocr_new_from_buffer(buf_compute);
+        ggml_gallocr_reserve(allocr, gf);
+        size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);
         fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f);
     }
 
@@ -399,7 +393,7 @@ int main(void)
 
     ggml_free(model.ctx);
     ggml_backend_buffer_free(model.buffer);
-    ggml_backend_buffer_free(buf_compute);
     ggml_backend_free(model.backend);
+    ggml_gallocr_free(allocr);
     return 0;
 }
diff --git a/tests/test-mul-mat.cpp b/tests/test-mul-mat.cpp
index 7380c973..f082dea1 100644
--- a/tests/test-mul-mat.cpp
+++ b/tests/test-mul-mat.cpp
@@ -21,13 +21,6 @@
 #include
 #include
 
-static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
-    (void) level;
-    (void) user_data;
-    fputs(text, stderr);
-    fflush(stderr);
-}
-
 struct test_model {
     struct ggml_tensor * a;
     struct ggml_tensor * b;
@@ -68,7 +61,6 @@ void load_model(test_model & model, float* a, float* b, int M, int N, int K, boo
 #ifdef GGML_USE_METAL
     if (use_gpu) {
         fprintf(stderr, "%s: using Metal backend\n", __func__);
-        ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
         model.backend = ggml_backend_metal_init();
         if (!model.backend) {
             fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
@@ -93,10 +85,10 @@ void load_model(test_model & model, float* a, float* b, int M, int N, int K, boo
     printf("Matrix B: [%i, %i]\n", K, N);
 
     // create a allocator
-    ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer);
+    ggml_tallocr_t alloc = ggml_tallocr_new(model.buffer);
 
     // alloc memory
-    ggml_allocr_alloc(alloc, model.a);
+    ggml_tallocr_alloc(alloc, model.a);
 
     // load data to buffer
     if(ggml_backend_is_cpu(model.backend)
@@ -110,7 +102,7 @@ void load_model(test_model & model, float* a, float* b, int M, int N, int K, boo
     }
 
     // alloc memory
-    ggml_allocr_alloc(alloc, model.b);
+    ggml_tallocr_alloc(alloc, model.b);
 
     if(ggml_backend_is_cpu(model.backend)
 #ifdef GGML_USE_METAL
@@ -122,17 +114,17 @@ void load_model(test_model & model, float* a, float* b, int M, int N, int K, boo
         ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b)); // cuda requires copy the data directly to device
     }
 
-    ggml_allocr_free(alloc);
+    ggml_tallocr_free(alloc);
 }
 
-struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * allocr) {
+struct ggml_cgraph * build_graph(const test_model& model) {
     static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
     static std::vector buf(buf_size);
 
     struct ggml_init_params params0 = {
         /*.mem_size =*/ buf_size,
         /*.mem_buffer =*/ buf.data(),
-        /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
+        /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
     };
 
     // create a temporally context to build the graph
@@ -151,14 +143,11 @@ struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * a
     return gf;
 }
 
-struct ggml_tensor* compute(const test_model & model, struct ggml_allocr * allocr) {
-    // reset the allocator to free all the memory allocated during the previous inference
-    ggml_allocr_reset(allocr);
-
-    struct ggml_cgraph * gf = build_graph(model, allocr);
+struct ggml_tensor* compute(const test_model & model, ggml_gallocr_t allocr) {
+    struct ggml_cgraph * gf = build_graph(model);
 
     // allocate tensors
-    ggml_allocr_alloc_graph(allocr, gf);
+    ggml_gallocr_alloc_graph(allocr, gf);
 
     int n_threads = 1;
     if (ggml_backend_is_cpu(model.backend)) {
@@ -317,21 +306,18 @@ int main(void)
     test_model model;
     load_model(model, matrixA, matrixB, M, N, K, true);
 
-    ggml_backend_buffer_t buf_compute; // for compute
-    struct ggml_allocr * allocr = NULL;
+    ggml_gallocr_t allocr = NULL;
 
     {
-        allocr = ggml_allocr_new_measure_from_backend(model.backend);
+        allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
 
         //create the worst case graph for memory usage estimation
-        struct ggml_cgraph * gf = build_graph(model, allocr);
-        size_t mem_size = ggml_allocr_alloc_graph(allocr, gf);
-        ggml_allocr_free(allocr);
+        struct ggml_cgraph * gf = build_graph(model);
 
         // compute the required memory
-        buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size);
-        allocr = ggml_allocr_new_from_buffer(buf_compute);
-        fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0);
+        ggml_gallocr_reserve(allocr, gf);
+        size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);
+        fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f);
     }
 
     struct ggml_tensor * result = compute(model, allocr);
@@ -363,7 +349,7 @@ int main(void)
 
     ggml_free(model.ctx);
     ggml_backend_buffer_free(model.buffer);
-    ggml_backend_buffer_free(buf_compute);
     ggml_backend_free(model.backend);
+    ggml_gallocr_free(allocr);
     return 0;
 }