model="../models-mnt/gpt-2/ggml-model-gpt-2-117M.bin"
prompts="../examples/prompts/gpt-2.txt"
- (time ./bin/gpt-2-backend2 --model ${model} -s 1234 -n 64 -tt ${prompts} ) 2>&1 | tee -a $OUT/${ci}-tg.log
- (time ./bin/gpt-2-backend2 --model ${model} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
+ (time ./bin/gpt-2-backend --model ${model} -s 1234 -n 64 -tt ${prompts} ) 2>&1 | tee -a $OUT/${ci}-tg.log
+ (time ./bin/gpt-2-backend --model ${model} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
+ (time ./bin/gpt-2-sched --model ${model} -s 1234 -n 64 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
(time ./bin/gpt-2-batched --model ${model} -s 1234 -n 64 -np 8 -p "I believe the meaning of life is") 2>&1 | tee -a $OUT/${ci}-tg.log
add_executable(${TEST_TARGET} main-backend.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
-set(TEST_TARGET gpt-2-backend2)
-add_executable(${TEST_TARGET} main.cpp)
+set(TEST_TARGET gpt-2-sched)
+add_executable(${TEST_TARGET} main-sched.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
#
#include "ggml/ggml.h"
#include "ggml/ggml-alloc.h"
+#include "ggml/ggml-backend.h"
#include "common.h"
#include "common-ggml.h"
struct ggml_tensor * memory_v;
//
- struct ggml_context * ctx;
+ struct ggml_context * ctx_w;
std::map<std::string, struct ggml_tensor *> tensors;
};
return false;
}
- auto & ctx = model.ctx;
+ auto & ctx = model.ctx_w;
size_t ctx_size = 0;
/*.no_alloc =*/ false,
};
- model.ctx = ggml_init(params);
- if (!model.ctx) {
+ model.ctx_w = ggml_init(params);
+ if (!model.ctx_w) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return false;
}
// build the computation graph
struct ggml_cgraph * gpt2_graph(
const gpt2_model & model,
- struct ggml_allocr * allocr,
const int n_past,
- const std::vector<gpt_vocab::id> & embd_inp) {
- const int N = embd_inp.size();
+ const int n_tokens) {
+ const int N = n_tokens;
const auto & hparams = model.hparams;
struct ggml_init_params params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf.data(),
- /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
+ /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
};
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
- ggml_allocr_alloc(allocr, embd);
-
- // avoid writing to tensors if we are only measuring the memory usage
- if (!ggml_allocr_is_measure(allocr)) {
- memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
- }
+ // at this point, the tensor data is not allocated yet and cannot be set
+ // we will find the tensor by name after the graph is allocated, and set its data then
+ ggml_set_name(embd, "embd");
+ // setting a tensor as an input will ensure that it is allocated at the beginning of the graph
+ // this is important to ensure that the input tensors are not overwritten before they are used
+ ggml_set_input(embd);
struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
- ggml_allocr_alloc(allocr, position);
- if (!ggml_allocr_is_measure(allocr)) {
- for (int i = 0; i < N; ++i) {
- ((int32_t *) position->data)[i] = n_past + i;
- }
- }
+ ggml_set_name(position, "position");
+ ggml_set_input(position);
// wte + wpe
struct ggml_tensor * inpL =
// [ 768, 50257] - model.lm_head
// [ 768, N] - inpL
inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
+ ggml_set_name(inpL, "logits");
+ // setting a tensor as the output will ensure that it is not overwritten by subsequent operations
+ ggml_set_output(inpL);
// logits -> probs
//inpL = ggml_soft_max(ctx0, inpL);
// evaluate the transformer
//
// - model: the model
-// - allocr: ggml_allocr to use to allocate the compute buffer
+// - allocr: ggml_gallocr used to allocate the compute buffer
// - n_threads: number of threads to use
// - n_past: the context size so far
// - embd_inp: the embeddings of the tokens in the context
//
bool gpt2_eval(
const gpt2_model & model,
- struct ggml_allocr * allocr,
+ ggml_gallocr_t allocr,
const int n_threads,
const int n_past,
const std::vector<gpt_vocab::id> & embd_inp,
const int n_vocab = hparams.n_vocab;
- // reset the allocator to free all the memory allocated during the previous inference
- ggml_allocr_reset(allocr);
+ struct ggml_cgraph * gf = gpt2_graph(model, n_past, embd_inp.size());
- struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp);
+ // allocate the graph tensors
+ ggml_gallocr_alloc_graph(allocr, gf);
- // allocate tensors
- ggml_allocr_alloc_graph(allocr, gf);
+ // set the graph inputs
+ struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");
+ memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
+
+ struct ggml_tensor * position = ggml_graph_get_tensor(gf, "position");
+ for (int i = 0; i < N; ++i) {
+ ((int32_t *) position->data)[i] = n_past + i;
+ }
// run the computation
struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);
// ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
//}
- // in this case, the output tensor is the last one in the graph
- struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1];
+ // get the graph outputs
+ struct ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits");
//embd_w.resize(n_vocab*N);
- //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+ //memcpy(embd_w.data(), ggml_get_data(logits), sizeof(float)*n_vocab*N);
// return result just for the last token
embd_w.resize(n_vocab);
- memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+ memcpy(embd_w.data(), (float *) ggml_get_data(logits) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
return true;
}
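Taken together, the new CPU-only flow replaces the old measure/allocate dance with a single graph allocator and named inputs. A minimal sketch of the resulting pattern, with a build_graph(n_tokens) placeholder standing in for gpt2_graph and error handling omitted:

    // create once: a graph allocator backed by CPU buffers
    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_cpu_buffer_type());

    // per evaluation: build the graph, allocate its tensors, then fill the named inputs
    struct ggml_cgraph * gf = build_graph(n_tokens);
    ggml_gallocr_alloc_graph(allocr, gf);

    struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");
    memcpy(embd->data, tokens, n_tokens*ggml_element_size(embd)); // CPU buffer: direct writes are fine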
test_gpt_tokenizer(vocab, params.token_test);
}
- // keep this buffer alive while evaluating the model
- std::vector<uint8_t> compute_buffer;
-
- struct ggml_allocr * allocr = NULL;
+ ggml_gallocr_t allocr = NULL;
// allocate the compute buffer
{
- allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN);
+ allocr = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
// create the worst case graph for memory usage estimation
int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
int n_past = model.hparams.n_ctx - n_tokens;
- struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector<gpt_vocab::id>(n_tokens, 0));
-
- // compute the required memory
- size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN;
-
- // recreate the allocator with the required memory
- ggml_allocr_free(allocr);
- compute_buffer.resize(mem_size);
- allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN);
+ struct ggml_cgraph * gf = gpt2_graph(model, n_past, n_tokens);
+ // pre-allocate the compute buffer for the worst case (optional)
+ ggml_gallocr_reserve(allocr, gf);
+ size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);
fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0);
}
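The reserve is optional because ggml_gallocr_alloc_graph sizes (or grows) the allocator's buffer on demand; reserving with a worst-case graph up front only guarantees that no reallocation happens mid-generation. Roughly:

    // without a reserve, the first ggml_gallocr_alloc_graph allocates the buffer;
    // later graphs trigger a reallocation only if they need more memory
    struct ggml_cgraph * gf = gpt2_graph(model, n_past, n_tokens);
    ggml_gallocr_alloc_graph(allocr, gf);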
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
}
- ggml_free(model.ctx);
+ ggml_free(model.ctx_w);
+ ggml_gallocr_free(allocr);
return 0;
}
struct ggml_tensor * memory_v;
//
- struct ggml_context * ctx;
+ struct ggml_context * ctx_w;
+ struct ggml_context * ctx_kv;
ggml_backend_t backend = NULL;
return false;
}
- auto & ctx = model.ctx;
+ auto & ctx = model.ctx_w;
// create the ggml context
{
// key + value memory
{
+ auto & ctx = model.ctx_kv;
+
+ // create the ggml context
+ {
+ size_t n_tensors = 2;
+ struct ggml_init_params params = {
+ /*.mem_size =*/ ggml_tensor_overhead() * n_tensors,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ ctx = ggml_init(params);
+ if (!ctx) {
+ fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+ return false;
+ }
+ }
+
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
- const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
+ // allocate the KV memory in a backend buffer
+ model.buffer_kv = ggml_backend_alloc_ctx_tensors(ctx, model.backend);
+ const size_t memory_size = ggml_backend_buffer_get_size(model.buffer_kv);
printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
-
- // create a backend buffer (can be in host or device memory)
- model.buffer_kv = ggml_backend_alloc_buffer(model.backend, memory_size + 256);
-
- // allocate the tensors into the backend buffer
- {
- ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_kv);
-
- // this updates the pointers in the tensors to point to the correct location in the buffer
- // this is necessary since the ggml_context is .no_alloc == true
- // note that the buffer can actually be a device buffer, depending on the backend
- ggml_allocr_alloc(alloc, model.memory_k);
- ggml_allocr_alloc(alloc, model.memory_v);
-
- ggml_allocr_free(alloc);
- }
}
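ggml_backend_alloc_ctx_tensors replaces the manual size computation and per-tensor allocation calls: it walks every tensor created in a no_alloc context and places all of them in a single backend buffer of the right size. A minimal sketch, with backend and n_elements assumed to be in scope:

    struct ggml_init_params ip = {
        /*.mem_size   =*/ 2*ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // metadata only; the data will live in the backend buffer
    };
    struct ggml_context * ctx = ggml_init(ip);

    struct ggml_tensor * k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
    struct ggml_tensor * v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);

    // allocates one buffer large enough for both tensors and points their data into it
    ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(ctx, backend);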
// load weights
return false;
}
- if (ggml_backend_is_cpu (model.backend)
-#ifdef GGML_USE_METAL
- || ggml_backend_is_metal(model.backend)
-#endif
- ) {
- // for the CPU and Metal backend, we can read directly into the tensor
+ if (ggml_backend_buffer_is_host(model.buffer_w)) {
+ // for some backends such as CPU and Metal, the tensor data is in system memory and we can read directly into it
fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
} else {
// read into a temporary buffer first, then copy to device memory
// GPT-2 models share the WTE tensor as the LM head
if (name == "model/wte" && has_lm_head == false) {
- //ggml_allocr_alloc(alloc, model.lm_head);
//ggml_backend_tensor_copy(tensor, model.lm_head);
model.lm_head = tensor;
}
// build the computation graph
struct ggml_cgraph * gpt2_graph(
const gpt2_model & model,
- struct ggml_allocr * allocr,
const int n_past,
- const std::vector<gpt_vocab::id> & embd_inp) {
- const int N = embd_inp.size();
+ const int n_tokens) {
+ const int N = n_tokens;
const auto & hparams = model.hparams;
struct ggml_init_params params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf.data(),
- /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
+ /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
};
- struct ggml_context * ctx0 = ggml_init(params);
+ struct ggml_context * ctx = ggml_init(params);
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GPT2_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, GPT2_MAX_NODES, false);
- struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
- ggml_allocr_alloc(allocr, embd);
+ struct ggml_tensor * embd = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
+ // at this point, the tensor data is not allocated yet and cannot be set
+ // we will find the tensor by name after the graph is allocated, and set its data then
+ ggml_set_name(embd, "embd");
+ // setting a tensor as an input will ensure that it is allocated at the beginning of the graph
+ // this is important to ensure that the input tensors are not overwritten before they are used
+ ggml_set_input(embd);
- // avoid writing to tensors if we are only measuring the memory usage
- if (!ggml_allocr_is_measure(allocr)) {
- ggml_backend_tensor_set(embd, embd_inp.data(), 0, N*ggml_element_size(embd));
- }
-
- struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
- ggml_allocr_alloc(allocr, position);
- if (!ggml_allocr_is_measure(allocr)) {
- for (int i = 0; i < N; ++i) {
- int32_t v = n_past + i;
- ggml_backend_tensor_set(position, &v, i*sizeof(int32_t), sizeof(v));
- }
- }
+ struct ggml_tensor * position = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
+ ggml_set_name(position, "position");
+ ggml_set_input(position);
// wte + wpe
struct ggml_tensor * inpL =
- ggml_add(ctx0,
- ggml_get_rows(ctx0, model.wte, embd),
- ggml_get_rows(ctx0, model.wpe, position));
+ ggml_add(ctx,
+ ggml_get_rows(ctx, model.wte, embd),
+ ggml_get_rows(ctx, model.wpe, position));
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur;
// norm
{
// [ 768, N]
- cur = ggml_norm(ctx0, inpL, hparams.eps);
+ cur = ggml_norm(ctx, inpL, hparams.eps);
// cur = ln_1_g*cur + ln_1_b
// [ 768, N]
- cur = ggml_add(ctx0,
- ggml_mul(ctx0,
+ cur = ggml_add(ctx,
+ ggml_mul(ctx,
cur,
model.layers[il].ln_1_g),
model.layers[il].ln_1_b);
// cur = attn_w*cur + attn_b
// [2304, N]
{
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_attn_attn_w,
cur);
- cur = ggml_add(ctx0,
+ cur = ggml_add(ctx,
cur,
model.layers[il].c_attn_attn_b);
}
// self-attention
{
- struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
- struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
- struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
+ struct ggml_tensor * Qcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
+ struct ggml_tensor * Kcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
+ struct ggml_tensor * Vcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
// store key and value to memory
if (N >= 1) {
- struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
- struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
+ struct ggml_tensor * k = ggml_view_1d(ctx, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
+ struct ggml_tensor * v = ggml_view_1d(ctx, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx, Kcur, k));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx, Vcur, v));
}
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
// [64, N, 12]
struct ggml_tensor * Q =
- ggml_permute(ctx0,
- ggml_cpy(ctx0,
+ ggml_permute(ctx,
+ ggml_cpy(ctx,
Qcur,
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
+ ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
0, 2, 1, 3);
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
// [64, n_past + N, 12]
struct ggml_tensor * K =
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
+ ggml_permute(ctx,
+ ggml_reshape_3d(ctx,
+ ggml_view_1d(ctx, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
n_embd/n_head, n_head, n_past + N),
0, 2, 1, 3);
// K * Q
// [n_past + N, N, 12]
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx, K, Q);
// KQ_scaled = KQ / sqrt(n_embd/n_head)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_scaled =
- ggml_scale(ctx0,
+ ggml_scale(ctx,
KQ,
1.0f/sqrtf(float(n_embd)/n_head));
// KQ_masked = mask_past(KQ_scaled)
// [n_past + N, N, 12]
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx, KQ_scaled, n_past);
// KQ = soft_max(KQ_masked)
// [n_past + N, N, 12]
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx, KQ_masked);
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
// [n_past + N, 64, 12]
struct ggml_tensor * V_trans =
- ggml_cpy(ctx0,
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+ ggml_cpy(ctx,
+ ggml_permute(ctx,
+ ggml_reshape_3d(ctx,
+ ggml_view_1d(ctx, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
n_embd/n_head, n_head, n_past + N),
1, 2, 0, 3),
- ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
+ ggml_new_tensor_3d(ctx, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
// KQV = transpose(V) * KQ_soft_max
// [64, N, 12]
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx, V_trans, KQ_soft_max);
// KQV_merged = KQV.permute(0, 2, 1, 3)
// [64, 12, N]
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx, KQV, 0, 2, 1, 3);
// cur = KQV_merged.contiguous().view(n_embd, N)
// [768, N]
- cur = ggml_cpy(ctx0,
+ cur = ggml_cpy(ctx,
KQV_merged,
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+ ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, N));
}
// projection
// cur = proj_w*cur + proj_b
// [768, N]
{
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_attn_proj_w,
cur);
- cur = ggml_add(ctx0,
+ cur = ggml_add(ctx,
cur,
model.layers[il].c_attn_proj_b);
}
// add the input
- cur = ggml_add(ctx0, cur, inpL);
+ cur = ggml_add(ctx, cur, inpL);
struct ggml_tensor * inpFF = cur;
{
// norm
{
- cur = ggml_norm(ctx0, inpFF, hparams.eps);
+ cur = ggml_norm(ctx, inpFF, hparams.eps);
// cur = ln_2_g*cur + ln_2_b
// [ 768, N]
- cur = ggml_add(ctx0,
- ggml_mul(ctx0,
+ cur = ggml_add(ctx,
+ ggml_mul(ctx,
cur,
model.layers[il].ln_2_g),
model.layers[il].ln_2_b);
//
// cur = fc_w*cur + fc_b
// [3072, N]
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_mlp_fc_w,
cur);
- cur = ggml_add(ctx0,
+ cur = ggml_add(ctx,
cur,
model.layers[il].c_mlp_fc_b);
// GELU activation
// [3072, N]
- cur = ggml_gelu(ctx0, cur);
+ cur = ggml_gelu(ctx, cur);
// projection
// [ 768, 3072] - model.layers[il].c_mlp_proj_w
//
// cur = proj_w*cur + proj_b
// [768, N]
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_mlp_proj_w,
cur);
- cur = ggml_add(ctx0,
+ cur = ggml_add(ctx,
cur,
model.layers[il].c_mlp_proj_b);
}
// input for next layer
- inpL = ggml_add(ctx0, cur, inpFF);
+ inpL = ggml_add(ctx, cur, inpFF);
}
// norm
{
// [ 768, N]
- inpL = ggml_norm(ctx0, inpL, hparams.eps);
+ inpL = ggml_norm(ctx, inpL, hparams.eps);
// inpL = ln_f_g*inpL + ln_f_b
// [ 768, N]
- inpL = ggml_add(ctx0,
- ggml_mul(ctx0,
+ inpL = ggml_add(ctx,
+ ggml_mul(ctx,
inpL,
model.ln_f_g),
model.ln_f_b);
// inpL = WTE * inpL
// [ 768, 50257] - model.lm_head
// [ 768, N] - inpL
- inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
+ inpL = ggml_mul_mat(ctx, model.lm_head, inpL);
+ ggml_set_name(inpL, "logits");
+ // setting a tensor as the output will ensure that it is not overwritten by subsequent operations
+ ggml_set_output(inpL);
// logits -> probs
- //inpL = ggml_soft_max(ctx0, inpL);
+ //inpL = ggml_soft_max(ctx, inpL);
ggml_build_forward_expand(gf, inpL);
- ggml_free(ctx0);
+ ggml_free(ctx);
return gf;
}
// evaluate the transformer
//
// - model: the model
-// - allocr: ggml_allocr to use to allocate the compute buffer
+// - allocr: ggml_gallocr used to allocate the compute buffer
// - n_threads: number of threads to use
// - n_past: the context size so far
// - embd_inp: the embeddings of the tokens in the context
//
bool gpt2_eval(
const gpt2_model & model,
- struct ggml_allocr * allocr,
+ ggml_gallocr_t allocr,
const int n_threads,
const int n_past,
const std::vector<gpt_vocab::id> & embd_inp,
const int n_vocab = hparams.n_vocab;
- // reset the allocator to free all the memory allocated during the previous inference
- ggml_allocr_reset(allocr);
+ struct ggml_cgraph * gf = gpt2_graph(model, n_past, embd_inp.size());
+
+ // allocate the graph tensors
+ ggml_gallocr_alloc_graph(allocr, gf);
- struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, embd_inp);
+ // set the graph inputs
+ struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");
+ ggml_backend_tensor_set(embd, embd_inp.data(), 0, N*ggml_element_size(embd));
- // allocate tensors
- ggml_allocr_alloc_graph(allocr, gf);
+ struct ggml_tensor * position = ggml_graph_get_tensor(gf, "position");
+ for (int i = 0; i < N; ++i) {
+ int32_t v = n_past + i;
+ ggml_backend_tensor_set(position, &v, i*sizeof(int32_t), sizeof(v));
+ }
// set backend options
if (ggml_backend_is_cpu(model.backend)) {
}
#endif
- // test
-#if 0 && defined(GGML_USE_CUBLAS)
- if (ggml_backend_is_cuda(model.backend)) {
- auto eval_callback = [](int index, struct ggml_tensor * t1, struct ggml_tensor * t2, void * user_data) {
- auto tv1 = tensor_to_float(t1);
- auto tv2 = tensor_to_float(t2);
-
-#if 1
- float sim = cosine_similarity(tv1, tv2);
- float len1 = vec_len(tv1);
- float len2 = vec_len(tv2);
- float lenr = len1/len2;
- float lenrd = std::abs(1.0f-lenr);
-
- float angle = acosf(sim)*180.0f/M_PI;
-
- if (angle > 0.5f || lenrd > 0.05f) {
- printf("%3d [%15s] %s: sim = %f, a = %f, lenrd = %f\n", index, ggml_op_desc(t1), t1->name, sim, angle, lenrd);
- }
- assert(sim > 0.90f);
-#else
- float dist = distance(tv1, tv2) / vec_len(tv1);
- if (dist > 0.01f) {
- printf("%3d [%15s] %s: distance = %f\n", index, ggml_op_desc(t1), t1->name, dist);
- }
-#endif
-
- return true;
- };
- ggml_backend_t backend_cpu = ggml_backend_cpu_init();
- ggml_backend_compare_graph_backend(model.backend, backend_cpu, gf, eval_callback, nullptr);
- ggml_backend_free(backend_cpu);
- //printf("done\n");
- } else
-#endif
- {
- // run the computation
- ggml_backend_graph_compute(model.backend, gf);
- }
+ // run the computation
+ ggml_backend_graph_compute(model.backend, gf);
//if (n_past%100 == 0) {
// ggml_graph_print (&gf);
// ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
//}
- // in this case, the output tensor is the last one in the graph
- struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1];
+ // get the graph outputs
+ struct ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits");
//embd_w.resize(n_vocab*N);
- //ggml_backend_tensor_get(inpL, embd_w.data(), 0, sizeof(float)*n_vocab*N);
+ //ggml_backend_tensor_get(logits, embd_w.data(), 0, sizeof(float)*n_vocab*N);
// return result just for the last token
embd_w.resize(n_vocab);
- ggml_backend_tensor_get(inpL, embd_w.data(), (n_vocab*(N-1))*sizeof(float), sizeof(float)*n_vocab);
+ ggml_backend_tensor_get(logits, embd_w.data(), (n_vocab*(N-1))*sizeof(float), sizeof(float)*n_vocab);
return true;
}
test_gpt_tokenizer(vocab, params.token_test);
}
- // keep this buffer alive while evaluating the model
- ggml_backend_buffer_t buf_compute;
-
- struct ggml_allocr * allocr = NULL;
+ ggml_gallocr_t allocr = NULL;
// allocate the compute buffer
{
- // create an allocator to measure the memory usage
- allocr = ggml_allocr_new_measure_from_backend(model.backend);
+ // create a graph allocator with the backend's default buffer type
+ allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
// create the worst case graph for memory usage estimation
int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
int n_past = model.hparams.n_ctx - n_tokens;
- struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector<gpt_vocab::id>(n_tokens, 0));
-
- // compute the required memory
- size_t mem_size = ggml_allocr_alloc_graph(allocr, gf);
-
- // recreate the allocator with the required memory
- ggml_allocr_free(allocr);
- buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size);
- allocr = ggml_allocr_new_from_buffer(buf_compute);
+ struct ggml_cgraph * gf = gpt2_graph(model, n_past, n_tokens);
+ // pre-allocate the compute buffer for the worst case (optional)
+ ggml_gallocr_reserve(allocr, gf);
+ size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);
fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0);
}
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
}
- ggml_free(model.ctx);
+ ggml_free(model.ctx_w);
+ ggml_gallocr_free(allocr);
ggml_backend_buffer_free(model.buffer_w);
ggml_backend_buffer_free(model.buffer_kv);
- ggml_backend_buffer_free(buf_compute);
ggml_backend_free(model.backend);
return 0;
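On a device backend the flow is the same as in the CPU-only example, except that input writes and output reads must go through ggml_backend_tensor_set/get instead of touching tensor->data directly. A sketch with placeholder names (build_graph, tokens, out):

    ggml_gallocr_t allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));

    struct ggml_cgraph * gf = build_graph(n_tokens);
    ggml_gallocr_alloc_graph(allocr, gf);

    struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");
    ggml_backend_tensor_set(embd, tokens, 0, n_tokens*ggml_element_size(embd));

    ggml_backend_graph_compute(backend, gf);

    struct ggml_tensor * logits = ggml_graph_get_tensor(gf, "logits");
    ggml_backend_tensor_get(logits, out, (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));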
gpt2_kv_cache kv_cache;
- struct ggml_context * ctx;
+ struct ggml_context * ctx_w;
ggml_backend_t backend = NULL;
return false;
}
- auto & ctx = model.ctx;
+ auto & ctx = model.ctx_w;
size_t buffer_size = 0;
/*.no_alloc =*/ true,
};
- model.ctx = ggml_init(params);
- if (!model.ctx) {
+ model.ctx_w = ggml_init(params);
+ if (!model.ctx_w) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return false;
}
// allocate the tensors into the backend buffer
{
- ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.kv_cache.buffer);
+ ggml_tallocr * alloc = ggml_tallocr_new(model.kv_cache.buffer);
// this updates the pointers in the tensors to point to the correct location in the buffer
// this is necessary since the ggml_context is .no_alloc == true
// note that the buffer can actually be a device buffer, depending on the backend
- ggml_allocr_alloc(alloc, model.kv_cache.k);
- ggml_allocr_alloc(alloc, model.kv_cache.v);
+ ggml_tallocr_alloc(alloc, model.kv_cache.k);
+ ggml_tallocr_alloc(alloc, model.kv_cache.v);
- ggml_allocr_free(alloc);
+ ggml_tallocr_free(alloc);
}
}
// load weights
{
- ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_w);
+ ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_w);
size_t total_size = 0;
return false;
}
- ggml_allocr_alloc(alloc, tensor);
+ ggml_tallocr_alloc(alloc, tensor);
if (ggml_backend_is_cpu (model.backend)
#ifdef GGML_USE_METAL
// GPT-2 models share the WTE tensor as the LM head
if (name == "model/wte" && has_lm_head == false) {
- //ggml_allocr_alloc(alloc, model.lm_head);
+ //ggml_tallocr_alloc(alloc, model.lm_head);
//ggml_backend_tensor_copy(tensor, model.lm_head);
model.lm_head = tensor;
}
total_size += ggml_nbytes(tensor);
}
- ggml_allocr_free(alloc);
+ ggml_tallocr_free(alloc);
printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
}
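ggml_tallocr is the remaining low-level piece: a simple linear (bump) allocator over an already-created backend buffer, for tensors placed by hand rather than by a graph allocator. A sketch of the pointer-style API used in this patch:

    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size);

    ggml_tallocr * alloc = ggml_tallocr_new(buffer);
    ggml_tallocr_alloc(alloc, tensor_a); // tensors must come from a .no_alloc context
    ggml_tallocr_alloc(alloc, tensor_b);
    ggml_tallocr_free(alloc);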
// build the computation graph
struct ggml_cgraph * gpt2_graph(
const gpt2_model & model,
- struct ggml_allocr * allocr,
- const gpt2_batch & batch) {
+ const gpt2_batch & batch,
+ bool measure) {
const auto & hparams = model.hparams;
const int n_embd = hparams.n_embd;
const auto & kv_cache = model.kv_cache;
const int32_t n_tokens = batch.n_tokens;
- const int32_t n_kv = ggml_allocr_is_measure(allocr) ? n_ctx : kv_cache.n;
- const int32_t kv_head = ggml_allocr_is_measure(allocr) ? n_ctx - n_tokens : kv_cache.head;
+ const int32_t n_kv = measure ? n_ctx : kv_cache.n;
+ const int32_t kv_head = measure ? n_ctx - n_tokens : kv_cache.head;
// since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
static size_t buf_size = ggml_tensor_overhead()*GPT2_MAX_NODES + ggml_graph_overhead_custom(GPT2_MAX_NODES, false);
struct ggml_init_params params = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf.data(),
- /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
+ /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
};
struct ggml_context * ctx0 = ggml_init(params);
struct ggml_tensor * inpL;
if (batch.token) {
struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
- ggml_allocr_alloc(allocr, inp_tokens);
- if (!ggml_allocr_is_measure(allocr)) {
- ggml_backend_tensor_set(inp_tokens, batch.token, 0, n_tokens*ggml_element_size(inp_tokens));
- }
+ ggml_set_name(inp_tokens, "inp_tokens");
+ ggml_set_input(inp_tokens);
struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
- ggml_allocr_alloc(allocr, position);
- if (!ggml_allocr_is_measure(allocr)) {
- for (int i = 0; i < n_tokens; ++i) {
- int32_t v = batch.pos[i];
- ggml_backend_tensor_set(position, &v, i*sizeof(int32_t), sizeof(v));
- }
- }
+ ggml_set_name(position, "position");
+ ggml_set_input(position);
// wte + wpe
inpL =
GGML_ASSERT(batch.embd);
inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
-
- ggml_allocr_alloc(allocr, inpL);
- if (!ggml_allocr_is_measure(allocr)) {
- ggml_backend_tensor_set(inpL, batch.embd, 0, n_tokens * n_embd * ggml_element_size(inpL));
- }
+ ggml_set_name(inpL, "embd");
+ ggml_set_input(inpL);
}
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
ggml_set_name(KQ_mask, "KQ_mask");
- ggml_allocr_alloc(allocr, KQ_mask);
- if (!ggml_allocr_is_measure(allocr)) {
- std::vector<float> data_buf(n_kv*n_tokens);
- const float neg_inf_v = -INFINITY;
+ ggml_set_input(KQ_mask);
- for (int h = 0; h < 1; ++h) {
- int h_offset = h*(n_kv*n_tokens);
- for (int j = 0; j < n_tokens; ++j) {
- const gpt2_pos pos = batch.pos[j];
- const gpt2_seq_id seq_id = batch.seq_id[j];
-
- for (int i = 0; i < n_kv; ++i) {
- if (!kv_cache.cells[i].has_seq_id(seq_id) || kv_cache.cells[i].pos > pos) {
- data_buf[h_offset + j*n_kv + i] = neg_inf_v;
- }
- }
- }
- }
-
- ggml_backend_tensor_set(KQ_mask, data_buf.data(), 0, data_buf.size() * sizeof(float));
- }
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur;
// 0 - success
// < 0 - error
int gpt2_decode(
- struct gpt2_model & model,
- struct ggml_allocr * allocr,
+ struct gpt2_model & model,
+ ggml_gallocr_t allocr,
struct gpt2_batch batch,
int n_threads,
std::vector<float> & logits) {
cache.n = cache.head + n_tokens;
- // reset the allocator to free all the memory allocated during the previous inference
- ggml_allocr_reset(allocr);
-
- struct ggml_cgraph * gf = gpt2_graph(model, allocr, batch);
+ struct ggml_cgraph * gf = gpt2_graph(model, batch, false);
// allocate tensors
- ggml_allocr_alloc_graph(allocr, gf);
+ ggml_gallocr_alloc_graph(allocr, gf);
+
+ // set the graph inputs
+ if (batch.token) {
+ struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
+ ggml_backend_tensor_set(inp_tokens, batch.token, 0, n_tokens*ggml_element_size(inp_tokens));
+
+ struct ggml_tensor * position = ggml_graph_get_tensor(gf, "position");
+ for (int i = 0; i < n_tokens; ++i) {
+ int32_t v = batch.pos[i];
+ ggml_backend_tensor_set(position, &v, i*sizeof(int32_t), sizeof(v));
+ }
+ } else {
+ struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");
+ ggml_backend_tensor_set(embd, batch.embd, 0, n_tokens * hparams.n_embd * ggml_element_size(embd));
+ }
+
+ {
+ struct ggml_tensor * KQ_mask = ggml_graph_get_tensor(gf, "KQ_mask");
+ const auto & kv_cache = model.kv_cache;
+ const int32_t n_tokens = batch.n_tokens;
+ const int32_t n_kv = kv_cache.n;
+
+ std::vector<float> data_buf(n_kv*n_tokens);
+ const float neg_inf_v = -INFINITY;
+
+ for (int h = 0; h < 1; ++h) {
+ int h_offset = h*(n_kv*n_tokens);
+ for (int j = 0; j < n_tokens; ++j) {
+ const gpt2_pos pos = batch.pos[j];
+ const gpt2_seq_id seq_id = batch.seq_id[j];
+
+ for (int i = 0; i < n_kv; ++i) {
+ if (!kv_cache.cells[i].has_seq_id(seq_id) || kv_cache.cells[i].pos > pos) {
+ data_buf[h_offset + j*n_kv + i] = neg_inf_v;
+ }
+ }
+ }
+ }
+
+ ggml_backend_tensor_set(KQ_mask, data_buf.data(), 0, data_buf.size() * sizeof(float));
+ }
// run the computation
if (ggml_backend_is_cpu(model.backend)) {
// tokenize the prompt
std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
- // keep this buffer alive while evaluating the model
- ggml_backend_buffer_t buf_compute;
-
const int n_parallel = params.n_parallel;
const int n_batch_max = std::max(embd_inp.size(), (size_t)n_parallel);
gpt2_batch batch = gpt2_batch_init(n_batch_max, 0);
// prepare required memory and allocate the compute buffer
- struct ggml_allocr * allocr = NULL;
+ ggml_gallocr_t allocr = NULL;
{
- // create an allocator to measure the memory usage
- allocr = ggml_allocr_new_measure_from_backend(model.backend);
+ // create a graph allocator with the backend's default buffer type
-
- batch.n_tokens = n_batch_max;
+ allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
// create the worst case graph for memory usage estimation
- struct ggml_cgraph * gf = gpt2_graph(model, allocr, batch);
-
- // compute the required memory
- size_t mem_size = ggml_allocr_alloc_graph(allocr, gf);
-
- // recreate the allocator with the required memory
- ggml_allocr_free(allocr);
- buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size);
- allocr = ggml_allocr_new_from_buffer(buf_compute);
+ batch.n_tokens = n_batch_max;
+ struct ggml_cgraph * gf = gpt2_graph(model, batch, true);
+ // pre-allocate the compute buffer for the worst case (optional)
+ ggml_gallocr_reserve(allocr, gf);
+ size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);
fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0);
}
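With ggml_allocr_is_measure() gone, worst-case sizing is now requested explicitly: the measure flag on gpt2_graph makes the KV views span the full context instead of the live cache state. Roughly:

    // worst case: pretend the whole context is in use
    batch.n_tokens = n_batch_max;
    struct ggml_cgraph * gf = gpt2_graph(model, batch, /*measure=*/true);
    ggml_gallocr_reserve(allocr, gf);

    // real decode: use the actual kv_cache state
    gf = gpt2_graph(model, batch, /*measure=*/false);
    ggml_gallocr_alloc_graph(allocr, gf);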
}
gpt2_batch_free(batch);
- ggml_free(model.ctx);
+ ggml_free(model.ctx_w);
+ ggml_gallocr_free(allocr);
ggml_backend_buffer_free(model.buffer_w);
ggml_backend_buffer_free(model.kv_cache.buffer);
- ggml_backend_buffer_free(buf_compute);
ggml_backend_free(model.backend);
return 0;
struct ggml_tensor * memory_v;
//
- struct ggml_context * ctx;
+ struct ggml_context * ctx_w;
std::map<std::string, struct ggml_tensor *> tensors;
};
return false;
}
- auto & ctx = model.ctx;
+ auto & ctx = model.ctx_w;
size_t ctx_size = 0;
/*.no_alloc =*/ false,
};
- model.ctx = ggml_init(params);
- if (!model.ctx) {
+ model.ctx_w = ggml_init(params);
+ if (!model.ctx_w) {
fprintf(stderr, "%s: ggml_init() failed\n", __func__);
return false;
}
printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
}
- ggml_free(model.ctx);
+ ggml_free(model.ctx_w);
return 0;
}
--- /dev/null
+#include "ggml/ggml.h"
+#include "ggml/ggml-alloc.h"
+#include "ggml/ggml-backend.h"
+
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include "common.h"
+#include "common-ggml.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <memory>
+#include <string>
+#include <vector>
+
+#if defined(_MSC_VER)
+#pragma warning(disable: 4244 4267) // possible loss of data
+#endif
+
+#define GPT2_MAX_NODES 4096
+
+static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
+ (void) level;
+ (void) user_data;
+ fputs(text, stderr);
+ fflush(stderr);
+}
+
+// default hparams (GPT-2 117M)
+struct gpt2_hparams {
+ int32_t n_vocab = 50257;
+ int32_t n_ctx = 1024;
+ int32_t n_embd = 768;
+ int32_t n_head = 12;
+ int32_t n_layer = 12;
+ int32_t ftype = 1;
+ float eps = 1e-5f;
+};
+
+struct gpt2_layer {
+ // normalization
+ struct ggml_tensor * ln_1_g;
+ struct ggml_tensor * ln_1_b;
+
+ struct ggml_tensor * ln_2_g;
+ struct ggml_tensor * ln_2_b;
+
+ // attention
+ struct ggml_tensor * c_attn_attn_w;
+ struct ggml_tensor * c_attn_attn_b;
+
+ struct ggml_tensor * c_attn_proj_w;
+ struct ggml_tensor * c_attn_proj_b;
+
+ // mlp
+ struct ggml_tensor * c_mlp_fc_w;
+ struct ggml_tensor * c_mlp_fc_b;
+
+ struct ggml_tensor * c_mlp_proj_w;
+ struct ggml_tensor * c_mlp_proj_b;
+};
+
+struct gpt2_model {
+ gpt2_hparams hparams;
+
+ // normalization
+ struct ggml_tensor * ln_f_g;
+ struct ggml_tensor * ln_f_b;
+
+ struct ggml_tensor * wte; // token embedding
+ struct ggml_tensor * wpe; // position embedding
+ struct ggml_tensor * lm_head; // language model head
+
+ std::vector<gpt2_layer> layers;
+
+ // key + value memory
+ struct ggml_tensor * memory_k;
+ struct ggml_tensor * memory_v;
+
+ //
+ struct ggml_context * ctx_w;
+
+ std::vector<ggml_backend_t> backends;
+ std::vector<ggml_backend_buffer_t> buffers_w;
+ ggml_backend_buffer_t buffer_kv;
+ ggml_backend_buffer_t buffer_input;
+
+ std::map<std::string, struct ggml_tensor *> tensors;
+
+ // inputs/constants
+ struct ggml_tensor * embd;
+ struct ggml_tensor * position;
+};
+
+void init_backends(gpt2_model & model, const gpt_params & params) {
+ ggml_backend_t gpu_backend = NULL;
+
+ // initialize the backends
+#ifdef GGML_USE_CUBLAS
+ if (params.n_gpu_layers > 0) {
+ fprintf(stderr, "%s: using CUDA backend\n", __func__);
+ gpu_backend = ggml_backend_cuda_init(0);
+ if (!gpu_backend) {
+ fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+ }
+ }
+#endif
+
+#ifdef GGML_USE_METAL
+ if (params.n_gpu_layers > 0) {
+ fprintf(stderr, "%s: using Metal backend\n", __func__);
+ ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
+ gpu_backend = ggml_backend_metal_init();
+ if (!gpu_backend) {
+ fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+ } else {
+ ggml_backend_metal_set_n_cb(gpu_backend, params.n_threads);
+ }
+ }
+#endif
+ if (gpu_backend) {
+ model.backends.push_back(gpu_backend);
+ }
+
+ // always add the CPU backend as a fallback
+ ggml_backend_t cpu_backend = ggml_backend_cpu_init();
+ ggml_backend_cpu_set_n_threads(cpu_backend, params.n_threads);
+ model.backends.push_back(cpu_backend);
+}
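Note the ordering contract the loading code below relies on: the GPU backend (if one was created) is pushed first and a CPU backend is always appended last, so model.backends.front() is the preferred backend and model.backends.back() is guaranteed to be the CPU fallback. A hypothetical call site:

    gpt2_model model;
    gpt_params params;
    params.n_threads    = 4;
    params.n_gpu_layers = 12; // 0 leaves only the CPU backend

    init_backends(model, params);
    // model.backends is { gpu, cpu } or just { cpu }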
+
+// load the model's weights from a file
+bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, const gpt_params & params) {
+ printf("%s: loading model from '%s'\n", __func__, fname.c_str());
+
+ auto fin = std::ifstream(fname, std::ios::binary);
+ if (!fin) {
+ fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
+ return false;
+ }
+
+ // verify magic
+ {
+ uint32_t magic;
+ fin.read((char *) &magic, sizeof(magic));
+ if (magic != GGML_FILE_MAGIC) {
+ fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
+ return false;
+ }
+ }
+
+ // load hparams
+ {
+ auto & hparams = model.hparams;
+
+ fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
+ fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
+ fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
+ fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
+ fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
+ fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
+
+ const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
+
+ printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
+ printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
+ printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
+ printf("%s: n_head = %d\n", __func__, hparams.n_head);
+ printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
+ printf("%s: ftype = %d\n", __func__, hparams.ftype);
+ printf("%s: qntvr = %d\n", __func__, qntvr);
+
+ hparams.ftype %= GGML_QNT_VERSION_FACTOR;
+ }
+
+ // load vocab
+ {
+ int32_t n_vocab = 0;
+ fin.read((char *) &n_vocab, sizeof(n_vocab));
+
+ if (n_vocab != model.hparams.n_vocab) {
+ fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
+ __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
+ return false;
+ }
+
+ std::string word;
+ std::vector<char> buf(128);
+
+ for (int i = 0; i < n_vocab; i++) {
+ uint32_t len;
+ fin.read((char *) &len, sizeof(len));
+
+ buf.resize(len);
+ fin.read((char *) buf.data(), len);
+ word.assign(buf.data(), len);
+
+ vocab.token_to_id[word] = i;
+ vocab.id_to_token[i] = word;
+ }
+ }
+
+ // for the big tensors, we have the option to store the data in 16-bit floats or quantized
+ // in order to save memory and also to speed up the computation
+ ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
+ if (wtype == GGML_TYPE_COUNT) {
+ fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
+ __func__, fname.c_str(), model.hparams.ftype);
+ return false;
+ }
+
+ auto & ctx = model.ctx_w;
+
+ // create the ggml context
+ {
+ size_t n_tensors = 3 /* input */ + 2 /* kv */ + 6 + 12*model.hparams.n_layer;
+ struct ggml_init_params params = {
+ /*.mem_size =*/ ggml_tensor_overhead() * n_tensors,
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+
+ model.ctx_w = ggml_init(params);
+ if (!model.ctx_w) {
+ fprintf(stderr, "%s: ggml_init() failed\n", __func__);
+ return false;
+ }
+ }
+
+ // create tensors for the weights
+ {
+ const auto & hparams = model.hparams;
+
+ const int n_embd = hparams.n_embd;
+ const int n_layer = hparams.n_layer;
+ const int n_ctx = hparams.n_ctx;
+ const int n_vocab = hparams.n_vocab;
+
+ model.layers.resize(n_layer);
+
+ model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+ model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+ model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
+ model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
+ model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
+
+ // map by name
+ model.tensors["model/ln_f/g"] = model.ln_f_g;
+ model.tensors["model/ln_f/b"] = model.ln_f_b;
+
+ model.tensors["model/wte"] = model.wte;
+ model.tensors["model/wpe"] = model.wpe;
+ model.tensors["model/lm_head"] = model.lm_head;
+
+ for (int i = 0; i < n_layer; ++i) {
+ auto & layer = model.layers[i];
+
+ layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+ layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+ layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+ layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+ layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
+ layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
+
+ layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
+ layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+ layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
+ layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
+
+ layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
+ layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
+
+ // map by name
+ model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
+ model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
+
+ model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
+ model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
+
+ model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
+ model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
+
+ model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
+ model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
+
+ model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
+ model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
+
+ model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
+ model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
+ }
+ }
+
+ // assign tensors to backends
+ init_backends(model, params);
+ ggml_backend_t backend_gpu = model.backends.front();
+ ggml_backend_t backend_cpu = model.backends.back();
+ std::map<std::string, ggml_backend_t> tensor_backends;
+ {
+ const int i_gpu_first_layer = model.hparams.n_layer - params.n_gpu_layers;
+ for (auto it : model.tensors) {
+ const std::string & name = it.first;
+ // input tensors
+ if (name == "model/wte" || name == "model/wpe") {
+ if (params.n_gpu_layers > model.hparams.n_layer) {
+ tensor_backends[name] = backend_gpu;
+ } else {
+ tensor_backends[name] = backend_cpu;
+ }
+ }
+ // output tensors
+ if (name == "model/ln_f/g" || name == "model/ln_f/b" || name == "model/lm_head") {
+ if (params.n_gpu_layers > 0) {
+ tensor_backends[name] = backend_gpu;
+ } else {
+ tensor_backends[name] = backend_cpu;
+ }
+ }
+ // layer tensors
+ if (name.substr(0, 7) == "model/h") {
+ // parse layer number
+ int layer = std::stoi(name.substr(7, 2));
+ if (layer >= i_gpu_first_layer) {
+ tensor_backends[name] = backend_gpu;
+ } else {
+ tensor_backends[name] = backend_cpu;
+ }
+ }
+ }
+ }
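As a worked example (hypothetical settings): with n_layer = 12 and n_gpu_layers = 8, i_gpu_first_layer = 12 - 8 = 4, so layers model/h4 through model/h11 are assigned to the GPU and model/h0 through model/h3 stay on the CPU; the output tensors go to the GPU because n_gpu_layers > 0, while model/wte and model/wpe stay on the CPU because n_gpu_layers is not greater than n_layer.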
+
+ // allocate buffers
+ std::map<ggml_backend_t, std::unique_ptr<ggml_tallocr, decltype(&ggml_tallocr_free)>> backend_buffers;
+ for (auto backend : model.backends) {
+ // compute the size of the buffer
+ size_t size = 0;
+ for (auto it : model.tensors) {
+ if (tensor_backends[it.first] == backend) {
+ size += ggml_nbytes(it.second) + 512;
+ }
+ }
+ if (size > 0) {
+ printf("%s: %8s buffer size = %8.2f MB\n", __func__, ggml_backend_name(backend), size/1024.0/1024.0);
+ // allocate the buffer
+ ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size);
+ ggml_backend_buffer_set_usage(buffer, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+ model.buffers_w.push_back(buffer);
+
+ // create an allocator for the buffer to allocate the tensors
+ auto alloc = std::unique_ptr<ggml_tallocr, decltype(&ggml_tallocr_free)>(ggml_tallocr_new(buffer), ggml_tallocr_free);
+ backend_buffers.insert(std::make_pair(backend, std::move(alloc)));
+ } else {
+ model.buffers_w.push_back(NULL);
+ }
+ }
+
+ // allocate key + value memory
+ {
+ const auto & hparams = model.hparams;
+
+ const int n_embd = hparams.n_embd;
+ const int n_layer = hparams.n_layer;
+ const int n_ctx = hparams.n_ctx;
+
+ const int n_mem = n_layer*n_ctx;
+ const int n_elements = n_embd*n_mem;
+
+ model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
+ model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
+
+ ggml_set_name(model.memory_k, "model/memory_k");
+ ggml_set_name(model.memory_v, "model/memory_v");
+
+ const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
+
+ printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
+
+ // create a backend buffer (can be in host or device memory)
+ ggml_backend_t backend_kv = params.n_gpu_layers >= hparams.n_layer/2 ? backend_gpu : backend_cpu;
+ printf("%s: backend_kv = %s\n", __func__, ggml_backend_name(backend_kv));
+ model.buffer_kv = ggml_backend_alloc_buffer(backend_kv, memory_size + 512*2);
+
+ // allocate the tensors into the backend buffer
+ {
+ ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_kv);
+
+ // this updates the pointers in the tensors to point to the correct location in the buffer
+ // this is necessary since the ggml_context is .no_alloc == true
+ // note that the buffer can actually be a device buffer, depending on the backend
+ ggml_tallocr_alloc(alloc, model.memory_k);
+ ggml_tallocr_alloc(alloc, model.memory_v);
+
+ ggml_tallocr_free(alloc);
+ }
+ }
+
+ // load weights
+ {
+ size_t total_size = 0;
+
+ bool has_lm_head = false;
+
+ std::vector<char> read_buf;
+
+ while (true) {
+ int32_t n_dims;
+ int32_t length;
+ int32_t ttype;
+
+ fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
+ fin.read(reinterpret_cast<char *>(&length), sizeof(length));
+ fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
+
+ if (fin.eof()) {
+ break;
+ }
+
+ int32_t nelements = 1;
+ int32_t ne[2] = { 1, 1 };
+ for (int i = 0; i < n_dims; ++i) {
+ fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
+ nelements *= ne[i];
+ }
+
+ std::string name(length, 0);
+ fin.read(&name[0], length);
+
+ if (model.tensors.find(name) == model.tensors.end()) {
+ fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
+ return false;
+ }
+
+ auto tensor = model.tensors[name];
+ ggml_set_name(tensor, name.c_str());
+ if (ggml_nelements(tensor) != nelements) {
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
+ return false;
+ }
+
+ if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
+ fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
+ __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
+ return false;
+ }
+
+ // for debugging
+ if (0) {
+ printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
+ }
+
+ const size_t bpe = ggml_type_size(ggml_type(ttype));
+
+ if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
+ fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
+ __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
+ return false;
+ }
+
+ // allocate the tensor
+ ggml_backend_t backend = tensor_backends[name];
+ ggml_tallocr * alloc = backend_buffers.find(backend)->second.get();
+ ggml_tallocr_alloc(alloc, tensor);
+ //printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str());
+
+ if (ggml_backend_is_cpu(backend)
+#ifdef GGML_USE_METAL
+ || ggml_backend_is_metal(backend)
+#endif
+ ) {
+ // for the CPU and Metal backend, we can read directly into the tensor
+ fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+ } else {
+ // read into a temporary buffer first, then copy to device memory
+ read_buf.resize(ggml_nbytes(tensor));
+ fin.read(read_buf.data(), ggml_nbytes(tensor));
+ ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor));
+ }
+
+ // GPT-2 models share the WTE tensor as the LM head
+ if (name == "model/wte" && has_lm_head == false) {
+ ggml_tallocr_alloc(backend_buffers.find(tensor_backends["model/lm_head"])->second.get(), model.lm_head);
+ //printf("%s: [%5.5s] %s (copied)\n", __func__, ggml_backend_name(tensor_backends["model/lm_head"]), "model/lm_head");
+ ggml_backend_tensor_copy(tensor, model.lm_head);
+ total_size += ggml_nbytes(model.lm_head);
+ }
+
+ if (name == "model/lm_head") {
+ has_lm_head = true;
+ }
+
+ total_size += ggml_nbytes(tensor);
+ }
+ printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
+ }
+
+ fin.close();
+
+ // allocate input tensors
+ {
+ model.embd = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model.hparams.n_ctx);
+ model.position = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model.hparams.n_ctx);
+
+ ggml_set_name(model.embd, "in/embd");
+ ggml_set_name(model.position, "in/position");
+
+ // add input tensors to cpu backend
+ size_t input_size = ggml_nbytes(model.embd) + ggml_nbytes(model.position);
+
+ // FIXME: use cpu backend after sched impl
+ ggml_backend_t backend_input = params.n_gpu_layers >= model.hparams.n_layer ? backend_gpu : backend_cpu;
+ model.buffer_input = ggml_backend_alloc_buffer(backend_input, input_size + 512*3);
+ printf("%s: backend_in = %s (%zu bytes)\n", __func__, ggml_backend_name(backend_input), input_size);
+
+ // allocate the tensors into the backend buffer
+ ggml_tallocr * alloc = ggml_tallocr_new(model.buffer_input);
+ ggml_tallocr_alloc(alloc, model.embd);
+ ggml_tallocr_alloc(alloc, model.position);
+ ggml_tallocr_free(alloc);
+ }
+
+ return true;
+}
+
+// build the computation graph
+struct ggml_cgraph * gpt2_graph(
+ const gpt2_model & model,
+ const int n_past,
+ const std::vector<gpt_vocab::id> & embd_inp) {
+ const int N = embd_inp.size();
+
+ const auto & hparams = model.hparams;
+
+ const int n_embd = hparams.n_embd;
+ const int n_layer = hparams.n_layer;
+ const int n_ctx = hparams.n_ctx;
+ const int n_head = hparams.n_head;
+
+ // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
+ static size_t buf_size = ggml_tensor_overhead()*GPT2_MAX_NODES + ggml_graph_overhead_custom(GPT2_MAX_NODES, false);
+ static std::vector<uint8_t> buf(buf_size);
+
+ struct ggml_init_params params = {
+ /*.mem_size =*/ buf_size,
+ /*.mem_buffer =*/ buf.data(),
+ /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
+ };
+
+ struct ggml_context * ctx0 = ggml_init(params);
+
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GPT2_MAX_NODES, false);
+
+ struct ggml_tensor * embd = ggml_view_1d(ctx0, model.embd, N, 0);
+
+ // set inputs
+ // TODO: move to gpt2_eval
+ ggml_backend_tensor_set(model.embd, embd_inp.data(), 0, N*ggml_element_size(embd));
+
+ struct ggml_tensor * position = ggml_view_1d(ctx0, model.position, N, 0);
+ for (int i = 0; i < N; ++i) {
+ int32_t v = n_past + i;
+ ggml_backend_tensor_set(model.position, &v, i*sizeof(int32_t), sizeof(v));
+ }
+
+ const float KQ_scale = 1.0f/sqrtf(float(model.hparams.n_embd)/model.hparams.n_head);
+
+ // wte + wpe
+ struct ggml_tensor * inpL =
+ ggml_add(ctx0,
+ ggml_get_rows(ctx0, model.wte, embd),
+ ggml_get_rows(ctx0, model.wpe, position));
+ ggml_set_name(inpL, "inpL");
+ ggml_set_name(inpL->src[0], "wte");
+ ggml_set_name(inpL->src[1], "wpe");
+
+ for (int il = 0; il < n_layer; ++il) {
+ struct ggml_tensor * cur;
+
+ // norm
+ {
+ // [ 768, N]
+ cur = ggml_norm(ctx0, inpL, hparams.eps);
+ ggml_format_name(cur, "l%d.norm", il);
+
+ // cur = ln_1_g*cur + ln_1_b
+ // [ 768, N]
+ cur = ggml_add(ctx0,
+ ggml_mul(ctx0,
+ cur,
+ model.layers[il].ln_1_g),
+ model.layers[il].ln_1_b);
+ ggml_format_name(cur, "l%d.ln_1_b", il);
+ ggml_format_name(cur->src[0], "l%d.ln_1_g", il);
+ }
+
+ // attn
+ // [2304, 768] - model.layers[il].c_attn_attn_w
+ // [2304, 1] - model.layers[il].c_attn_attn_b
+ // [ 768, N] - cur (in)
+ // [2304, N] - cur (out)
+ //
+ // cur = attn_w*cur + attn_b
+ // [2304, N]
+ {
+ cur = ggml_mul_mat(ctx0,
+ model.layers[il].c_attn_attn_w,
+ cur);
+ ggml_format_name(cur, "l%d.attn_w", il);
+
+ cur = ggml_add(ctx0,
+ cur,
+ model.layers[il].c_attn_attn_b);
+ ggml_format_name(cur, "l%d.attn_b", il);
+ }
+
+ // self-attention
+ {
+ struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
+ struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
+ struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
+
+ ggml_format_name(Qcur, "l%d.Qcur", il);
+ ggml_format_name(Kcur, "l%d.Kcur", il);
+ ggml_format_name(Vcur, "l%d.Vcur", il);
+
+ // store key and value to memory
+ if (N >= 1) {
+ struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
+ struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
+
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+ }
+
+ // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
+ // [64, N, 12]
+ struct ggml_tensor * Q =
+ ggml_permute(ctx0,
+ ggml_cpy(ctx0,
+ Qcur,
+ ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
+ 0, 2, 1, 3);
+ ggml_format_name(Q, "l%d.Q", il);
+
+ // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
+ // [64, n_past + N, 12]
+ struct ggml_tensor * K =
+ ggml_permute(ctx0,
+ ggml_reshape_3d(ctx0,
+ ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
+ n_embd/n_head, n_head, n_past + N),
+ 0, 2, 1, 3);
+ ggml_format_name(K, "l%d.K", il);
+
+ // GG: flash attention
+ //struct ggml_tensor * V =
+ // ggml_cpy(ctx0,
+ // ggml_permute(ctx0,
+ // ggml_reshape_3d(ctx0,
+ // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+ // n_embd/n_head, n_head, n_past + N),
+ // 1, 2, 0, 3),
+ // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
+
+ //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
+
+ // K * Q
+ // [n_past + N, N, 12]
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ ggml_format_name(KQ, "l%d.KQ", il);
+
+ // KQ_scaled = KQ / sqrt(n_embd/n_head)
+ // [n_past + N, N, 12]
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+ ggml_format_name(KQ_scaled, "l%d.KQ_scaled", il);
+
+ // KQ_masked = mask_past(KQ_scaled)
+ // [n_past + N, N, 12]
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+ ggml_format_name(KQ_masked, "l%d.KQ_masked", il);
+
+ // KQ = soft_max(KQ_masked)
+ // [n_past + N, N, 12]
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+ ggml_format_name(KQ_soft_max, "l%d.KQ_soft_max", il);
+
+ // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
+ // [n_past + N, 64, 12]
+ struct ggml_tensor * V_trans =
+ ggml_cpy(ctx0,
+ ggml_permute(ctx0,
+ ggml_reshape_3d(ctx0,
+ ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+ n_embd/n_head, n_head, n_past + N),
+ 1, 2, 0, 3),
+ ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
+ ggml_format_name(V_trans, "l%d.V_trans", il);
+
+ // KQV = transpose(V) * KQ_soft_max
+ // [64, N, 12]
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+ ggml_format_name(KQV, "l%d.KQV", il);
+
+ // KQV_merged = KQV.permute(0, 2, 1, 3)
+ // [64, 12, N]
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ ggml_format_name(KQV_merged, "l%d.KQV_merged", il);
+
+ // cur = KQV_merged.contiguous().view(n_embd, N)
+ // [768, N]
+ cur = ggml_cpy(ctx0,
+ KQV_merged,
+ ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+ ggml_format_name(cur, "l%d.KQV_merged_contiguous", il);
+ }
+
+ // projection
+ // [ 768, 768] - model.layers[il].c_attn_proj_w
+ // [ 768, 1] - model.layers[il].c_attn_proj_b
+ // [ 768, N] - cur (in)
+ // [ 768, N] - cur (out)
+ //
+ // cur = proj_w*cur + proj_b
+ // [768, N]
+ {
+ cur = ggml_mul_mat(ctx0,
+ model.layers[il].c_attn_proj_w,
+ cur);
+ ggml_format_name(cur, "l%d.attn_proj_w", il);
+
+ cur = ggml_add(ctx0,
+ cur,
+ model.layers[il].c_attn_proj_b);
+ ggml_format_name(cur, "l%d.attn_proj_b", il);
+ }
+
+ // add the input
+ cur = ggml_add(ctx0, cur, inpL);
+ ggml_format_name(cur, "l%d.add", il);
+
+ struct ggml_tensor * inpFF = cur;
+
+ // feed-forward network
+ {
+ // norm
+ {
+ cur = ggml_norm(ctx0, inpFF, hparams.eps);
+ ggml_format_name(cur, "l%d.FFnorm", il);
+
+ // cur = ln_2_g*cur + ln_2_b
+ // [ 768, N]
+ cur = ggml_add(ctx0,
+ ggml_mul(ctx0,
+ cur,
+ model.layers[il].ln_2_g),
+ model.layers[il].ln_2_b);
+ ggml_format_name(cur, "l%d.ln_2_b", il);
+ ggml_format_name(cur->src[0], "l%d.ln_2_g", il);
+ }
+
+ // fully connected
+ // [3072, 768] - model.layers[il].c_mlp_fc_w
+ // [3072, 1] - model.layers[il].c_mlp_fc_b
+ // [ 768, N] - cur (in)
+ // [3072, N] - cur (out)
+ //
+ // cur = fc_w*cur + fc_b
+ // [3072, N]
+ cur = ggml_mul_mat(ctx0,
+ model.layers[il].c_mlp_fc_w,
+ cur);
+ ggml_format_name(cur, "l%d.mlp_fc_w", il);
+
+ cur = ggml_add(ctx0,
+ cur,
+ model.layers[il].c_mlp_fc_b);
+ ggml_format_name(cur, "l%d.mlp_fc_b", il);
+
+ // GELU activation
+ // [3072, N]
+ cur = ggml_gelu(ctx0, cur);
+ ggml_format_name(cur, "l%d.gelu", il);
+
+ // projection
+ // [ 768, 3072] - model.layers[il].c_mlp_proj_w
+ // [ 768, 1] - model.layers[il].c_mlp_proj_b
+ // [3072, N] - cur (in)
+ // [ 768, N] - cur (out)
+ //
+ // cur = proj_w*cur + proj_b
+ // [768, N]
+ cur = ggml_mul_mat(ctx0,
+ model.layers[il].c_mlp_proj_w,
+ cur);
+ ggml_format_name(cur, "l%d.mlp_proj_w", il);
+
+ cur = ggml_add(ctx0,
+ cur,
+ model.layers[il].c_mlp_proj_b);
+ ggml_format_name(cur, "l%d.mlp_proj_b", il);
+ }
+
+ // input for next layer
+ inpL = ggml_add(ctx0, cur, inpFF);
+ ggml_format_name(inpL, "l%d.add2", il);
+ }
+
+ // norm
+ {
+ // [ 768, N]
+ inpL = ggml_norm(ctx0, inpL, hparams.eps);
+ ggml_format_name(inpL, "out_norm");
+
+ // inpL = ln_f_g*inpL + ln_f_b
+ // [ 768, N]
+ inpL = ggml_add(ctx0,
+ ggml_mul(ctx0,
+ inpL,
+ model.ln_f_g),
+ model.ln_f_b);
+ ggml_format_name(inpL, "out_ln_f_b");
+ ggml_format_name(inpL->src[0], "out_ln_f_g");
+ }
+
+ // inpL = WTE * inpL
+ // [ 768, 50257] - model.lm_head
+ // [ 768, N] - inpL
+ inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
+ ggml_format_name(inpL, "out_lm_head");
+
+ // logits -> probs
+ //inpL = ggml_soft_max(ctx0, inpL);
+
+ ggml_build_forward_expand(gf, inpL);
+
+ ggml_free(ctx0);
+
+ return gf;
+}
+
+// evaluate the transformer
+//
+// - model: the model
+// - sched: the backend scheduler
+// - n_past: the context size so far
+// - embd_inp: the ids of the tokens in the context
+// - embd_w: the predicted logits for the next token
+//
+bool gpt2_eval(
+ const gpt2_model & model,
+ ggml_backend_sched_t sched,
+ const int n_past,
+ const std::vector<gpt_vocab::id> & embd_inp,
+ std::vector<float> & embd_w) {
+ const int N = embd_inp.size();
+
+ const auto & hparams = model.hparams;
+
+ const int n_vocab = hparams.n_vocab;
+
+ struct ggml_cgraph * gf = gpt2_graph(model, n_past, embd_inp);
+
+ // run the computation
+ ggml_backend_sched_graph_compute(sched, gf);
+
+ //if (n_past%100 == 0) {
+ // ggml_graph_print (&gf);
+ // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
+ //}
+
+ // in this case, the output tensor is the last one in the graph
+ struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1];
+
+ //embd_w.resize(n_vocab*N);
+ //ggml_backend_tensor_get(inpL, embd_w.data(), 0, sizeof(float)*n_vocab*N);
+
+ // return result just for the last token
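+    // the output is an [n_vocab, N] matrix of logits, so the logits of the last token
+    // start (N-1)*n_vocab floats into the tensor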
+ embd_w.resize(n_vocab);
+ ggml_backend_tensor_get(inpL, embd_w.data(), (n_vocab*(N-1))*sizeof(float), sizeof(float)*n_vocab);
+
+ return true;
+}
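+
+// a minimal usage sketch of the function above (illustrative only; assumes `model`,
+// `sched` and `vocab` are initialized as in main() below):
+//
+//   std::vector<gpt_vocab::id> tokens = ::gpt_tokenize(vocab, "I believe");
+//   std::vector<float> logits;
+//   if (!gpt2_eval(model, sched, 0 /*n_past*/, tokens, logits)) { /* handle error */ }
+//   // logits now holds n_vocab values for the token following the prompt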
+
+int main(int argc, char ** argv) {
+ ggml_time_init();
+
+ const int64_t t_main_start_us = ggml_time_us();
+
+ gpt_params params;
+ params.model = "models/gpt-2-117M/ggml-model.bin";
+
+ if (gpt_params_parse(argc, argv, params) == false) {
+ return 1;
+ }
+
+ if (params.seed < 0) {
+ params.seed = time(NULL);
+ }
+
+ printf("%s: seed = %d\n", __func__, params.seed);
+
+ std::mt19937 rng(params.seed);
+ if (params.prompt.empty()) {
+ params.prompt = gpt_random_prompt(rng);
+ }
+
+ int64_t t_load_us = 0;
+
+ gpt_vocab vocab;
+ gpt2_model model;
+
+ // load the model
+ {
+ const int64_t t_start_us = ggml_time_us();
+
+ if (!gpt2_model_load(params.model, model, vocab, params)) {
+ fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
+ return 1;
+ }
+
+ t_load_us = ggml_time_us() - t_start_us;
+
+ test_gpt_tokenizer(vocab, params.token_test);
+ }
+
+ // create the backend scheduler
+ // the scheduler handles the allocation of the compute buffers and the scheduling of the computation between the different backends
+ ggml_backend_sched_t sched;
+ {
+ // initialize the scheduler
+ sched = ggml_backend_sched_new(model.backends.data(), NULL, model.backends.size(), GPT2_MAX_NODES);
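+        // passing NULL for the buffer types selects each backend's default buffer type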
+
+        // create a worst-case graph for memory usage estimation
+ int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
+ int n_past = model.hparams.n_ctx - n_tokens;
+ struct ggml_cgraph * gf = gpt2_graph(model, n_past, std::vector<gpt_vocab::id>(n_tokens, 0));
+
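+        // reserving pre-allocates the compute buffers for the worst-case graph without
+        // actually running it; later, smaller graphs reuse the same buffers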
+ ggml_backend_sched_reserve(sched, gf);
+
+ // compute the required memory
+ size_t mem_size = 0;
+ for (size_t i = 0; i < model.backends.size(); i++) {
+ size_t size = ggml_backend_sched_get_buffer_size(sched, model.backends[i]);
+ if (size > 0) {
+ mem_size += size;
+ printf("%s: %8s compute buffer size = %8.2f MB\n", __func__, ggml_backend_name(model.backends[i]), size/1024.0/1024.0);
+ //printf("%s: %8s compute buffer size = %zu bytes\n", __func__, ggml_backend_name(model.backends[i]), size);
+ }
+ }
+
+ printf("%s: total compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0);
+ }
+
+ int n_past = 0;
+
+ int64_t t_sample_us = 0;
+ int64_t t_predict_us = 0;
+
+ std::vector<float> logits;
+
+ // tokenize the prompt
+ std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
+
+ params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
+
+ printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
+ printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size());
+ for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) {
+ printf("%d ", embd_inp[i]);
+ }
+ printf("\n\n");
+
+    // submit the input prompt in batches of at most n_batch tokens
+ // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
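+    // (the compute buffers were reserved above for a worst-case graph of n_batch tokens,
+    // so evaluation never submits more than n_batch tokens at a time)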
+ std::vector<gpt_vocab::id> embd;
+
+ for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
+ // predict
+ if (embd.size() > 0) {
+ const int64_t t_start_us = ggml_time_us();
+
+ if (!gpt2_eval(model, sched, n_past, embd, logits)) {
+ printf("Failed to predict\n");
+ return 1;
+ }
+
+ t_predict_us += ggml_time_us() - t_start_us;
+ }
+
+ n_past += embd.size();
+ embd.clear();
+
+ if (i >= embd_inp.size()) {
+ // sample next token
+ const int top_k = params.top_k;
+ const float top_p = params.top_p;
+ const float temp = params.temp;
+
+ const int n_vocab = model.hparams.n_vocab;
+
+ gpt_vocab::id id = 0;
+
+ {
+ const int64_t t_start_sample_us = ggml_time_us();
+
+ id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
+
+ t_sample_us += ggml_time_us() - t_start_sample_us;
+ }
+
+ // add it to the context
+ embd.push_back(id);
+ } else {
+ // if here, it means we are still processing the input prompt
+ for (size_t k = i; k < embd_inp.size(); k++) {
+ embd.push_back(embd_inp[k]);
+ if (int32_t(embd.size()) >= params.n_batch) {
+ break;
+ }
+ }
+ i += embd.size() - 1;
+ }
+
+ // display text
+ for (auto id : embd) {
+ printf("%s", vocab.id_to_token[id].c_str());
+ }
+ fflush(stdout);
+
+        // end of text token (GPT-2's <|endoftext|>, id 50256)
+ if (embd.back() == 50256) {
+ break;
+ }
+ }
+
+ // report timing
+ {
+ const int64_t t_main_end_us = ggml_time_us();
+
+ printf("\n\n");
+ printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
+ printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
+ printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
+ printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
+ }
+
+ ggml_free(model.ctx_w);
+
+ ggml_backend_sched_free(sched);
+ ggml_backend_buffer_free(model.buffer_kv);
+ for (auto buf : model.buffers_w) {
+ ggml_backend_buffer_free(buf);
+ }
+ for (auto backend : model.backends) {
+ ggml_backend_free(backend);
+ }
+
+ return 0;
+}
+++ /dev/null
-#include "ggml/ggml.h"
-#include "ggml/ggml-alloc.h"
-#include "ggml/ggml-backend.h"
-
-#ifdef GGML_USE_CUBLAS
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
-#include "common.h"
-#include "common-ggml.h"
-
-#include <cassert>
-#include <cmath>
-#include <cstdio>
-#include <cstring>
-#include <fstream>
-#include <map>
-#include <string>
-#include <vector>
-
-#if defined(_MSC_VER)
-#pragma warning(disable: 4244 4267) // possible loss of data
-#endif
-
-#define GPT2_MAX_NODES 4096
-
-static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
- (void) level;
- (void) user_data;
- fputs(text, stderr);
- fflush(stderr);
-}
-
-// default hparams (GPT-2 117M)
-struct gpt2_hparams {
- int32_t n_vocab = 50257;
- int32_t n_ctx = 1024;
- int32_t n_embd = 768;
- int32_t n_head = 12;
- int32_t n_layer = 12;
- int32_t ftype = 1;
- float eps = 1e-5f;
-};
-
-struct gpt2_layer {
- // normalization
- struct ggml_tensor * ln_1_g;
- struct ggml_tensor * ln_1_b;
-
- struct ggml_tensor * ln_2_g;
- struct ggml_tensor * ln_2_b;
-
- // attention
- struct ggml_tensor * c_attn_attn_w;
- struct ggml_tensor * c_attn_attn_b;
-
- struct ggml_tensor * c_attn_proj_w;
- struct ggml_tensor * c_attn_proj_b;
-
- // mlp
- struct ggml_tensor * c_mlp_fc_w;
- struct ggml_tensor * c_mlp_fc_b;
-
- struct ggml_tensor * c_mlp_proj_w;
- struct ggml_tensor * c_mlp_proj_b;
-};
-
-struct gpt2_model {
- gpt2_hparams hparams;
-
- // normalization
- struct ggml_tensor * ln_f_g;
- struct ggml_tensor * ln_f_b;
-
- struct ggml_tensor * wte; // position embedding
- struct ggml_tensor * wpe; // token embedding
- struct ggml_tensor * lm_head; // language model head
-
- std::vector<gpt2_layer> layers;
-
- // key + value memory
- struct ggml_tensor * memory_k;
- struct ggml_tensor * memory_v;
-
- //
- struct ggml_context * ctx;
-
- std::vector<ggml_backend_t> backends;
- std::vector<ggml_backend_buffer_t> buffers_w;
- ggml_backend_buffer_t buffer_kv;
- ggml_backend_buffer_t buffer_input;
-
- std::map<std::string, struct ggml_tensor *> tensors;
-
- // inputs/constants
- struct ggml_tensor * embd;
- struct ggml_tensor * position;
-};
-
-void init_backends(gpt2_model & model, const gpt_params & params) {
- ggml_backend_t gpu_backend = NULL;
-
- // initialize the backends
-#ifdef GGML_USE_CUBLAS
- if (params.n_gpu_layers > 0) {
- fprintf(stderr, "%s: using CUDA backend\n", __func__);
- gpu_backend = ggml_backend_cuda_init(0);
- if (!gpu_backend) {
- fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
- }
- }
-#endif
-
-#ifdef GGML_USE_METAL
- if (params.n_gpu_layers > 0) {
- fprintf(stderr, "%s: using Metal backend\n", __func__);
- ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
- gpu_backend = ggml_backend_metal_init();
- if (!gpu_backend) {
- fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
- } else {
- ggml_backend_metal_set_n_cb(gpu_backend, params.n_threads);
- }
- }
-#endif
- if (gpu_backend) {
- model.backends.push_back(gpu_backend);
- }
-
- // always add the CPU backend as a fallback
- ggml_backend_t cpu_backend = ggml_backend_cpu_init();
- ggml_backend_cpu_set_n_threads(cpu_backend, params.n_threads);
- model.backends.push_back(cpu_backend);
-}
-
-// load the model's weights from a file
-bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, const gpt_params & params) {
- printf("%s: loading model from '%s'\n", __func__, fname.c_str());
-
- auto fin = std::ifstream(fname, std::ios::binary);
- if (!fin) {
- fprintf(stderr, "%s: failed to open '%s'\n", __func__, fname.c_str());
- return false;
- }
-
- // verify magic
- {
- uint32_t magic;
- fin.read((char *) &magic, sizeof(magic));
- if (magic != GGML_FILE_MAGIC) {
- fprintf(stderr, "%s: invalid model file '%s' (bad magic)\n", __func__, fname.c_str());
- return false;
- }
- }
-
- // load hparams
- {
- auto & hparams = model.hparams;
-
- fin.read((char *) &hparams.n_vocab, sizeof(hparams.n_vocab));
- fin.read((char *) &hparams.n_ctx, sizeof(hparams.n_ctx));
- fin.read((char *) &hparams.n_embd, sizeof(hparams.n_embd));
- fin.read((char *) &hparams.n_head, sizeof(hparams.n_head));
- fin.read((char *) &hparams.n_layer, sizeof(hparams.n_layer));
- fin.read((char *) &hparams.ftype, sizeof(hparams.ftype));
-
- const int32_t qntvr = hparams.ftype / GGML_QNT_VERSION_FACTOR;
-
- printf("%s: n_vocab = %d\n", __func__, hparams.n_vocab);
- printf("%s: n_ctx = %d\n", __func__, hparams.n_ctx);
- printf("%s: n_embd = %d\n", __func__, hparams.n_embd);
- printf("%s: n_head = %d\n", __func__, hparams.n_head);
- printf("%s: n_layer = %d\n", __func__, hparams.n_layer);
- printf("%s: ftype = %d\n", __func__, hparams.ftype);
- printf("%s: qntvr = %d\n", __func__, qntvr);
-
- hparams.ftype %= GGML_QNT_VERSION_FACTOR;
- }
-
- // load vocab
- {
- int32_t n_vocab = 0;
- fin.read((char *) &n_vocab, sizeof(n_vocab));
-
- if (n_vocab != model.hparams.n_vocab) {
- fprintf(stderr, "%s: invalid model file '%s' (bad vocab size %d != %d)\n",
- __func__, fname.c_str(), n_vocab, model.hparams.n_vocab);
- return false;
- }
-
- std::string word;
- std::vector<char> buf(128);
-
- for (int i = 0; i < n_vocab; i++) {
- uint32_t len;
- fin.read((char *) &len, sizeof(len));
-
- buf.resize(len);
- fin.read((char *) buf.data(), len);
- word.assign(buf.data(), len);
-
- vocab.token_to_id[word] = i;
- vocab.id_to_token[i] = word;
- }
- }
-
- // for the big tensors, we have the option to store the data in 16-bit floats or quantized
- // in order to save memory and also to speed up the computation
- ggml_type wtype = ggml_ftype_to_ggml_type((ggml_ftype) (model.hparams.ftype));
- if (wtype == GGML_TYPE_COUNT) {
- fprintf(stderr, "%s: invalid model file '%s' (bad ftype value %d)\n",
- __func__, fname.c_str(), model.hparams.ftype);
- return false;
- }
-
- auto & ctx = model.ctx;
-
- // create the ggml context
- {
- size_t n_tensors = 3 /* input */ + 2 /* kv */ + 6 + 12*model.hparams.n_layer;
- struct ggml_init_params params = {
- /*.mem_size =*/ ggml_tensor_overhead() * n_tensors,
- /*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ true,
- };
-
- model.ctx = ggml_init(params);
- if (!model.ctx) {
- fprintf(stderr, "%s: ggml_init() failed\n", __func__);
- return false;
- }
- }
-
- // create tensors for the weights
- {
- const auto & hparams = model.hparams;
-
- const int n_embd = hparams.n_embd;
- const int n_layer = hparams.n_layer;
- const int n_ctx = hparams.n_ctx;
- const int n_vocab = hparams.n_vocab;
-
- model.layers.resize(n_layer);
-
- model.ln_f_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
- model.ln_f_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
- model.wte = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
- model.wpe = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_ctx);
- model.lm_head = ggml_new_tensor_2d(ctx, wtype, n_embd, n_vocab);
-
- // map by name
- model.tensors["model/ln_f/g"] = model.ln_f_g;
- model.tensors["model/ln_f/b"] = model.ln_f_b;
-
- model.tensors["model/wte"] = model.wte;
- model.tensors["model/wpe"] = model.wpe;
- model.tensors["model/lm_head"] = model.lm_head;
-
- for (int i = 0; i < n_layer; ++i) {
- auto & layer = model.layers[i];
-
- layer.ln_1_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
- layer.ln_1_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
- layer.ln_2_g = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
- layer.ln_2_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
- layer.c_attn_attn_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 3*n_embd);
- layer.c_attn_attn_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 3*n_embd);
-
- layer.c_attn_proj_w = ggml_new_tensor_2d(ctx, wtype, n_embd, n_embd);
- layer.c_attn_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
- layer.c_mlp_fc_w = ggml_new_tensor_2d(ctx, wtype, n_embd, 4*n_embd);
- layer.c_mlp_fc_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4*n_embd);
-
- layer.c_mlp_proj_w = ggml_new_tensor_2d(ctx, wtype, 4*n_embd, n_embd);
- layer.c_mlp_proj_b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_embd);
-
- // map by name
- model.tensors["model/h" + std::to_string(i) + "/ln_1/g"] = layer.ln_1_g;
- model.tensors["model/h" + std::to_string(i) + "/ln_1/b"] = layer.ln_1_b;
-
- model.tensors["model/h" + std::to_string(i) + "/ln_2/g"] = layer.ln_2_g;
- model.tensors["model/h" + std::to_string(i) + "/ln_2/b"] = layer.ln_2_b;
-
- model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/w"] = layer.c_attn_attn_w;
- model.tensors["model/h" + std::to_string(i) + "/attn/c_attn/b"] = layer.c_attn_attn_b;
-
- model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/w"] = layer.c_attn_proj_w;
- model.tensors["model/h" + std::to_string(i) + "/attn/c_proj/b"] = layer.c_attn_proj_b;
-
- model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/w"] = layer.c_mlp_fc_w;
- model.tensors["model/h" + std::to_string(i) + "/mlp/c_fc/b"] = layer.c_mlp_fc_b;
-
- model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/w"] = layer.c_mlp_proj_w;
- model.tensors["model/h" + std::to_string(i) + "/mlp/c_proj/b"] = layer.c_mlp_proj_b;
- }
- }
-
- // assign tensors to backends
- init_backends(model, params);
- ggml_backend_t backend_gpu = model.backends.front();
- ggml_backend_t backend_cpu = model.backends.back();
- std::map<std::string, ggml_backend_t> tensor_backends;
- {
- const int i_gpu_first_layer = model.hparams.n_layer - params.n_gpu_layers;
- for (auto it : model.tensors) {
- const std::string & name = it.first;
- // input tensors
- if (name == "model/wte" || name == "model/wpe") {
- if (params.n_gpu_layers > model.hparams.n_layer) {
- tensor_backends[name] = backend_gpu;
- } else {
- tensor_backends[name] = backend_cpu;
- }
- }
- // output tensors
- if (name == "model/ln_f/g" || name == "model/ln_f/b" || name == "model/lm_head") {
- if (params.n_gpu_layers > 0) {
- tensor_backends[name] = backend_gpu;
- } else {
- tensor_backends[name] = backend_cpu;
- }
- }
- // layer tensors
- if (name.substr(0, 7) == "model/h") {
- // parse layer number
- int layer = std::stoi(name.substr(7, 2));
- if (layer >= i_gpu_first_layer) {
- tensor_backends[name] = backend_gpu;
- } else {
- tensor_backends[name] = backend_cpu;
- }
- }
- }
- }
-
- // allocate buffers
- std::map<ggml_backend_t, std::unique_ptr<ggml_allocr, decltype(&ggml_allocr_free)>> backend_buffers;
- for (auto backend : model.backends) {
- // compute the size of the buffer
- size_t size = 0;
- for (auto it : model.tensors) {
- if (tensor_backends[it.first] == backend) {
- size += ggml_nbytes(it.second) + 512;
- }
- }
- if (size > 0) {
- printf("%s: %8s buffer size = %8.2f MB\n", __func__, ggml_backend_name(backend), size/1024.0/1024.0);
- // allocate the buffer
- ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, size);
- model.buffers_w.push_back(buffer);
-
- // create an allocator for the buffer to allocate the tensors
- auto alloc = std::unique_ptr<ggml_allocr, decltype(&ggml_allocr_free)>(ggml_allocr_new_from_buffer(buffer), ggml_allocr_free);
- backend_buffers.insert(std::make_pair(backend, std::move(alloc)));
- } else {
- model.buffers_w.push_back(NULL);
- }
- }
-
- // allocate key + value memory
- {
- const auto & hparams = model.hparams;
-
- const int n_embd = hparams.n_embd;
- const int n_layer = hparams.n_layer;
- const int n_ctx = hparams.n_ctx;
-
- const int n_mem = n_layer*n_ctx;
- const int n_elements = n_embd*n_mem;
-
- model.memory_k = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
- model.memory_v = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, n_elements);
-
- ggml_set_name(model.memory_k, "model/memory_k");
- ggml_set_name(model.memory_v, "model/memory_v");
-
- const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
-
- printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
-
- // create a backend buffer (can be in host or device memory)
- ggml_backend_t backend_kv = params.n_gpu_layers >= hparams.n_layer/2 ? backend_gpu : backend_cpu;
- printf("%s: backend_kv = %s\n", __func__, ggml_backend_name(backend_kv));
- model.buffer_kv = ggml_backend_alloc_buffer(backend_kv, memory_size + 512*2);
-
- // allocate the tensors into the backend buffer
- {
- ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_kv);
-
- // this updates the pointers in the tensors to point to the correct location in the buffer
- // this is necessary since the ggml_context is .no_alloc == true
- // note that the buffer can actually be a device buffer, depending on the backend
- ggml_allocr_alloc(alloc, model.memory_k);
- ggml_allocr_alloc(alloc, model.memory_v);
-
- ggml_allocr_free(alloc);
- }
- }
-
- // load weights
- {
- size_t total_size = 0;
-
- bool has_lm_head = false;
-
- std::vector<char> read_buf;
-
- while (true) {
- int32_t n_dims;
- int32_t length;
- int32_t ttype;
-
- fin.read(reinterpret_cast<char *>(&n_dims), sizeof(n_dims));
- fin.read(reinterpret_cast<char *>(&length), sizeof(length));
- fin.read(reinterpret_cast<char *>(&ttype), sizeof(ttype));
-
- if (fin.eof()) {
- break;
- }
-
- int32_t nelements = 1;
- int32_t ne[2] = { 1, 1 };
- for (int i = 0; i < n_dims; ++i) {
- fin.read(reinterpret_cast<char *>(&ne[i]), sizeof(ne[i]));
- nelements *= ne[i];
- }
-
- std::string name(length, 0);
- fin.read(&name[0], length);
-
- if (model.tensors.find(name) == model.tensors.end()) {
- fprintf(stderr, "%s: unknown tensor '%s' in model file\n", __func__, name.c_str());
- return false;
- }
-
- auto tensor = model.tensors[name];
- ggml_set_name(tensor, name.c_str());
- if (ggml_nelements(tensor) != nelements) {
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
- return false;
- }
-
- if (tensor->ne[0] != ne[0] || tensor->ne[1] != ne[1]) {
- fprintf(stderr, "%s: tensor '%s' has wrong shape in model file: got [%d, %d], expected [%d, %d]\n",
- __func__, name.c_str(), (int) tensor->ne[0], (int) tensor->ne[1], ne[0], ne[1]);
- return false;
- }
-
- // for debugging
- if (0) {
- printf("%24s - [%5d, %5d], type = %6s, %6.2f MB, %9zu bytes\n", name.c_str(), ne[0], ne[1], ggml_type_name(ggml_type(ttype)), ggml_nbytes(tensor)/1024.0/1024.0, ggml_nbytes(tensor));
- }
-
- const size_t bpe = ggml_type_size(ggml_type(ttype));
-
- if ((nelements*bpe)/ggml_blck_size(tensor->type) != ggml_nbytes(tensor)) {
- fprintf(stderr, "%s: tensor '%s' has wrong size in model file: got %zu, expected %zu\n",
- __func__, name.c_str(), ggml_nbytes(tensor), nelements*bpe);
- return false;
- }
-
- // allocate the tensor
- ggml_backend_t backend = tensor_backends[name];
- ggml_allocr * alloc = backend_buffers.find(backend)->second.get();
- ggml_allocr_alloc(alloc, tensor);
- //printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str());
-
- if (ggml_backend_is_cpu(backend)
-#ifdef GGML_USE_METAL
- || ggml_backend_is_metal(backend)
-#endif
- ) {
- // for the CPU and Metal backend, we can read directly into the tensor
- fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
- } else {
- // read into a temporary buffer first, then copy to device memory
- read_buf.resize(ggml_nbytes(tensor));
- fin.read(read_buf.data(), ggml_nbytes(tensor));
- ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor));
- }
-
- // GPT-2 models share the WTE tensor as the LM head
- if (name == "model/wte" && has_lm_head == false) {
- ggml_allocr_alloc(backend_buffers.find(tensor_backends["model/lm_head"])->second.get(), model.lm_head);
- //printf("%s: [%5.5s] %s (copied)\n", __func__, ggml_backend_name(tensor_backends["model/lm_head"]), "model/lm_head");
- ggml_backend_tensor_copy(tensor, model.lm_head);
- total_size += ggml_nbytes(model.lm_head);
- }
-
- if (name == "model/lm_head") {
- has_lm_head = true;
- }
-
- total_size += ggml_nbytes(tensor);
- }
- printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
- }
-
- fin.close();
-
- // allocate input tensors
- {
- model.embd = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model.hparams.n_ctx);
- model.position = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, model.hparams.n_ctx);
-
- ggml_set_name(model.embd, "in/embd");
- ggml_set_name(model.position, "in/position");
-
- // add input tensors to cpu backend
- size_t input_size = ggml_nbytes(model.embd) + ggml_nbytes(model.position);
-
- // FIXME: use cpu backend after sched impl
- ggml_backend_t backend_input = params.n_gpu_layers >= model.hparams.n_layer ? backend_gpu : backend_cpu;
- model.buffer_input = ggml_backend_alloc_buffer(backend_input, input_size + 512*3);
- printf("%s: backend_in = %s (%zu bytes)\n", __func__, ggml_backend_name(backend_input), input_size);
-
- // allocate the tensors into the backend buffer
- ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_input);
- ggml_allocr_alloc(alloc, model.embd);
- ggml_allocr_alloc(alloc, model.position);
- ggml_allocr_free(alloc);
- }
-
- return true;
-}
-
-// build the computation graph
-struct ggml_cgraph * gpt2_graph(
- const gpt2_model & model,
- const int n_past,
- const std::vector<gpt_vocab::id> & embd_inp) {
- const int N = embd_inp.size();
-
- const auto & hparams = model.hparams;
-
- const int n_embd = hparams.n_embd;
- const int n_layer = hparams.n_layer;
- const int n_ctx = hparams.n_ctx;
- const int n_head = hparams.n_head;
-
- // since we are using ggml-alloc, this buffer only needs enough space to hold the ggml_tensor and ggml_cgraph structs, but not the tensor data
- static size_t buf_size = ggml_tensor_overhead()*GPT2_MAX_NODES + ggml_graph_overhead_custom(GPT2_MAX_NODES, false);
- static std::vector<uint8_t> buf(buf_size);
-
- struct ggml_init_params params = {
- /*.mem_size =*/ buf_size,
- /*.mem_buffer =*/ buf.data(),
- /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
- };
-
- struct ggml_context * ctx0 = ggml_init(params);
-
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GPT2_MAX_NODES, false);
-
- struct ggml_tensor * embd = ggml_view_1d(ctx0, model.embd, N, 0);
-
- // TODO: avoid writing to tensors if we are only measuring the memory usage
- // not critical, just a minor optimization
-
- //if (!ggml_allocr_is_measure(allocr)) {
- //ggml_backend_tensor_set(embd, embd_inp.data(), 0, N*ggml_element_size(embd));
- ggml_backend_tensor_set(model.embd, embd_inp.data(), 0, N*ggml_element_size(embd)); // FIXME: cannot use the view here because it's not initialized yet (buffer not set), but we should
- //}
- //memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
-
- struct ggml_tensor * position = ggml_view_1d(ctx0, model.position, N, 0);
- //if (!ggml_allocr_is_measure(allocr)) {
- for (int i = 0; i < N; ++i) {
- int32_t v = n_past + i;
- ggml_backend_tensor_set(model.position, &v, i*sizeof(int32_t), sizeof(v)); // FIXME: same
- //((int32_t *) position->data)[i] = n_past + i;
- }
- //}
-
- const float KQ_scale = 1.0f/sqrtf(float(model.hparams.n_embd)/model.hparams.n_head);
-
- // wte + wpe
- struct ggml_tensor * inpL =
- ggml_add(ctx0,
- ggml_get_rows(ctx0, model.wte, embd),
- ggml_get_rows(ctx0, model.wpe, position));
- ggml_set_name(inpL, "inpL");
- ggml_set_name(inpL->src[0], "wte");
- ggml_set_name(inpL->src[1], "wpe");
-
- for (int il = 0; il < n_layer; ++il) {
- struct ggml_tensor * cur;
-
- // norm
- {
- // [ 768, N]
- cur = ggml_norm(ctx0, inpL, hparams.eps);
- ggml_format_name(cur, "l%d.norm", il);
-
- // cur = ln_1_g*cur + ln_1_b
- // [ 768, N]
- cur = ggml_add(ctx0,
- ggml_mul(ctx0,
- cur,
- model.layers[il].ln_1_g),
- model.layers[il].ln_1_b);
- ggml_format_name(cur, "l%d.ln_1_b", il);
- ggml_format_name(cur->src[0], "l%d.ln_1_g", il);
- }
-
- // attn
- // [2304, 768] - model.layers[il].c_attn_attn_w
- // [2304, 1] - model.layers[il].c_attn_attn_b
- // [ 768, N] - cur (in)
- // [2304, N] - cur (out)
- //
- // cur = attn_w*cur + attn_b
- // [2304, N]
- {
- cur = ggml_mul_mat(ctx0,
- model.layers[il].c_attn_attn_w,
- cur);
- ggml_format_name(cur, "l%d.attn_w", il);
-
- cur = ggml_add(ctx0,
- cur,
- model.layers[il].c_attn_attn_b);
- ggml_format_name(cur, "l%d.attn_b", il);
- }
-
- // self-attention
- {
- struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
- struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
- struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
-
- ggml_format_name(Qcur, "l%d.Qcur", il);
- ggml_format_name(Kcur, "l%d.Kcur", il);
- ggml_format_name(Vcur, "l%d.Vcur", il);
-
- // store key and value to memory
- if (N >= 1) {
- struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
- struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
-
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
- }
-
- // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
- // [64, N, 12]
- struct ggml_tensor * Q =
- ggml_permute(ctx0,
- ggml_cpy(ctx0,
- Qcur,
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
- 0, 2, 1, 3);
- ggml_format_name(Q, "l%d.Q", il);
-
- // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
- // [64, n_past + N, 12]
- struct ggml_tensor * K =
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
- n_embd/n_head, n_head, n_past + N),
- 0, 2, 1, 3);
- ggml_format_name(K, "l%d.K", il);
-
- // GG: flash attention
- //struct ggml_tensor * V =
- // ggml_cpy(ctx0,
- // ggml_permute(ctx0,
- // ggml_reshape_3d(ctx0,
- // ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
- // n_embd/n_head, n_head, n_past + N),
- // 1, 2, 0, 3),
- // ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_past + N, n_embd/n_head, n_head));
-
- //struct ggml_tensor * KQV = ggml_flash_attn(ctx0, Q, K, V, true);
-
- // K * Q
- // [n_past + N, N, 12]
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
- ggml_format_name(KQ, "l%d.KQ", il);
-
- // KQ_scaled = KQ / sqrt(n_embd/n_head)
- // [n_past + N, N, 12]
- struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
- ggml_format_name(KQ_scaled, "l%d.KQ_scaled", il);
-
- // KQ_masked = mask_past(KQ_scaled)
- // [n_past + N, N, 12]
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
- ggml_format_name(KQ_masked, "l%d.KQ_masked", il);
-
- // KQ = soft_max(KQ_masked)
- // [n_past + N, N, 12]
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
- ggml_format_name(KQ_soft_max, "l%d.KQ_soft_max", il);
-
- // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
- // [n_past + N, 64, 12]
- struct ggml_tensor * V_trans =
- ggml_cpy(ctx0,
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
- n_embd/n_head, n_head, n_past + N),
- 1, 2, 0, 3),
- ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
- ggml_format_name(V_trans, "l%d.V_trans", il);
-
- // KQV = transpose(V) * KQ_soft_max
- // [64, N, 12]
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
- ggml_format_name(KQV, "l%d.KQV", il);
-
- // KQV_merged = KQV.permute(0, 2, 1, 3)
- // [64, 12, N]
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
- ggml_format_name(KQV_merged, "l%d.KQV_merged", il);
-
- // cur = KQV_merged.contiguous().view(n_embd, N)
- // [768, N]
- cur = ggml_cpy(ctx0,
- KQV_merged,
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
- ggml_format_name(cur, "l%d.KQV_merged_contiguous", il);
- }
-
- // projection
- // [ 768, 768] - model.layers[il].c_attn_proj_w
- // [ 768, 1] - model.layers[il].c_attn_proj_b
- // [ 768, N] - cur (in)
- // [ 768, N] - cur (out)
- //
- // cur = proj_w*cur + proj_b
- // [768, N]
- {
- cur = ggml_mul_mat(ctx0,
- model.layers[il].c_attn_proj_w,
- cur);
- ggml_format_name(cur, "l%d.attn_proj_w", il);
-
- cur = ggml_add(ctx0,
- cur,
- model.layers[il].c_attn_proj_b);
- ggml_format_name(cur, "l%d.attn_proj_b", il);
- }
-
- // add the input
- cur = ggml_add(ctx0, cur, inpL);
- ggml_format_name(cur, "l%d.add", il);
-
- struct ggml_tensor * inpFF = cur;
-
- // feed-forward network
- {
- // norm
- {
- cur = ggml_norm(ctx0, inpFF, hparams.eps);
- ggml_format_name(cur, "l%d.FFnorm", il);
-
- // cur = ln_2_g*cur + ln_2_b
- // [ 768, N]
- cur = ggml_add(ctx0,
- ggml_mul(ctx0,
- cur,
- model.layers[il].ln_2_g),
- model.layers[il].ln_2_b);
- ggml_format_name(cur, "l%d.ln_2_b", il);
- ggml_format_name(cur->src[0], "l%d.ln_2_g", il);
- }
-
- // fully connected
- // [3072, 768] - model.layers[il].c_mlp_fc_w
- // [3072, 1] - model.layers[il].c_mlp_fc_b
- // [ 768, N] - cur (in)
- // [3072, N] - cur (out)
- //
- // cur = fc_w*cur + fc_b
- // [3072, N]
- cur = ggml_mul_mat(ctx0,
- model.layers[il].c_mlp_fc_w,
- cur);
- ggml_format_name(cur, "l%d.mlp_fc_w", il);
-
- cur = ggml_add(ctx0,
- cur,
- model.layers[il].c_mlp_fc_b);
- ggml_format_name(cur, "l%d.mlp_fc_b", il);
-
- // GELU activation
- // [3072, N]
- cur = ggml_gelu(ctx0, cur);
- ggml_format_name(cur, "l%d.gelu", il);
-
- // projection
- // [ 768, 3072] - model.layers[il].c_mlp_proj_w
- // [ 768, 1] - model.layers[il].c_mlp_proj_b
- // [3072, N] - cur (in)
- // [ 768, N] - cur (out)
- //
- // cur = proj_w*cur + proj_b
- // [768, N]
- cur = ggml_mul_mat(ctx0,
- model.layers[il].c_mlp_proj_w,
- cur);
- ggml_format_name(cur, "l%d.mlp_proj_w", il);
-
- cur = ggml_add(ctx0,
- cur,
- model.layers[il].c_mlp_proj_b);
- ggml_format_name(cur, "l%d.mlp_proj_b", il);
- }
-
- // input for next layer
- inpL = ggml_add(ctx0, cur, inpFF);
- ggml_format_name(inpL, "l%d.add2", il);
- }
-
- // norm
- {
- // [ 768, N]
- inpL = ggml_norm(ctx0, inpL, hparams.eps);
- ggml_format_name(inpL, "out_norm");
-
- // inpL = ln_f_g*inpL + ln_f_b
- // [ 768, N]
- inpL = ggml_add(ctx0,
- ggml_mul(ctx0,
- inpL,
- model.ln_f_g),
- model.ln_f_b);
- ggml_format_name(inpL, "out_ln_f_b");
- ggml_format_name(inpL->src[0], "out_ln_f_g");
- }
-
- // inpL = WTE * inpL
- // [ 768, 50257] - model.lm_head
- // [ 768, N] - inpL
- inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
- ggml_format_name(inpL, "out_lm_head");
-
- // logits -> probs
- //inpL = ggml_soft_max(ctx0, inpL);
-
- ggml_build_forward_expand(gf, inpL);
-
- ggml_free(ctx0);
-
- return gf;
-}
-
-// evaluate the transformer
-//
-// - model: the model
-// - allocr: ggml_allocr to use to allocate the compute buffer
-// - n_threads: number of threads to use
-// - n_past: the context size so far
-// - embd_inp: the embeddings of the tokens in the context
-// - embd_w: the predicted logits for the next token
-//
-bool gpt2_eval(
- const gpt2_model & model,
- ggml_backend_sched_t sched,
- const int n_past,
- const std::vector<gpt_vocab::id> & embd_inp,
- std::vector<float> & embd_w) {
- const int N = embd_inp.size();
-
- const auto & hparams = model.hparams;
-
- const int n_vocab = hparams.n_vocab;
-
- struct ggml_cgraph * gf = gpt2_graph(model, n_past, embd_inp);
-
- // run the computation
- ggml_backend_sched_graph_compute(sched, gf);
-
- //if (n_past%100 == 0) {
- // ggml_graph_print (&gf);
- // ggml_graph_dump_dot(&gf, NULL, "gpt-2.dot");
- //}
-
- // in this case, the output tensor is the last one in the graph
- struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1];
-
- //embd_w.resize(n_vocab*N);
- //ggml_backend_tensor_get(inpL, embd_w.data(), 0, sizeof(float)*n_vocab*N);
-
- // return result just for the last token
- embd_w.resize(n_vocab);
- ggml_backend_tensor_get(inpL, embd_w.data(), (n_vocab*(N-1))*sizeof(float), sizeof(float)*n_vocab);
-
- return true;
-}
-
-int main(int argc, char ** argv) {
- ggml_time_init();
-
- const int64_t t_main_start_us = ggml_time_us();
-
- gpt_params params;
- params.model = "models/gpt-2-117M/ggml-model.bin";
-
- if (gpt_params_parse(argc, argv, params) == false) {
- return 1;
- }
-
- if (params.seed < 0) {
- params.seed = time(NULL);
- }
-
- printf("%s: seed = %d\n", __func__, params.seed);
-
- std::mt19937 rng(params.seed);
- if (params.prompt.empty()) {
- params.prompt = gpt_random_prompt(rng);
- }
-
- int64_t t_load_us = 0;
-
- gpt_vocab vocab;
- gpt2_model model;
-
- // load the model
- {
- const int64_t t_start_us = ggml_time_us();
-
- if (!gpt2_model_load(params.model, model, vocab, params)) {
- fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
- return 1;
- }
-
- t_load_us = ggml_time_us() - t_start_us;
-
- test_gpt_tokenizer(vocab, params.token_test);
- }
-
- // create the backend scheduler
- // the scheduler handles the allocation of the compute buffers and the scheduling of the computation between the different backends
- ggml_backend_sched_t sched;
- {
- // initialize the scheduler
- sched = ggml_backend_sched_new(model.backends.data(), NULL, model.backends.size(), GPT2_MAX_NODES);
-
- // create the worst case graph for memory usage estimation
- int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
- int n_past = model.hparams.n_ctx - n_tokens;
- struct ggml_cgraph * gf = gpt2_graph(model, n_past, std::vector<gpt_vocab::id>(n_tokens, 0));
-
- ggml_backend_sched_init_measure(sched, gf);
-
-
- // compute the required memory
- size_t mem_size = 0;
- for (size_t i = 0; i < model.backends.size(); i++) {
- ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(sched, model.backends[i]);
- size_t size = ggml_backend_buffer_get_size(buf);
- if (size > 0) {
- mem_size += size;
- printf("%s: %8s compute buffer size = %8.2f MB\n", __func__, ggml_backend_name(model.backends[i]), size/1024.0/1024.0);
- //printf("%s: %8s compute buffer size = %zu bytes\n", __func__, ggml_backend_name(model.backends[i]), size);
- }
- }
-
- printf("%s: total compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0);
- }
-
- int n_past = 0;
-
- int64_t t_sample_us = 0;
- int64_t t_predict_us = 0;
-
- std::vector<float> logits;
-
- // tokenize the prompt
- std::vector<gpt_vocab::id> embd_inp = ::gpt_tokenize(vocab, params.prompt);
-
- params.n_predict = std::min(params.n_predict, model.hparams.n_ctx - (int) embd_inp.size());
-
- printf("%s: prompt: '%s'\n", __func__, params.prompt.c_str());
- printf("%s: number of tokens in prompt = %zu, first 8 tokens: ", __func__, embd_inp.size());
- for (int i = 0; i < std::min(8, (int) embd_inp.size()); i++) {
- printf("%d ", embd_inp[i]);
- }
- printf("\n\n");
-
- // submit the input prompt token-by-token
- // this reduces the memory usage during inference, at the cost of a bit of speed at the beginning
- std::vector<gpt_vocab::id> embd;
-
- for (size_t i = embd.size(); i < embd_inp.size() + params.n_predict; i++) {
- // predict
- if (embd.size() > 0) {
- const int64_t t_start_us = ggml_time_us();
-
- if (!gpt2_eval(model, sched, n_past, embd, logits)) {
- printf("Failed to predict\n");
- return 1;
- }
-
- t_predict_us += ggml_time_us() - t_start_us;
- }
-
- n_past += embd.size();
- embd.clear();
-
- if (i >= embd_inp.size()) {
- // sample next token
- const int top_k = params.top_k;
- const float top_p = params.top_p;
- const float temp = params.temp;
-
- const int n_vocab = model.hparams.n_vocab;
-
- gpt_vocab::id id = 0;
-
- {
- const int64_t t_start_sample_us = ggml_time_us();
-
- id = gpt_sample_top_k_top_p(vocab, logits.data() + (logits.size() - n_vocab), top_k, top_p, temp, rng);
-
- t_sample_us += ggml_time_us() - t_start_sample_us;
- }
-
- // add it to the context
- embd.push_back(id);
- } else {
- // if here, it means we are still processing the input prompt
- for (size_t k = i; k < embd_inp.size(); k++) {
- embd.push_back(embd_inp[k]);
- if (int32_t(embd.size()) >= params.n_batch) {
- break;
- }
- }
- i += embd.size() - 1;
- }
-
- // display text
- for (auto id : embd) {
- printf("%s", vocab.id_to_token[id].c_str());
- }
- fflush(stdout);
-
- // end of text token
- if (embd.back() == 50256) {
- break;
- }
- }
-
- // report timing
- {
- const int64_t t_main_end_us = ggml_time_us();
-
- printf("\n\n");
- printf("%s: load time = %8.2f ms\n", __func__, t_load_us/1000.0f);
- printf("%s: sample time = %8.2f ms\n", __func__, t_sample_us/1000.0f);
- printf("%s: predict time = %8.2f ms / %.2f ms per token\n", __func__, t_predict_us/1000.0f, t_predict_us/1000.0f/n_past);
- printf("%s: total time = %8.2f ms\n", __func__, (t_main_end_us - t_main_start_us)/1000.0f);
- }
-
- ggml_free(model.ctx);
-
- ggml_backend_sched_free(sched);
- ggml_backend_buffer_free(model.buffer_kv);
- for (auto & buf : model.buffers_w) {
- ggml_backend_buffer_free(buf);
- }
- for (auto backend : model.backends) {
- ggml_backend_free(backend);
- }
-
- return 0;
-}
#include "ggml.h"
#include "ggml-alloc.h"
+#include "ggml-backend.h"
#define STB_IMAGE_IMPLEMENTATION
#include "stb_image.h"
#define STB_IMAGE_WRITE_IMPLEMENTATION
// buffer for `ggml_graph_plan.work_data`
std::vector<uint8_t> work_buffer;
// buffers to evaluate the model
- std::vector<uint8_t> buf_alloc_img_enc;
std::vector<uint8_t> buf_compute_img_enc;
- std::vector<uint8_t> buf_alloc_fast;
std::vector<uint8_t> buf_compute_fast;
- struct ggml_allocr * allocr = {};
+ ggml_gallocr_t allocr = {};
};
// void save_tensor(sam_state& state, struct ggml_tensor * t, struct ggml_cgraph * gf) {
const auto & hparams = model.hparams;
const auto & enc = model.enc_prompt;
- const int32_t n_img_embd = hparams.n_img_embd();
- const float n_img_embd_inv = 1.0f / n_img_embd;
+ const int32_t n_img_embd = hparams.n_img_embd();
struct ggml_tensor * xy_embed_stacked = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, 2, n_img_embd, n_img_embd);
- ggml_allocr_alloc(state.allocr, xy_embed_stacked);
-
- if (!ggml_allocr_is_measure(state.allocr)) {
- float * data = (float *) ggml_get_data(xy_embed_stacked);
- for (int i = 0; i < n_img_embd; ++i) {
- const int row = 2*i*n_img_embd;
- const float y_val = 2 * (i + 0.5f) * n_img_embd_inv - 1;
- for (int j = 0; j < n_img_embd; ++j) {
- const float x_val = 2 * (j + 0.5f) * n_img_embd_inv - 1;
- data[row + 2*j + 0] = x_val;
- data[row + 2*j + 1] = y_val;
- }
- }
- }
+ ggml_set_name(xy_embed_stacked, "xy_embed_stacked");
+ ggml_set_input(xy_embed_stacked);
struct ggml_tensor * cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, enc.pe)), xy_embed_stacked);
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
struct ggml_tensor * inp = ggml_new_tensor_4d(ctx0, GGML_TYPE_F32, n_img_size, n_img_size, 3, 1);
- ggml_allocr_alloc(state.allocr, inp);
- if (!ggml_allocr_is_measure(state.allocr)) {
- float * data = (float *) ggml_get_data(inp);
-
- const int nx = img.nx;
- const int ny = img.ny;
- const int n = nx*ny;
-
- GGML_ASSERT(nx == n_img_size && ny == n_img_size);
-
- for (int k = 0; k < 3; k++) {
- for (int y = 0; y < ny; y++) {
- for (int x = 0; x < nx; x++) {
- data[k*n + y*nx + x] = img.data[3*(y*nx + x) + k];
- }
- }
- }
- }
+ ggml_set_name(inp, "inp");
+ ggml_set_input(inp);
// ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/image_encoder.py#L392
struct ggml_tensor * cur = ggml_conv_2d_sk_p0(ctx0, enc.proj_w, inp);
ggml_free(ctx0);
+ ggml_gallocr_alloc_graph(state.allocr, gf);
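+    // ggml_gallocr_alloc_graph() assigns memory to every tensor in the graph, growing
+    // the allocator's buffer when a graph needs more space than any previous one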
+
+ {
+ struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "inp");
+ float * data = (float *) ggml_get_data(inp);
+
+ const int nx = img.nx;
+ const int ny = img.ny;
+ const int n = nx*ny;
+
+ GGML_ASSERT(nx == n_img_size && ny == n_img_size);
+
+ for (int k = 0; k < 3; k++) {
+ for (int y = 0; y < ny; y++) {
+ for (int x = 0; x < nx; x++) {
+ data[k*n + y*nx + x] = img.data[3*(y*nx + x) + k];
+ }
+ }
+ }
+ }
+
return gf;
}
const sam_model & model,
struct ggml_context * ctx0,
struct ggml_cgraph * gf,
- sam_state & state,
- int nx,
- int ny,
- sam_point point) {
+ sam_state & state) {
const auto & hparams = model.hparams;
const auto & enc = model.enc_prompt;
- // transform points
- // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/automatic_mask_generator.py#L276
- {
- const int nmax = std::max(nx, ny);
-
- const float scale = hparams.n_img_size() / (float) nmax;
-
- const int nx_new = int(nx*scale + 0.5f);
- const int ny_new = int(ny*scale + 0.5f);
-
- point.x = point.x*(float(nx_new)/nx) + 0.5f;
- point.y = point.y*(float(ny_new)/ny) + 0.5f;
- }
-
struct ggml_tensor * inp = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2, 2);
+ ggml_set_name(inp, "prompt_input");
+ ggml_set_input(inp);
- ggml_allocr_alloc(state.allocr, inp);
- if (!ggml_allocr_is_measure(state.allocr)) {
- // set the input by converting the [0, 1] coordinates to [-1, 1]
- float * data = (float *) inp->data;
-
- data[0] = 2.0f*(point.x / hparams.n_img_size()) - 1.0f;
- data[1] = 2.0f*(point.y / hparams.n_img_size()) - 1.0f;
-
- // padding
- // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L81-L85
- data[2] = 2.0f*(0.0f) - 1.0f;
- data[3] = 2.0f*(0.0f) - 1.0f;
- }
struct ggml_tensor * cur = ggml_mul_mat(ctx0, ggml_cont(ctx0, ggml_transpose(ctx0, enc.pe)), inp);
{
// ConvTranspose2d
keys = ggml_conv_transpose_2d_p0(ctx0, dec.output_upscaling_0_w, keys, 2);
- ggml_allocr_alloc(state.allocr, keys); // TODO: This alloc shouldn't be needed
keys = ggml_add_inplace(ctx0, keys, ggml_repeat(ctx0,
ggml_reshape_3d(ctx0, dec.output_upscaling_0_b, 1, 1, dec.output_upscaling_0_b->ne[0]),
keys));
// ConvTranspose2d
keys = ggml_conv_transpose_2d_p0(ctx0, dec.output_upscaling_3_w, keys, 2);
- ggml_allocr_alloc(state.allocr, keys); // TODO: This alloc shouldn't be needed
keys = ggml_add_inplace(ctx0, ggml_repeat(ctx0,
ggml_reshape_3d(ctx0, dec.output_upscaling_3_b, 1, 1, dec.output_upscaling_3_b->ne[0]),
keys), keys);
struct ggml_context * ctx0 = ggml_init(ggml_params);
struct ggml_cgraph * gf = ggml_new_graph(ctx0);
- prompt_encoder_result enc_res = sam_encode_prompt(model, ctx0, gf, state, nx, ny, point);
+ prompt_encoder_result enc_res = sam_encode_prompt(model, ctx0, gf, state);
if (!enc_res.embd_prompt_sparse || !enc_res.embd_prompt_dense) {
fprintf(stderr, "%s: failed to encode prompt (%f, %f)\n", __func__, point.x, point.y);
return {};
ggml_free(ctx0);
+ ggml_gallocr_alloc_graph(state.allocr, gf);
+
+ // from sam_encode_prompt
+ {
+ // transform points
+ // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/automatic_mask_generator.py#L276
+ {
+ const int nmax = std::max(nx, ny);
+
+ const float scale = model.hparams.n_img_size() / (float) nmax;
+
+ const int nx_new = int(nx*scale + 0.5f);
+ const int ny_new = int(ny*scale + 0.5f);
+
+ point.x = point.x*(float(nx_new)/nx) + 0.5f;
+ point.y = point.y*(float(ny_new)/ny) + 0.5f;
+ }
+
+ struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "prompt_input");
+ // set the input by converting the [0, 1] coordinates to [-1, 1]
+ float * data = (float *) inp->data;
+
+ data[0] = 2.0f*(point.x / model.hparams.n_img_size()) - 1.0f;
+ data[1] = 2.0f*(point.y / model.hparams.n_img_size()) - 1.0f;
+
+ // padding
+ // ref: https://github.com/facebookresearch/segment-anything/blob/main/segment_anything/modeling/prompt_encoder.py#L81-L85
+ data[2] = 2.0f*(0.0f) - 1.0f;
+ data[3] = 2.0f*(0.0f) - 1.0f;
+ }
+
+ // from sam_fill_dense_pe
+ {
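+        // fill a [2, n_img_embd, n_img_embd] grid with the (x, y) coordinates of the
+        // embedding-cell centers, normalized to [-1, 1]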
+ struct ggml_tensor * xy_embed_stacked = ggml_graph_get_tensor(gf, "xy_embed_stacked");
+ const int32_t n_img_embd = model.hparams.n_img_embd();
+ const float n_img_embd_inv = 1.0f / n_img_embd;
+ float * data = (float *) ggml_get_data(xy_embed_stacked);
+ for (int i = 0; i < n_img_embd; ++i) {
+ const int row = 2*i*n_img_embd;
+ const float y_val = 2 * (i + 0.5f) * n_img_embd_inv - 1;
+ for (int j = 0; j < n_img_embd; ++j) {
+ const float x_val = 2 * (j + 0.5f) * n_img_embd_inv - 1;
+ data[row + 2*j + 0] = x_val;
+ data[row + 2*j + 1] = y_val;
+ }
+ }
+ }
+
return gf;
}
}
- static const size_t tensor_alignment = 32;
{
state.buf_compute_img_enc.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead());
- state.allocr = ggml_allocr_new_measure(tensor_alignment);
- struct ggml_cgraph * gf_measure = sam_encode_image(model, state, img1);
- if (!gf_measure) {
- fprintf(stderr, "%s: failed to encode image\n", __func__);
- return 1;
- }
-
- size_t alloc_size = ggml_allocr_alloc_graph(state.allocr, gf_measure) + tensor_alignment;
- ggml_allocr_free(state.allocr);
-
- // recreate allocator with exact memory requirements
- state.buf_alloc_img_enc.resize(alloc_size);
- state.allocr = ggml_allocr_new(state.buf_alloc_img_enc.data(), state.buf_alloc_img_enc.size(), tensor_alignment);
-
- // compute the graph with the measured exact memory requirements from above
- ggml_allocr_reset(state.allocr);
+ state.allocr = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
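+    // unlike the old measure/realloc scheme, ggml_gallocr sizes its buffer automatically
+    // on the first ggml_gallocr_alloc_graph() call, so no separate measure pass is needed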
struct ggml_cgraph * gf = sam_encode_image(model, state, img1);
if (!gf) {
return 1;
}
- ggml_allocr_alloc_graph(state.allocr, gf);
-
ggml_graph_compute_helper(state.work_buffer, gf, params.n_threads);
print_t_f32("embd_img", state.embd_img);
- ggml_allocr_free(state.allocr);
+ ggml_gallocr_free(state.allocr);
state.allocr = NULL;
state.work_buffer.clear();
}
{
state.buf_compute_fast.resize(ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead());
- state.allocr = ggml_allocr_new_measure(tensor_alignment);
+ state.allocr = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
// TODO: more varied prompts
fprintf(stderr, "prompt: (%f, %f)\n", params.pt.x, params.pt.y);
- // measure memory requirements for the graph
- struct ggml_cgraph * gf_measure = sam_build_fast_graph(model, state, img0.nx, img0.ny, params.pt);
- if (!gf_measure) {
- fprintf(stderr, "%s: failed to build fast graph to measure\n", __func__);
- return 1;
- }
-
- size_t alloc_size = ggml_allocr_alloc_graph(state.allocr, gf_measure) + tensor_alignment;
- ggml_allocr_free(state.allocr);
-
- // recreate allocator with exact memory requirements
- state.buf_alloc_fast.resize(alloc_size);
- state.allocr = ggml_allocr_new(state.buf_alloc_fast.data(), state.buf_alloc_fast.size(), tensor_alignment);
-
- // compute the graph with the measured exact memory requirements from above
- ggml_allocr_reset(state.allocr);
-
struct ggml_cgraph * gf = sam_build_fast_graph(model, state, img0.nx, img0.ny, params.pt);
if (!gf) {
fprintf(stderr, "%s: failed to build fast graph\n", __func__);
return 1;
}
- ggml_allocr_alloc_graph(state.allocr, gf);
-
ggml_graph_compute_helper(state.work_buffer, gf, params.n_threads);
//print_t_f32("iou_predictions", state.iou_predictions);
//print_t_f32("low_res_masks", state.low_res_masks);
- ggml_allocr_free(state.allocr);
+ ggml_gallocr_free(state.allocr);
state.allocr = NULL;
}
-// ggml_allocr wrapper for whisper usage
+// ggml_gallocr wrapper for whisper usage
struct whisper_allocr {
- ggml_allocr * alloc = nullptr;
+ ggml_gallocr_t alloc = nullptr;
std::vector<uint8_t> meta;
-
- ggml_backend_buffer_t buffer;
};
static size_t whisper_allocr_size(struct whisper_allocr & allocr) {
- return allocr.meta.size() + ggml_allocr_max_size(allocr.alloc);
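+    // the allocator was created with a single buffer type, so index 0 is its only buffer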
+ return allocr.meta.size() + ggml_gallocr_get_buffer_size(allocr.alloc, 0);
}
-// measure the memory usage of a graph and prepare the allocr's internal data buffer
+// allocate the compute buffer for a graph and prepare the allocr's internal meta buffer
-static void whisper_allocr_graph_init(struct whisper_allocr & allocr, ggml_backend_t backend, std::function<struct ggml_cgraph *()> && get_graph) {
+static bool whisper_allocr_graph_init(struct whisper_allocr & allocr, ggml_backend_t backend, std::function<struct ggml_cgraph *()> && get_graph) {
auto & alloc = allocr.alloc;
auto & meta = allocr.meta;
- alloc = ggml_allocr_new_measure_from_backend(backend);
+ alloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
meta.resize(ggml_tensor_overhead()*WHISPER_MAX_NODES + ggml_graph_overhead());
- ggml_allocr_alloc_graph(alloc, get_graph());
-}
-
-static void whisper_allocr_graph_realloc(struct whisper_allocr & allocr, ggml_backend_t backend) {
- if (allocr.alloc == nullptr) {
- // this can be null if we use external encoder like CoreML or OpenVINO
- return;
- }
-
- auto & alloc = allocr.alloc;
- auto & buffer = allocr.buffer;
-
- size_t size = ggml_allocr_max_size(alloc);
-
- ggml_allocr_free(alloc);
-
- buffer = ggml_backend_alloc_buffer(backend, size);
- alloc = ggml_allocr_new_from_buffer(buffer);
-}
-
-static void whisper_allocr_free(struct whisper_allocr & allocr) {
- if (allocr.alloc) {
- ggml_allocr_free(allocr.alloc);
- ggml_backend_buffer_free(allocr.buffer);
- allocr.alloc = nullptr;
+ // since there are dependencies between the different graphs,
+ // we need to allocate them instead of only reserving to get the correct compute buffer size
+ if (!ggml_gallocr_alloc_graph(alloc, get_graph())) {
+ // failed to allocate the compute buffer
+ WHISPER_LOG_ERROR("%s: failed to allocate the compute buffer\n", __func__);
+ return false;
}
+ return true;
}
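+
+// an illustrative call pattern for the helper above (sketch; `backend` and `build_graph`
+// are assumed to exist in the caller):
+//
+//   whisper_allocr allocr;
+//   if (!whisper_allocr_graph_init(allocr, backend, [&]() { return build_graph(); })) {
+//       return false;
+//   }
+//   WHISPER_LOG_INFO("%s: compute buffer = %7.2f MB\n", __func__, whisper_allocr_size(allocr) / 1e6);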
// medium
struct ggml_tensor * k;
struct ggml_tensor * v;
- struct ggml_context * ctx;
+ struct ggml_context * ctx = nullptr;
- ggml_backend_buffer_t buffer;
+ ggml_backend_buffer_t buffer = nullptr;
};
struct whisper_model {
std::vector<whisper_layer_decoder> layers_decoder;
// ggml context that contains all the meta information about the model tensors
- struct ggml_context * ctx;
+ struct ggml_context * ctx = nullptr;
// the model backend data is read-only and can be shared between processors
- std::vector<struct ggml_backend_buffer *> buffers;
+ ggml_backend_buffer_t buffer = nullptr;
// tensors
int n_loaded;
cache.ctx = ggml_init(params);
if (!cache.ctx) {
- WHISPER_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
+ WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache context\n", __func__);
return false;
}
cache.k = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
cache.v = ggml_new_tensor_1d(cache.ctx, wtype, n_elements);
- const size_t mem_bytes = ggml_nbytes(cache.k) + ggml_nbytes(cache.v);
-
- cache.buffer = ggml_backend_alloc_buffer(backend, mem_bytes);
-
- // allocate the tensors into the backend buffer
- {
- ggml_allocr * alloc = ggml_allocr_new_from_buffer(cache.buffer);
-
- ggml_allocr_alloc(alloc, cache.k);
- ggml_allocr_alloc(alloc, cache.v);
-
- ggml_allocr_free(alloc);
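+    // ggml_backend_alloc_ctx_tensors() creates a single backend buffer large enough for
+    // all tensors in the context (here cache.k and cache.v) and allocates them into it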
+ cache.buffer = ggml_backend_alloc_ctx_tensors(cache.ctx, backend);
+ if (!cache.buffer) {
+ WHISPER_LOG_ERROR("%s: failed to allocate memory for the kv cache\n", __func__);
+ return false;
}
return true;
}
static void kv_cache_free(struct whisper_kv_cache & cache) {
- if (cache.ctx) {
- ggml_free(cache.ctx);
- ggml_backend_buffer_free(cache.buffer);
- cache.ctx = nullptr;
- }
+ ggml_free(cache.ctx);
+ ggml_backend_buffer_free(cache.buffer);
+ cache.ctx = nullptr;
}
static bool whisper_kv_cache_find_slot(
}
wctx.backend = whisper_backend_init(wctx.params);
-
- // some devices have a limit on the maximum size of single memory buffer
- // for example, iPhones are limited to 1GB per buffer
- // to workaround this, we will allocate multiple buffers of smaller size and will split the tensors with the
- // model weights between them
- //
- // the map_t2b maps tensor names to buffer indices
- // as we iterate over the tensors, we will allocate new buffers when the current one is full
- //
- // finally, we create a separate allocator for each buffer and use it to allocate the tensors
- // we keep the allocators alive until all the tensors are loaded
-
- GGML_ASSERT(model.buffers.empty());
-
- std::map<std::string, int> map_t2b;
-
- {
- size_t size_main = 0;
- size_t size_cur = 0;
-
- static const size_t GB = 1024ull*1024ull*1024ull;
-
- for (const auto & t : model.tensors) {
- const size_t cur = ggml_nbytes(t.second) + ggml_tensor_overhead();
-
- // adding the tensor to the current buffer will exceed the limit, so we need to allocate a new buffer
- if (size_cur + cur > GB) {
- GGML_ASSERT(size_cur > 0 && "A tensor is too large to fit in a single buffer");
-
- model.buffers.emplace_back(ggml_backend_alloc_buffer(wctx.backend, size_cur));
-
- size_cur = cur;
- }
-
- map_t2b[t.first] = model.buffers.size();
-
- size_cur += cur;
- size_main += cur;
- }
-
- // allocate the last buffer if needed
- if (size_cur > 0) {
- model.buffers.emplace_back(ggml_backend_alloc_buffer(wctx.backend, size_cur));
- }
-
- GGML_ASSERT(model.buffers.size() > 0);
-
- WHISPER_LOG_INFO("%s: %8s total size = %8.2f MB (%d buffers)\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6, (int) model.buffers.size());
- }
-
- std::vector<ggml_allocr *> allocs(model.buffers.size());
- for (size_t i = 0; i < allocs.size(); ++i) {
- allocs[i] = ggml_allocr_new_from_buffer(model.buffers[i]);
+ if (!wctx.backend) {
+ WHISPER_LOG_ERROR("%s: failed to initialize the backend\n", __func__);
+ return false;
}
// allocate tensors in the backend buffers
- {
- for (const auto & t : model.tensors) {
- ggml_allocr_alloc(allocs[map_t2b[t.first]], t.second);
- }
+ model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, wctx.backend);
+ if (!model.buffer) {
+ WHISPER_LOG_ERROR("%s: failed to allocate memory for the model\n", __func__);
+ return false;
}
+ size_t size_main = ggml_backend_buffer_get_size(model.buffer);
+ WHISPER_LOG_INFO("%s: %8s total size = %8.2f MB\n", __func__, ggml_backend_name(wctx.backend), size_main / 1e6);
+
// load weights
{
size_t total_size = 0;
return false;
}
- ggml_backend_t backend = wctx.backend;
+ //ggml_backend_t backend = wctx.backend;
//printf("%s: [%5.5s] %s\n", __func__, ggml_backend_name(backend), name.c_str());
- if ((ggml_backend_is_cpu(backend)
-#ifdef GGML_USE_METAL
- || ggml_backend_is_metal(backend)
-#endif
- )) {
+ if (ggml_backend_buffer_is_host(model.buffer)) {
            // the buffer is in host memory, so we can read directly into the tensor
loader->read(loader->context, tensor->data, ggml_nbytes(tensor));
BYTESWAP_TENSOR(tensor);
}
}
- for (auto & alloc : allocs) {
- ggml_allocr_free(alloc);
- }
-
wctx.t_load_us = ggml_time_us() - t_start_us;
return true;
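
The host-buffer branch above is the general dispatch for weight loading: if the buffer is addressable from the CPU, the loader reads straight into tensor->data; otherwise the data is staged through a temporary host buffer and uploaded. A hedged sketch, with read_data standing in for the loader callback:

if (ggml_backend_buffer_is_host(model.buffer)) {
    // addressable memory: read directly into the tensor
    read_data(tensor->data, ggml_nbytes(tensor));
} else {
    // e.g. a GPU buffer: stage on the host, then upload
    std::vector<char> tmp(ggml_nbytes(tensor));
    read_data(tmp.data(), tmp.size());
    ggml_backend_tensor_set(tensor, tmp.data(), 0, tmp.size());
}
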
whisper_state & wstate,
const int mel_offset) {
const auto & model = wctx.model;
- const auto & mel_inp = wstate.mel;
const auto & hparams = model.hparams;
const int n_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
ggml_cgraph * gf = ggml_new_graph(ctx0);
- ggml_allocr * alloc = wstate.alloc_conv.alloc;
-
struct ggml_tensor * mel = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, 2*n_ctx, n_mels);
- ggml_allocr_alloc(alloc, mel);
-
- assert(mel->type == GGML_TYPE_F32);
- if (!ggml_allocr_is_measure(alloc)) {
- assert(mel_inp.n_mel == n_mels);
-
- wstate.inp_mel.resize(ggml_nelements(mel));
-
- float * dst = wstate.inp_mel.data();
- memset(dst, 0, ggml_nbytes(mel));
-
- const int i0 = std::min(mel_offset, mel_inp.n_len);
- const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);
-
- for (int j = 0; j < mel_inp.n_mel; ++j) {
- for (int i = i0; i < i1; ++i) {
- dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i];
- }
- }
-
- ggml_backend_tensor_set(mel, wstate.inp_mel.data(), 0, ggml_nelements(mel)*sizeof(float));
- }
+ ggml_set_name(mel, "mel");
+ ggml_set_input(mel);
struct ggml_tensor * cur = nullptr;
{
auto & alloc = wstate.alloc_conv.alloc;
- ggml_allocr_reset(alloc);
-
ggml_cgraph * gf = whisper_build_graph_conv(wctx, wstate, mel_offset);
- ggml_allocr_alloc_graph(alloc, gf);
+ if (!ggml_gallocr_alloc_graph(alloc, gf)) {
+ // should never happen as we pre-allocate the memory
+ return false;
+ }
+
+ // set the input
+ {
+ const auto & mel_inp = wstate.mel;
+ const int n_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : wctx.model.hparams.n_audio_ctx;
+
+ struct ggml_tensor * mel = ggml_graph_get_tensor(gf, "mel");
+
+ assert(mel->type == GGML_TYPE_F32);
+ assert(mel_inp.n_mel == wctx.model.hparams.n_mels);
+
+ wstate.inp_mel.resize(ggml_nelements(mel));
+
+ float * dst = wstate.inp_mel.data();
+ memset(dst, 0, ggml_nbytes(mel));
+
+ const int i0 = std::min(mel_offset, mel_inp.n_len);
+ const int i1 = std::min(mel_offset + 2*n_ctx, mel_inp.n_len);
+
+ for (int j = 0; j < mel_inp.n_mel; ++j) {
+ for (int i = i0; i < i1; ++i) {
+ dst[j*2*n_ctx + (i - i0)] = mel_inp.data[j*mel_inp.n_len + i];
+ }
+ }
+
+ ggml_backend_tensor_set(mel, wstate.inp_mel.data(), 0, ggml_nelements(mel)*sizeof(float));
+ }
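
This two-phase flow is the replacement for writing inputs during graph construction: tensors are only named and flagged while the graph is being built, and the data is uploaded after allocation by looking the tensor up by name. Condensed to its essentials (host_data is a placeholder):

// build time: no data pointer exists yet
struct ggml_tensor * inp = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, n);
ggml_set_name (inp, "inp");
ggml_set_input(inp); // allocated first, at a non-overlapping address

// after ggml_gallocr_alloc_graph(alloc, gf) has succeeded
struct ggml_tensor * t = ggml_graph_get_tensor(gf, "inp");
ggml_backend_tensor_set(t, host_data, 0, ggml_nbytes(t));
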
if (!whisper_encode_external(wstate)) {
if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
if (!whisper_encode_external(wstate)) {
auto & alloc = wstate.alloc_encode.alloc;
- ggml_allocr_reset(alloc);
-
ggml_cgraph * gf = whisper_build_graph_encoder(wctx, wstate);
- ggml_allocr_alloc_graph(alloc, gf);
+ if (!ggml_gallocr_alloc_graph(alloc, gf)) {
+ // should never happen as we pre-allocate the memory
+ return false;
+ }
if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
return false;
{
auto & alloc = wstate.alloc_cross.alloc;
- ggml_allocr_reset(alloc);
-
ggml_cgraph * gf = whisper_build_graph_cross(wctx, wstate);
- ggml_allocr_alloc_graph(alloc, gf);
+ if (!ggml_gallocr_alloc_graph(alloc, gf)) {
+ // should never happen as we pre-allocate the memory
+ return false;
+ }
if (!ggml_graph_compute_helper(wstate.backend, gf, n_threads)) {
return false;
static struct ggml_cgraph * whisper_build_graph_decoder(
whisper_context & wctx,
whisper_state & wstate,
- const whisper_batch & batch) {
+ const whisper_batch & batch,
+ bool worst_case) {
const auto & model = wctx.model;
const auto & hparams = model.hparams;
WHISPER_ASSERT(!!kv_self.ctx);
- ggml_allocr * alloc = wstate.alloc_decode.alloc;
-
const int n_ctx = kv_self.size;
const int n_state = hparams.n_text_state;
const int n_head = hparams.n_text_head;
const int n_tokens = batch.n_tokens;
const int n_audio_ctx = wstate.exp_n_audio_ctx > 0 ? wstate.exp_n_audio_ctx : hparams.n_audio_ctx;
- const int32_t n_kv = ggml_allocr_is_measure(alloc) ? n_ctx : kv_self.n;
- const int32_t kv_head = ggml_allocr_is_measure(alloc) ? n_ctx - n_tokens : kv_self.head;
+ const int32_t n_kv = worst_case ? n_ctx : kv_self.n;
+ const int32_t kv_head = worst_case ? n_ctx - n_tokens : kv_self.head;
//WHISPER_LOG_DEBUG("%s: n_past = %d, n_tokens = %d, n_audio_ctx = %d, n_ctx = %d\n", __func__, n_past, n_tokens, n_audio_ctx, n_ctx);
ggml_cgraph * gf = ggml_new_graph_custom(ctx0, WHISPER_MAX_NODES, false);
struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
- ggml_allocr_alloc(alloc, embd);
-
- if (!ggml_allocr_is_measure(alloc)) {
- ggml_backend_tensor_set(embd, batch.token, 0, n_tokens*ggml_element_size(embd));
- }
+ ggml_set_name(embd, "embd");
+ ggml_set_input(embd);
struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
- ggml_allocr_alloc(alloc, position);
-
- if (!ggml_allocr_is_measure(alloc)) {
- for (int i = 0; i < n_tokens; ++i) {
- const int32_t val = batch.pos[i];
- ggml_backend_tensor_set(position, &val, i*sizeof(int32_t), sizeof(int32_t));
- }
- }
+ ggml_set_name(position, "position");
+ ggml_set_input(position);
const float KQscale = pow(float(n_state)/n_head, -0.25);
struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
- ggml_allocr_alloc(alloc, KQ_mask);
-
- if (!ggml_allocr_is_measure(alloc)) {
- wstate.inp_mask.resize(n_kv*n_tokens);
-
- float * data = wstate.inp_mask.data();
- memset(data, 0, ggml_nbytes(KQ_mask));
-
- for (int h = 0; h < 1; ++h) {
- for (int j = 0; j < n_tokens; ++j) {
- const whisper_pos pos = batch.pos[j];
- const whisper_seq_id seq_id = batch.seq_id[j][0];
-
- for (int i = 0; i < n_kv; ++i) {
- if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
- data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
- }
- }
- }
- }
-
- ggml_backend_tensor_set(KQ_mask, wstate.inp_mask.data(), 0, ggml_nelements(KQ_mask)*sizeof(float));
- }
+ ggml_set_name(KQ_mask, "KQ_mask");
+ ggml_set_input(KQ_mask);
// token encoding + position encoding
struct ggml_tensor * cur =
{
auto & alloc = wstate.alloc_decode.alloc;
- ggml_allocr_reset(alloc);
+ ggml_cgraph * gf = whisper_build_graph_decoder(wctx, wstate, batch, false);
- ggml_cgraph * gf = whisper_build_graph_decoder(wctx, wstate, batch);
+ if (!ggml_gallocr_alloc_graph(alloc, gf)) {
+ // should never happen as we pre-allocate the memory
+ return false;
+ }
- ggml_allocr_alloc_graph(alloc, gf);
+ // set the inputs
+ {
+ struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");
+ ggml_backend_tensor_set(embd, batch.token, 0, n_tokens*ggml_element_size(embd));
+ }
+
+ {
+ struct ggml_tensor * position = ggml_graph_get_tensor(gf, "position");
+ for (int i = 0; i < n_tokens; ++i) {
+ const int32_t val = batch.pos[i];
+ ggml_backend_tensor_set(position, &val, i*sizeof(int32_t), sizeof(int32_t));
+ }
+ }
+
+ {
+ struct ggml_tensor * KQ_mask = ggml_graph_get_tensor(gf, "KQ_mask");
+
+ auto & kv_self = wstate.kv_self;
+ const int32_t n_kv = kv_self.n;
+
+ wstate.inp_mask.resize(n_kv*n_tokens);
+
+ float * data = wstate.inp_mask.data();
+ memset(data, 0, ggml_nbytes(KQ_mask));
+
+ for (int h = 0; h < 1; ++h) {
+ for (int j = 0; j < n_tokens; ++j) {
+ const whisper_pos pos = batch.pos[j];
+ const whisper_seq_id seq_id = batch.seq_id[j][0];
+
+ for (int i = 0; i < n_kv; ++i) {
+ if (!kv_self.cells[i].has_seq_id(seq_id) || kv_self.cells[i].pos > pos) {
+ data[h*(n_kv*n_tokens) + j*n_kv + i] = -INFINITY;
+ }
+ }
+ }
+ }
+
+ ggml_backend_tensor_set(KQ_mask, wstate.inp_mask.data(), 0, ggml_nelements(KQ_mask)*sizeof(float));
+ }
logits = gf->nodes[gf->n_nodes - 1];
whisper_state * state = new whisper_state;
state->backend = whisper_backend_init(ctx->params);
+ if (!state->backend) {
+ WHISPER_LOG_ERROR("%s: whisper_backend_init() failed\n", __func__);
+ whisper_free_state(state);
+ return nullptr;
+ }
// at this point, we don't know yet how many decoders will be used, so we overallocate 3x ctx
 // in theory this may not be enough, but in practice it always is
if (!kv_cache_init(ctx->model.hparams, state->kv_self, ctx->backend, ctx->itype, factor*ctx->model.hparams.n_text_ctx)) {
WHISPER_LOG_ERROR("%s: kv_cache_init() failed for self-attention cache\n", __func__);
- delete state;
+ whisper_free_state(state);
return nullptr;
}
if (!kv_cache_init(ctx->model.hparams, state->kv_cross, ctx->backend, ctx->itype, ctx->model.hparams.n_audio_ctx)) {
WHISPER_LOG_ERROR("%s: kv_cache_init() failed for cross-attention cache\n", __func__);
- delete state;
+ whisper_free_state(state);
return nullptr;
}
if (!state->ctx_coreml) {
WHISPER_LOG_ERROR("%s: failed to load Core ML model from '%s'\n", __func__, path_coreml.c_str());
#ifndef WHISPER_COREML_ALLOW_FALLBACK
- delete state;
+ whisper_free_state(state);
return nullptr;
#endif
} else {
// conv allocator
{
- whisper_allocr_graph_init(state->alloc_conv, ctx->backend,
+ bool ok = whisper_allocr_graph_init(state->alloc_conv, ctx->backend,
[&]() {
return whisper_build_graph_conv(*ctx, *state, 0);
});
+ if (!ok) {
+ WHISPER_LOG_ERROR("%s: failed to init conv allocator\n", __func__);
+ whisper_free_state(state);
+ return nullptr;
+ }
+
WHISPER_LOG_INFO("%s: compute buffer (conv) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_conv) / 1e6);
}
// encoder allocator
if (!whisper_encode_external(*state)) {
- whisper_allocr_graph_init(state->alloc_encode, ctx->backend,
+ bool ok = whisper_allocr_graph_init(state->alloc_encode, ctx->backend,
[&]() {
return whisper_build_graph_encoder(*ctx, *state);
});
+ if (!ok) {
+ WHISPER_LOG_ERROR("%s: failed to init encoder allocator\n", __func__);
+ whisper_free_state(state);
+ return nullptr;
+ }
+
WHISPER_LOG_INFO("%s: compute buffer (encode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_encode) / 1e6);
}
// cross allocator
{
- whisper_allocr_graph_init(state->alloc_cross, ctx->backend,
+ bool ok = whisper_allocr_graph_init(state->alloc_cross, ctx->backend,
[&]() {
return whisper_build_graph_cross(*ctx, *state);
});
+ if (!ok) {
+ WHISPER_LOG_ERROR("%s: failed to init cross allocator\n", __func__);
+ whisper_free_state(state);
+ return nullptr;
+ }
+
WHISPER_LOG_INFO("%s: compute buffer (cross) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_cross) / 1e6);
}
// decoder allocator
{
- whisper_allocr_graph_init(state->alloc_decode, ctx->backend,
+ bool ok = whisper_allocr_graph_init(state->alloc_decode, ctx->backend,
[&]() {
const auto & hparams = ctx->model.hparams;
whisper_batch_prep_legacy(state->batch, nullptr, n_tokens, n_past, 0);
- return whisper_build_graph_decoder(*ctx, *state, state->batch);
+ return whisper_build_graph_decoder(*ctx, *state, state->batch, true);
});
+ if (!ok) {
+ WHISPER_LOG_ERROR("%s: failed to init decoder allocator\n", __func__);
+ whisper_free_state(state);
+ return nullptr;
+ }
+
WHISPER_LOG_INFO("%s: compute buffer (decode) = %7.2f MB\n", __func__, whisper_allocr_size(state->alloc_decode) / 1e6);
}
- whisper_allocr_graph_realloc(state->alloc_conv, ctx->backend);
- whisper_allocr_graph_realloc(state->alloc_encode, ctx->backend);
- whisper_allocr_graph_realloc(state->alloc_cross, ctx->backend);
- whisper_allocr_graph_realloc(state->alloc_decode, ctx->backend);
-
return state;
}
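
The decoder illustrates the worst_case convention introduced above: at init time the graph is built once with worst_case = true, so n_kv and kv_head take their maximal values and the allocator sizes the compute buffer for the largest possible batch; every later decode rebuilds the graph with the real batch and fits within that buffer. Schematically:

// init: size the compute buffer against the worst case
whisper_batch_prep_legacy(state->batch, nullptr, n_tokens, n_past, 0);
ggml_gallocr_alloc_graph(state->alloc_decode.alloc,
        whisper_build_graph_decoder(*ctx, *state, state->batch, /*worst_case=*/true));

// per decode: the real graph reuses the already-sized buffer
ggml_cgraph * gf = whisper_build_graph_decoder(wctx, wstate, batch, /*worst_case=*/false);
if (!ggml_gallocr_alloc_graph(wstate.alloc_decode.alloc, gf)) {
    return false; // should never happen after the worst-case pass
}
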
return whisper_init_with_params_no_state(loader, whisper_context_default_params());
}
-void whisper_free_state(struct whisper_state * state)
-{
+void whisper_free_state(struct whisper_state * state) {
if (state) {
kv_cache_free(state->kv_self);
kv_cache_free(state->kv_cross);
whisper_batch_free(state->batch);
- whisper_allocr_free(state->alloc_conv);
- whisper_allocr_free(state->alloc_encode);
- whisper_allocr_free(state->alloc_cross);
- whisper_allocr_free(state->alloc_decode);
+ ggml_gallocr_free(state->alloc_conv.alloc);
+ ggml_gallocr_free(state->alloc_encode.alloc);
+ ggml_gallocr_free(state->alloc_cross.alloc);
+ ggml_gallocr_free(state->alloc_decode.alloc);
ggml_backend_free(state->backend);
void whisper_free(struct whisper_context * ctx) {
if (ctx) {
- if (ctx->model.ctx) {
- ggml_free(ctx->model.ctx);
- }
+ ggml_free(ctx->model.ctx);
- for (auto & buffer : ctx->model.buffers) {
- if (buffer) {
- ggml_backend_buffer_free(buffer);
- }
- }
+ ggml_backend_buffer_free(ctx->model.buffer);
whisper_free_state(ctx->state);
extern "C" {
#endif
-struct ggml_backend;
-struct ggml_backend_buffer;
-struct ggml_backend_buffer_type;
+typedef struct ggml_backend_buffer_type * ggml_backend_buffer_type_t;
+typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+typedef struct ggml_backend * ggml_backend_t;
-//
-// Legacy API
-//
-
-typedef struct ggml_allocr * ggml_allocr_t;
-
-// initialize allocator for use with CPU backend only
-GGML_API ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment);
-GGML_API ggml_allocr_t ggml_allocr_new_measure(size_t alignment);
-
-// initialize allocator for use with ggml-backend
-GGML_API ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
-GGML_API ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
-GGML_API ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend);
-
-GGML_API struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc);
-
-// tell the allocator to parse nodes following the order described in the list
-// you should call this if your graph are optimized to execute out-of-order
-GGML_API void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n);
-
-GGML_API void ggml_allocr_free (ggml_allocr_t alloc);
-GGML_API bool ggml_allocr_is_measure (ggml_allocr_t alloc);
-GGML_API void ggml_allocr_reset (ggml_allocr_t alloc);
-GGML_API void ggml_allocr_alloc (ggml_allocr_t alloc, struct ggml_tensor * tensor);
-GGML_API size_t ggml_allocr_max_size (ggml_allocr_t alloc);
-
-GGML_API size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph);
+// Tensor allocator
+typedef struct ggml_tallocr * ggml_tallocr_t;
-//
-// ggml-backend v2 API
-//
+GGML_API ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer);
+GGML_API void ggml_tallocr_free(ggml_tallocr_t talloc);
+GGML_API void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor);
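
A minimal sketch of the remaining tensor-allocator use case, placing tensors linearly into a buffer the caller already owns (buft, buf_size, and the tensors are assumptions):

ggml_backend_buffer_t buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
ggml_tallocr_t talloc = ggml_tallocr_new(buf);

ggml_tallocr_alloc(talloc, tensor_a); // placed at the start of the buffer
ggml_tallocr_alloc(talloc, tensor_b); // placed right after tensor_a, aligned

ggml_tallocr_free(talloc); // frees the allocator only, not the buffer
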
-// Separate tensor and graph allocator objects
-// This is necessary for multi-backend allocation because the graph allocator needs to use multiple tensor allocators
-// The original API is kept as a wrapper around the new API
+// Graph allocator
+/*
+ Example usage:
+        ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
-// Tensor allocator
-typedef struct ggml_tallocr * ggml_tallocr_t;
+ // optional: create a worst-case graph and reserve the buffers to avoid reallocations
+ ggml_gallocr_reserve(galloc, build_graph(max_batch));
-GGML_API ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment);
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment);
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size);
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size); // allocates an owned buffer
-GGML_API ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer);
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft);
-GGML_API ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend);
+ // allocate the graph
+ struct ggml_cgraph * graph = build_graph(batch);
+ ggml_gallocr_alloc_graph(galloc, graph);
-GGML_API struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t talloc);
+ printf("compute buffer size: %zu bytes\n", ggml_gallocr_get_buffer_size(galloc, 0));
-GGML_API void ggml_tallocr_free (ggml_tallocr_t talloc);
-GGML_API bool ggml_tallocr_is_measure (ggml_tallocr_t talloc);
-GGML_API void ggml_tallocr_reset (ggml_tallocr_t talloc);
-GGML_API void ggml_tallocr_alloc (ggml_tallocr_t talloc, struct ggml_tensor * tensor);
-GGML_API size_t ggml_tallocr_max_size (ggml_tallocr_t talloc);
+ // evaluate the graph
+ ggml_backend_graph_compute(backend, graph);
+*/
+// special tensor flags for use with the graph allocator:
+// ggml_set_input(): all input tensors are allocated at the beginning of the graph in non-overlapping addresses
+// ggml_set_output(): output tensors are never freed and never overwritten
-// Graph allocator
typedef struct ggml_gallocr * ggml_gallocr_t;
-GGML_API ggml_gallocr_t ggml_gallocr_new(void);
-GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
+GGML_API ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft);
+GGML_API ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs);
+GGML_API void ggml_gallocr_free(ggml_gallocr_t galloc);
-GGML_API void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n);
-GGML_API size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph);
+// pre-allocate buffers from a measure graph - does not allocate or modify the graph
+// call with a worst-case graph to avoid buffer reallocations
+// not strictly required for single buffer usage: ggml_gallocr_alloc_graph will reallocate the buffers automatically if needed
+// returns false if the buffer allocation failed
+GGML_API bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids);
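
When several buffer types are in play, each node is pinned to one of them through node_buffer_ids, which indexes the bufts array passed to ggml_gallocr_new_n. A hedged sketch with two buffer types (how gpu_buft and cpu_buft are obtained is outside this diff):

ggml_backend_buffer_type_t bufts[2] = { gpu_buft, cpu_buft };
ggml_gallocr_t galloc = ggml_gallocr_new_n(bufts, 2);

// node_buffer_ids[i] selects the buffer for graph->nodes[i]
std::vector<int> node_buffer_ids(graph->n_nodes, 0); // default: buffer 0 (GPU)
node_buffer_ids[graph->n_nodes - 1] = 1;             // e.g. keep the last node on the CPU

if (!ggml_gallocr_reserve_n(galloc, graph, node_buffer_ids.data())) {
    // buffer allocation failed
}
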
-// Allocate tensors from the allocators given by the hash table
-GGML_API void ggml_gallocr_alloc_graph_n(
- ggml_gallocr_t galloc,
- struct ggml_cgraph * graph,
- struct ggml_hash_set hash_set,
- ggml_tallocr_t * hash_node_talloc);
+// automatic reallocation if the topology changes when using a single buffer
+// returns false if using multiple buffers and a re-allocation is needed (call ggml_gallocr_reserve_n first to set the node buffers)
+GGML_API bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph);
+GGML_API size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id);
// Utils
// Create a buffer and allocate all the tensors in a ggml_context
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, struct ggml_backend_buffer_type * buft);
-GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, struct ggml_backend * backend);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft);
+GGML_API struct ggml_backend_buffer * ggml_backend_alloc_ctx_tensors(struct ggml_context * ctx, ggml_backend_t backend);
#ifdef __cplusplus
}
// in build_graph:
build_graph(...) {
- // allocating tensors in a specific backend (optional, recommended: pre-allocate inputs in a different buffer)
- alloc_cpu = ggml_backend_sched_get_allocr(sched, backend_cpu);
- ggml_allocr_alloc(alloc_cpu, tensor);
-
- // manually assigning nodes to a backend (optional, shouldn't be needed in most cases)
+ // manually assign nodes to a backend (optional, should not be needed in most cases)
struct ggml_tensor * node = ggml_mul_mat(ctx, ...);
ggml_backend_sched_set_node_backend(sched, node, backend_gpu);
}
GGML_API ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size);
GGML_API void ggml_backend_sched_free(ggml_backend_sched_t sched);
// Initialize backend buffers from a measure graph
- GGML_API void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
+ GGML_API bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph);
// Get the number of splits of the last graph
GGML_API int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched);
- GGML_API ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend);
- GGML_API ggml_backend_buffer_t ggml_backend_sched_get_buffer (ggml_backend_sched_t sched, ggml_backend_t backend);
+ GGML_API size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend);
GGML_API void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend);
GGML_API ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node);
// Allocate and compute graph on the backend scheduler
- GGML_API void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
+ GGML_API bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph);
- // Reset all assignments and allocators - must be called before using the sched allocators to allocate inputs
+ // Reset all assignments and allocators - must be called before changing the node backends
GGML_API void ggml_backend_sched_reset(ggml_backend_sched_t sched);
// Set a callback to be called for each resulting node during graph compute
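
Taken together, the updated scheduler API follows the same reserve-then-compute shape as the graph allocator. A condensed sketch (the backends and build_graph are assumed to exist; passing NULL for bufts is assumed to select each backend's default buffer type):

ggml_backend_t backends[2] = { backend_gpu, backend_cpu };
ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, GGML_DEFAULT_GRAPH_SIZE);

// once, at init: reserve compute buffers from a worst-case graph
if (!ggml_backend_sched_reserve(sched, build_graph(max_batch))) {
    // reservation failed
}

// per step: reset assignments, then allocate and compute the real graph
ggml_backend_sched_reset(sched);
if (!ggml_backend_sched_graph_compute(sched, build_graph(batch))) {
    // compute failed
}
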
enum ggml_log_level {
GGML_LOG_LEVEL_ERROR = 2,
- GGML_LOG_LEVEL_WARN = 3,
- GGML_LOG_LEVEL_INFO = 4,
+ GGML_LOG_LEVEL_WARN = 3,
+ GGML_LOG_LEVEL_INFO = 4,
GGML_LOG_LEVEL_DEBUG = 5
};
+ enum ggml_tensor_flag {
+ GGML_TENSOR_FLAG_INPUT = 1,
+ GGML_TENSOR_FLAG_OUTPUT = 2,
+ GGML_TENSOR_FLAG_PARAM = 4,
+ };
+
// ggml object
struct ggml_object {
size_t offs;
// op params - allocated as int32_t for alignment
int32_t op_params[GGML_MAX_OP_PARAMS / sizeof(int32_t)];
- bool is_param;
+ int32_t flags;
struct ggml_tensor * grad;
struct ggml_tensor * src[GGML_MAX_SRC];
ggml_opt_callback callback,
void * callback_data);
+ //
+ // tensor flags
+ //
+ GGML_API void ggml_set_input(struct ggml_tensor * tensor);
+ GGML_API void ggml_set_output(struct ggml_tensor * tensor);
+
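
A short sketch of how the flags are meant to be used when building a graph (the tensor names are illustrative):

struct ggml_tensor * embd = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
ggml_set_input(embd);    // allocated first, at a non-overlapping address

struct ggml_tensor * logits = ggml_mul_mat(ctx, model.lm_head, cur);
ggml_set_output(logits); // never freed, never overwritten by other nodes
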
//
// quantization
//
//#define AT_PRINTF(...) fprintf(stderr, __VA_ARGS__)
#define AT_PRINTF(...)
+
+static bool ggml_is_view(const struct ggml_tensor * t) {
+ return t->view_src != NULL;
+}
+
+static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
+ if (a->type != b->type) {
+ return false;
+ }
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ if (a->ne[i] != b->ne[i]) {
+ return false;
+ }
+ if (a->nb[i] != b->nb[i]) {
+ return false;
+ }
+ }
+ return true;
+}
+
+static bool ggml_op_can_inplace(enum ggml_op op) {
+ switch (op) {
+ case GGML_OP_SCALE:
+ case GGML_OP_DIAG_MASK_ZERO:
+ case GGML_OP_DIAG_MASK_INF:
+ case GGML_OP_ADD:
+ case GGML_OP_ADD1:
+ case GGML_OP_SUB:
+ case GGML_OP_MUL:
+ case GGML_OP_DIV:
+ case GGML_OP_SQR:
+ case GGML_OP_SQRT:
+ case GGML_OP_LOG:
+ case GGML_OP_UNARY:
+ case GGML_OP_ROPE:
+ case GGML_OP_RMS_NORM:
+ case GGML_OP_SOFT_MAX:
+ return true;
+
+ default:
+ return false;
+ }
+}
+
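
These are the ops whose result may overwrite one of their sources when the layouts match; the graph allocator exploits this by assigning the node the same offset as a parent that has exactly one child and no views. For example (illustrative):

// c = a + b: if a has no other consumers and the layouts match, the
// allocator can place c at a's offset instead of allocating new space
struct ggml_tensor * c = ggml_add(ctx, a, b);
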
// TODO: GGML_PAD ?
static size_t aligned_offset(const void * buffer, size_t offset, size_t alignment) {
assert(alignment && !(alignment & (alignment - 1))); // power of 2
return offset + align;
}
+// tallocr
+struct ggml_tallocr {
+ ggml_backend_buffer_t buffer;
+ void * base;
+ size_t alignment;
+ size_t offset;
+};
+
+ggml_tallocr_t ggml_tallocr_new(ggml_backend_buffer_t buffer) {
+ ggml_tallocr_t talloc = malloc(sizeof(struct ggml_tallocr));
+ if (talloc == NULL) {
+ return NULL;
+ }
+
+ void * base = ggml_backend_buffer_get_base(buffer);
+ size_t align = ggml_backend_buffer_get_alignment(buffer);
+
+ assert(align && !(align & (align - 1))); // power of 2
+
+ *talloc = (struct ggml_tallocr) {
+ /*.buffer = */ buffer,
+ /*.base = */ base,
+ /*.alignment = */ align,
+ /*.offset = */ aligned_offset(base, 0, align),
+ };
+ return talloc;
+}
+
+void ggml_tallocr_free(ggml_tallocr_t talloc) {
+ free(talloc);
+}
+
+void ggml_tallocr_alloc(ggml_tallocr_t talloc, struct ggml_tensor * tensor) {
+ size_t size = ggml_backend_buffer_get_alloc_size(talloc->buffer, tensor);
+ size = GGML_PAD(size, talloc->alignment);
+
+ if (talloc->offset + size > ggml_backend_buffer_get_size(talloc->buffer)) {
+ fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, available %zu)\n",
+ __func__, tensor->name, size, ggml_backend_buffer_get_size(talloc->buffer) - talloc->offset);
+ GGML_ASSERT(!"not enough space in the buffer");
+ return;
+ }
+
+ void * addr = (char *)ggml_backend_buffer_get_base(talloc->buffer) + talloc->offset;
+ talloc->offset += size;
+
+ assert(((uintptr_t)addr % talloc->alignment) == 0);
+
+ ggml_backend_tensor_alloc(talloc->buffer, tensor, addr);
+}
+
+// dynamic tensor allocator
+
struct free_block {
- void * addr;
+ size_t offset;
size_t size;
};
-struct ggml_tallocr {
- struct ggml_backend_buffer * buffer;
- bool buffer_owned;
- void * base;
+struct ggml_dyn_tallocr {
size_t alignment;
-
int n_free_blocks;
struct free_block free_blocks[MAX_FREE_BLOCKS];
-
size_t max_size;
- bool measure;
-
#ifdef GGML_ALLOCATOR_DEBUG
- struct ggml_tensor * allocated_tensors[1024];
+ struct {
+ const struct ggml_tensor * tensor;
+ size_t offset;
+ } allocated_tensors[1024];
#endif
};
#ifdef GGML_ALLOCATOR_DEBUG
-static void add_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
+static void add_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) {
- if (alloc->allocated_tensors[i] == NULL) {
- alloc->allocated_tensors[i] = tensor;
+ if (alloc->allocated_tensors[i].tensor == NULL) {
+ alloc->allocated_tensors[i].tensor = tensor;
+ alloc->allocated_tensors[i].offset = offset;
return;
}
}
GGML_ASSERT(!"out of allocated_tensors");
}
-static void remove_allocated_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
+static void remove_allocated_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, const struct ggml_tensor * tensor) {
for (int i = 0; i < 1024; i++) {
- if (alloc->allocated_tensors[i] == tensor ||
- (alloc->allocated_tensors[i] != NULL && alloc->allocated_tensors[i]->data == tensor->data)) {
- alloc->allocated_tensors[i] = NULL;
+ if (alloc->allocated_tensors[i].offset == offset) {
+ alloc->allocated_tensors[i].tensor = NULL;
return;
}
}
- printf("tried to free tensor %s not found\n", tensor->name);
+    fprintf(stderr, "tried to free tensor %s, but it was not found\n", tensor->name);
GGML_ASSERT(!"tensor not found");
}
#endif
-// check if a tensor is allocated by this buffer
-static bool ggml_tallocr_is_own(ggml_tallocr_t alloc, const struct ggml_tensor * tensor) {
- return tensor->buffer == alloc->buffer && (!tensor->view_src || tensor->view_src->buffer == alloc->buffer);
-}
-
-static bool ggml_is_view(struct ggml_tensor * t) {
- return t->view_src != NULL;
-}
-
-void ggml_tallocr_alloc(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
- GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
- GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
-
- size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
+static size_t ggml_dyn_tallocr_alloc(struct ggml_dyn_tallocr * alloc, size_t size, const struct ggml_tensor * tensor) {
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
if (block->size >= size) {
best_fit_block = alloc->n_free_blocks - 1;
} else {
- fprintf(stderr, "%s: not enough space in the buffer to allocate %s (needed %zu, largest block available %zu)\n",
- __func__, tensor->name, size, max_avail);
+ // this should never happen
+ fprintf(stderr, "%s: not enough space in the buffer to allocate %zu bytes, largest block available %zu bytes\n",
+ __func__, size, max_avail);
GGML_ASSERT(!"not enough space in the buffer");
- return;
+ GGML_UNREACHABLE();
}
}
struct free_block * block = &alloc->free_blocks[best_fit_block];
- void * addr = block->addr;
- block->addr = (char*)block->addr + size;
+ size_t offset = block->offset;
+ block->offset = offset + size;
block->size -= size;
if (block->size == 0) {
// remove block if empty
}
}
- AT_PRINTF("block %d, addr %p\n", best_fit_block, addr);
-
- tensor->data = addr;
- tensor->buffer = alloc->buffer;
- if (!alloc->measure) {
- ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
- }
+ AT_PRINTF("block %d, offset %zu\n", best_fit_block, offset);
#ifdef GGML_ALLOCATOR_DEBUG
- add_allocated_tensor(alloc, tensor);
- size_t cur_max = (char*)addr - (char*)alloc->base + size;
+ add_allocated_tensor(alloc, offset, tensor);
+ size_t cur_max = offset + size;
if (cur_max > alloc->max_size) {
- printf("max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
+ // sort allocated_tensors by offset
+ for (int i = 0; i < 1024; i++) {
+ for (int j = i + 1; j < 1024; j++) {
+ if (alloc->allocated_tensors[i].offset > alloc->allocated_tensors[j].offset) {
+ const struct ggml_tensor * tmp_tensor = alloc->allocated_tensors[i].tensor;
+ size_t tmp_offset = alloc->allocated_tensors[i].offset;
+ alloc->allocated_tensors[i].tensor = alloc->allocated_tensors[j].tensor;
+ alloc->allocated_tensors[i].offset = alloc->allocated_tensors[j].offset;
+ alloc->allocated_tensors[j].tensor = tmp_tensor;
+ alloc->allocated_tensors[j].offset = tmp_offset;
+ }
+ }
+ }
+ fprintf(stderr, "max_size = %.2f MB: tensors: ", cur_max / 1024.0 / 1024.0);
for (int i = 0; i < 1024; i++) {
- if (alloc->allocated_tensors[i]) {
- printf("%s (%.2f MB) ", alloc->allocated_tensors[i]->name, ggml_nbytes(alloc->allocated_tensors[i]) / 1024.0 / 1024.0);
+ if (alloc->allocated_tensors[i].tensor) {
+ fprintf(stderr, "%s [%zx-%zx] (%.2f MB) ", alloc->allocated_tensors[i].tensor->name,
+ alloc->allocated_tensors[i].offset,
+ alloc->allocated_tensors[i].offset + ggml_nbytes(alloc->allocated_tensors[i].tensor),
+ ggml_nbytes(alloc->allocated_tensors[i].tensor) / 1024.0 / 1024.0);
}
}
- printf("\n");
+ fprintf(stderr, "\n");
}
#endif
- alloc->max_size = MAX(alloc->max_size, (char*)addr - (char*)alloc->base + size);
-}
+ alloc->max_size = MAX(alloc->max_size, offset + size);
-// this is a very naive implementation, but for our case the number of free blocks should be very small
-static void ggml_tallocr_free_tensor(ggml_tallocr_t alloc, struct ggml_tensor * tensor) {
- if (ggml_tallocr_is_own(alloc, tensor) == false) {
- // the tensor was not allocated in this buffer
- // this can happen because the graph allocator will try to free weights and other tensors from different buffers
- // the easiest way to deal with this is just to ignore it
- // AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, (void *)tensor->buffer, (void *)alloc->buffer);
- return;
- }
+ return offset;
- void * ptr = tensor->data;
+ GGML_UNUSED(tensor);
+}
- size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
+// this is a very naive implementation, but for our case the number of free blocks should be very small
+static void ggml_dyn_tallocr_free_tensor(struct ggml_dyn_tallocr * alloc, size_t offset, size_t size, const struct ggml_tensor * tensor) {
size = aligned_offset(NULL, size, alloc->alignment);
- AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
+
+ AT_PRINTF("%s: freeing %s at %zu (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, offset, size, alloc->n_free_blocks);
#ifdef GGML_ALLOCATOR_DEBUG
- remove_allocated_tensor(alloc, tensor);
+ remove_allocated_tensor(alloc, offset, tensor);
#endif
// see if we can merge with an existing block
for (int i = 0; i < alloc->n_free_blocks; i++) {
struct free_block * block = &alloc->free_blocks[i];
// check if ptr is at the end of the block
- if ((char*)block->addr + block->size == ptr) {
+ if (block->offset + block->size == offset) {
block->size += size;
// check if we can merge with the next block
- if (i < alloc->n_free_blocks - 1 && (char*)block->addr + block->size == alloc->free_blocks[i+1].addr) {
+ if (i < alloc->n_free_blocks - 1 && block->offset + block->size == alloc->free_blocks[i+1].offset) {
block->size += alloc->free_blocks[i+1].size;
alloc->n_free_blocks--;
for (int j = i+1; j < alloc->n_free_blocks; j++) {
return;
}
// check if ptr is at the beginning of the block
- if ((char*)ptr + size == block->addr) {
- block->addr = ptr;
+ if (offset + size == block->offset) {
+ block->offset = offset;
block->size += size;
// check if we can merge with the previous block
- if (i > 0 && (char*)alloc->free_blocks[i-1].addr + alloc->free_blocks[i-1].size == block->addr) {
+ if (i > 0 && alloc->free_blocks[i-1].offset + alloc->free_blocks[i-1].size == block->offset) {
alloc->free_blocks[i-1].size += block->size;
alloc->n_free_blocks--;
for (int j = i; j < alloc->n_free_blocks; j++) {
GGML_ASSERT(alloc->n_free_blocks < MAX_FREE_BLOCKS && "out of free blocks");
// insert the new block in the correct position to keep the array sorted by address (to make merging blocks faster)
int insert_pos = 0;
- while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].addr < ptr) {
+ while (insert_pos < alloc->n_free_blocks && alloc->free_blocks[insert_pos].offset < offset) {
insert_pos++;
}
// shift all blocks from insert_pos onward to make room for the new block
alloc->free_blocks[i] = alloc->free_blocks[i-1];
}
// insert the new block
- alloc->free_blocks[insert_pos].addr = ptr;
+ alloc->free_blocks[insert_pos].offset = offset;
alloc->free_blocks[insert_pos].size = size;
alloc->n_free_blocks++;
+
+ GGML_UNUSED(tensor);
}
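
Concretely: with free blocks {[0,64), [128,192)}, freeing the range [64,128) first extends [0,64) to [0,128) and then merges it with [128,192), leaving the single block [0,192); freeing a range adjacent to no existing block instead inserts a new entry at its sorted position.
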
-void ggml_tallocr_reset(ggml_tallocr_t alloc) {
+static void ggml_dyn_tallocr_reset(struct ggml_dyn_tallocr * alloc) {
alloc->n_free_blocks = 1;
- size_t align_offset = aligned_offset(alloc->base, 0, alloc->alignment);
- alloc->free_blocks[0].addr = (char *)alloc->base + align_offset;
-
- if (alloc->measure) {
- alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
- } else {
- alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
- ggml_backend_buffer_reset(alloc->buffer);
- }
+ alloc->free_blocks[0].offset = 0;
+ alloc->free_blocks[0].size = SIZE_MAX/2; // restrict maximum size of a measure allocator to half size_t max to avoid overflows
+ alloc->max_size = 0;
}
-ggml_tallocr_t ggml_tallocr_new(void * data, size_t size, size_t alignment) {
- struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(data, size);
-
- ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
+static struct ggml_dyn_tallocr * ggml_dyn_tallocr_new(size_t alignment) {
+ struct ggml_dyn_tallocr * alloc = (struct ggml_dyn_tallocr *)malloc(sizeof(struct ggml_dyn_tallocr));
- *alloc = (struct ggml_tallocr) {
- /*.buffer = */ buffer,
- /*.buffer_owned = */ true,
- /*.base = */ ggml_backend_buffer_get_base(buffer),
+ *alloc = (struct ggml_dyn_tallocr) {
/*.alignment = */ alignment,
/*.n_free_blocks = */ 0,
/*.free_blocks = */ {{0}},
/*.max_size = */ 0,
- /*.measure = */ false,
#ifdef GGML_ALLOCATOR_DEBUG
- /*.allocated_tensors = */ {0},
+ /*.allocated_tensors = */ {{0}},
#endif
};
- ggml_tallocr_reset(alloc);
-
- return alloc;
-}
-
-ggml_tallocr_t ggml_tallocr_new_measure(size_t alignment) {
- ggml_tallocr_t alloc = ggml_tallocr_new((void *)0x1000, SIZE_MAX/2, alignment);
- alloc->measure = true;
+ ggml_dyn_tallocr_reset(alloc);
return alloc;
}
-ggml_tallocr_t ggml_tallocr_new_measure_from_buft(struct ggml_backend_buffer_type * buft) {
- // create a backend buffer to get the correct tensor allocation sizes
- ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, 1);
-
- // TODO: move alloc initialization to a common ggml_tallocr_new_impl function
- ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
- alloc->buffer_owned = true;
- alloc->measure = true;
- ggml_tallocr_reset(alloc);
- return alloc;
-}
-
-ggml_tallocr_t ggml_tallocr_new_measure_from_backend(struct ggml_backend * backend) {
- return ggml_tallocr_new_measure_from_buft(ggml_backend_get_default_buffer_type(backend));
-}
-
-ggml_tallocr_t ggml_tallocr_new_from_buft(struct ggml_backend_buffer_type * buft, size_t size) {
- // create a backend buffer to get the correct tensor allocation sizes
- ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
- ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(buffer);
- alloc->buffer_owned = true;
- return alloc;
-}
-
-ggml_tallocr_t ggml_tallocr_new_from_backend(struct ggml_backend * backend, size_t size) {
- return ggml_tallocr_new_from_buft(ggml_backend_get_default_buffer_type(backend), size);
-}
-
-ggml_tallocr_t ggml_tallocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
- ggml_tallocr_t alloc = (ggml_tallocr_t)malloc(sizeof(struct ggml_tallocr));
-
- *alloc = (struct ggml_tallocr) {
- /*.buffer = */ buffer,
- /*.buffer_owned = */ false,
- /*.base = */ ggml_backend_buffer_get_base(buffer),
- /*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
- /*.n_free_blocks = */ 0,
- /*.free_blocks = */ {{0}},
- /*.max_size = */ 0,
- /*.measure = */ false,
-#ifdef GGML_ALLOCATOR_DEBUG
- /*.allocated_tensors = */ {0},
-#endif
- };
-
- ggml_tallocr_reset(alloc);
-
- return alloc;
-}
-
-struct ggml_backend_buffer * ggml_tallocr_get_buffer(ggml_tallocr_t alloc) {
- return alloc->buffer;
-}
-
-void ggml_tallocr_free(ggml_tallocr_t alloc) {
- if (alloc == NULL) {
- return;
- }
-
- if (alloc->buffer_owned) {
- ggml_backend_buffer_free(alloc->buffer);
- }
+static void ggml_dyn_tallocr_free(struct ggml_dyn_tallocr * alloc) {
free(alloc);
}
-bool ggml_tallocr_is_measure(ggml_tallocr_t alloc) {
- return alloc->measure;
+static size_t ggml_dyn_tallocr_max_size(struct ggml_dyn_tallocr * alloc) {
+ return alloc->max_size;
}
-size_t ggml_tallocr_max_size(ggml_tallocr_t alloc) {
- // FIXME: changes in the tensor sizes compared to the measure graph may cause allocations to fail
- // to avoid this, we add a 10% margin to the buffer size
- return alloc->max_size + alloc->max_size/10;
-}
+
+/////////////////////////////////////
// graph allocator
struct hash_node {
int n_children;
int n_views;
+ int buffer_id;
+ size_t offset; // offset within the buffer
+ bool allocated;
+};
+
+struct tensor_alloc {
+ size_t offset;
+ size_t size_max; // 0 = pre-allocated, unused, or view
+};
+
+struct node_alloc {
+ int buffer_id;
+ struct tensor_alloc dst;
+ struct tensor_alloc src[GGML_MAX_SRC];
};
struct ggml_gallocr {
- ggml_tallocr_t talloc;
+ ggml_backend_buffer_type_t * bufts; // [n_buffers]
+ ggml_backend_buffer_t * buffers; // [n_buffers]
+ struct ggml_dyn_tallocr ** buf_tallocs; // [n_buffers]
+ int n_buffers;
+
struct ggml_hash_set hash_set;
- struct hash_node * hash_values;
- size_t hash_values_size;
- ggml_tallocr_t * hash_allocs;
- int * parse_seq;
- int parse_seq_len;
+ struct hash_node * hash_values; // [hash_set.size]
+
+ struct node_alloc * node_allocs; // [n_nodes]
+ int n_nodes;
};
-ggml_gallocr_t ggml_gallocr_new(void) {
- ggml_gallocr_t galloc = (ggml_gallocr_t)malloc(sizeof(struct ggml_gallocr));
-
- *galloc = (struct ggml_gallocr) {
- /*.talloc = */ NULL,
- /*.hash_set = */ {0},
- /*.hash_values = */ NULL,
- /*.hash_values_size = */ 0,
- /*.hash_allocs = */ NULL,
- /*.parse_seq = */ NULL,
- /*.parse_seq_len = */ 0,
- };
+ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
+ ggml_gallocr_t galloc = (ggml_gallocr_t)calloc(sizeof(struct ggml_gallocr), 1);
+ GGML_ASSERT(galloc != NULL);
+
+ galloc->bufts = calloc(sizeof(ggml_backend_buffer_type_t) * n_bufs, 1);
+ GGML_ASSERT(galloc->bufts != NULL);
+
+ galloc->buffers = calloc(sizeof(ggml_backend_buffer_t) * n_bufs, 1);
+ GGML_ASSERT(galloc->buffers != NULL);
+
+ galloc->buf_tallocs = calloc(sizeof(struct ggml_dyn_tallocr *) * n_bufs, 1);
+ GGML_ASSERT(galloc->buf_tallocs != NULL);
+
+ for (int i = 0; i < n_bufs; i++) {
+ galloc->bufts[i] = bufts[i];
+ galloc->buffers[i] = NULL;
+ size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
+ galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+ }
+ galloc->n_buffers = n_bufs;
return galloc;
}
+ggml_gallocr_t ggml_gallocr_new(ggml_backend_buffer_type_t buft) {
+ return ggml_gallocr_new_n(&buft, 1);
+}
+
void ggml_gallocr_free(ggml_gallocr_t galloc) {
if (galloc == NULL) {
return;
}
- if (galloc->hash_set.keys != NULL) {
- free(galloc->hash_set.keys);
- }
- if (galloc->hash_values != NULL) {
- free(galloc->hash_values);
- }
- if (galloc->hash_allocs != NULL) {
- free(galloc->hash_allocs);
- }
- if (galloc->parse_seq != NULL) {
- free(galloc->parse_seq);
+ for (int i = 0; i < galloc->n_buffers; i++) {
+ if (galloc->buffers != NULL) {
+ ggml_backend_buffer_free(galloc->buffers[i]);
+ }
+ if (galloc->buf_tallocs != NULL) {
+ ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+ }
}
+
+ free(galloc->hash_set.keys);
+ free(galloc->hash_values);
+ free(galloc->bufts);
+ free(galloc->buffers);
+ free(galloc->buf_tallocs);
+ free(galloc->node_allocs);
free(galloc);
}
-void ggml_gallocr_set_parse_seq(ggml_gallocr_t galloc, const int * list, int n) {
- free(galloc->parse_seq);
- galloc->parse_seq = malloc(sizeof(int) * n);
- for (int i = 0; i < n; i++) {
- galloc->parse_seq[i] = list[i];
- }
- galloc->parse_seq_len = n;
-}
-
-static struct hash_node * hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
+static struct hash_node * ggml_gallocr_hash_get(ggml_gallocr_t galloc, struct ggml_tensor * t) {
size_t i = ggml_hash_find_or_insert(galloc->hash_set, t);
return &galloc->hash_values[i];
}
-static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
- if (a->type != b->type) {
- return false;
- }
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
- if (a->ne[i] != b->ne[i]) {
- return false;
- }
- if (a->nb[i] != b->nb[i]) {
- return false;
- }
- }
- return true;
+static bool ggml_gallocr_is_own(ggml_gallocr_t galloc, struct ggml_tensor * t) {
+ return ggml_gallocr_hash_get(galloc, t)->allocated;
}
-static bool ggml_op_can_inplace(enum ggml_op op) {
- switch (op) {
- case GGML_OP_SCALE:
- case GGML_OP_DIAG_MASK_ZERO:
- case GGML_OP_DIAG_MASK_INF:
- case GGML_OP_ADD:
- case GGML_OP_ADD1:
- case GGML_OP_SUB:
- case GGML_OP_MUL:
- case GGML_OP_DIV:
- case GGML_OP_SQR:
- case GGML_OP_SQRT:
- case GGML_OP_LOG:
- case GGML_OP_UNARY:
- case GGML_OP_ROPE:
- case GGML_OP_RMS_NORM:
- case GGML_OP_SOFT_MAX:
- return true;
-
- default:
- return false;
- }
+static void ggml_gallocr_set_node_offset(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, size_t offset) {
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
+ hn->buffer_id = buffer_id;
+ hn->offset = offset;
+ hn->allocated = true;
}
-static ggml_tallocr_t node_tallocr(ggml_gallocr_t galloc, struct ggml_tensor * node) {
- if (galloc->talloc != NULL) {
- return galloc->talloc;
- }
-
- return galloc->hash_allocs[ggml_hash_find_or_insert(galloc->hash_set, node)];
+static bool ggml_gallocr_is_allocated(ggml_gallocr_t galloc, struct ggml_tensor * t) {
+ return t->data != NULL || ggml_gallocr_hash_get(galloc, t)->allocated;
}
-static void init_view(ggml_gallocr_t galloc, struct ggml_tensor * view, bool update_backend) {
- ggml_tallocr_t alloc = node_tallocr(galloc, view);
-
- GGML_ASSERT(view->view_src != NULL && view->view_src->data != NULL);
- if (update_backend) {
- view->backend = view->view_src->backend;
- }
- // views are initialized in the alloc buffer rather than the view_src buffer
- view->buffer = alloc->buffer;
- view->data = (char *)view->view_src->data + view->view_offs;
+static void ggml_gallocr_allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
- assert(ggml_tallocr_is_measure(alloc) || !view->buffer || view->buffer->buft == alloc->buffer->buft);
+ if (!ggml_gallocr_is_allocated(galloc, node) && !ggml_is_view(node)) {
+ hn->allocated = true;
+ assert(hn->offset == 0);
- if (!alloc->measure) {
- ggml_backend_buffer_init_tensor(alloc->buffer, view);
- }
-}
+ // try to reuse a parent's buffer (inplace)
+ if (ggml_op_can_inplace(node->op)) {
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ struct ggml_tensor * parent = node->src[i];
+ if (parent == NULL) {
+ break;
+ }
-static void allocate_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
- ggml_tallocr_t alloc = node_tallocr(galloc, node);
+ // if the node's data is external, then we cannot re-use it
+ if (!ggml_gallocr_is_own(galloc, parent)) {
+ AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
+ continue;
+ }
- if (node->data == NULL) {
- if (ggml_is_view(node)) {
- init_view(galloc, node, true);
- } else {
- // see if we can reuse a parent's buffer (inplace)
- if (ggml_op_can_inplace(node->op)) {
- for (int i = 0; i < GGML_MAX_SRC; i++) {
- struct ggml_tensor * parent = node->src[i];
- if (parent == NULL) {
- break;
- }
+ // outputs cannot be reused
+ if (parent->flags & GGML_TENSOR_FLAG_OUTPUT || (parent->view_src != NULL && parent->view_src->flags & GGML_TENSOR_FLAG_OUTPUT)) {
+ AT_PRINTF("not reusing parent %s for %s as it is an output\n", parent->name, node->name);
+ continue;
+ }
- // if the node's data is external, then we cannot re-use it
- if (ggml_tallocr_is_own(alloc, parent) == false) {
- AT_PRINTF("not reusing parent %s for %s as %p is external\n", parent->name, node->name, parent->data);
- continue;
- }
+ if (!ggml_are_same_layout(node, parent)) {
+ AT_PRINTF("not reusing parent %s for %s as layouts are different\n", parent->name, node->name);
+ continue;
+ }
- struct hash_node * p_hn = hash_get(galloc, parent);
- if (parent->data != NULL && p_hn->n_children == 1 && p_hn->n_views == 0 && ggml_are_same_layout(node, parent)) {
- if (ggml_is_view(parent)) {
- struct ggml_tensor * view_src = parent->view_src;
- struct hash_node * view_src_hn = hash_get(galloc, view_src);
- if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
- // TODO: the offset of the view parent must be kept to ensure that the op doesn't overwrite
- // the parent's data that it will need later (same layout requirement). the problem is that then
- // we cannot free the tensor because the original address of the allocation is lost.
- // adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
- // for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
- AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
- node->view_src = view_src;
- view_src_hn->n_views += 1;
- init_view(galloc, node, false);
- return;
- }
- } else {
- AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
- node->view_src = parent;
- p_hn->n_views += 1;
- init_view(galloc, node, false);
+ struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
+ if (p_hn->n_children == 1 && p_hn->n_views == 0) {
+ if (ggml_is_view(parent)) {
+ struct ggml_tensor * view_src = parent->view_src;
+ struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
+ if (view_src_hn->n_views == 1 && view_src_hn->n_children == 0 && view_src->data == parent->data) {
+ AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
+ assert(view_src_hn->offset == p_hn->offset);
+ hn->buffer_id = p_hn->buffer_id;
+ hn->offset = p_hn->offset;
+ p_hn->allocated = false; // avoid freeing the parent
+ view_src_hn->allocated = false;
return;
}
+ } else {
+ AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
+ hn->buffer_id = p_hn->buffer_id;
+ hn->offset = p_hn->offset;
+ p_hn->allocated = false; // avoid freeing the parent
+ return;
}
}
}
- ggml_tallocr_alloc(alloc, node);
}
+ // allocate tensor from the buffer
+ struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+ ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
+ size_t size = ggml_backend_buft_get_alloc_size(buft, node);
+ size_t offset = ggml_dyn_tallocr_alloc(alloc, size, node);
+ hn->buffer_id = buffer_id;
+ hn->offset = offset;
+ return;
}
}
-static void free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
- ggml_tallocr_t alloc = node_tallocr(galloc, node);
+static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+ // graph outputs are never freed
+ if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
+ AT_PRINTF("not freeing output %s\n", node->name);
+ return;
+ }
- ggml_tallocr_free_tensor(alloc, node);
+ struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+ ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
+ size_t offset = hn->offset;
+ size_t size = ggml_backend_buft_get_alloc_size(buft, node);
+ ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
+ hn->allocated = false;
}
-static void ggml_tallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * gf) {
- const int * parse_seq = galloc->parse_seq;
- int parse_seq_len = galloc->parse_seq_len;
+static int get_node_buffer_id(const int * node_buffer_ids, int i) {
+ return node_buffer_ids ? node_buffer_ids[i] : 0;
+}
+
+static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
+ // clear hash tables
+ memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
+ memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
+
+ // allocate all graph inputs first to avoid overwriting them
+ for (int i = 0; i < graph->n_nodes; i++) {
+ if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
+ ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
+ }
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ if (graph->nodes[i]->src[j] == NULL) {
+ break;
+ }
+ if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
+ ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
+ }
+ }
+ }
// count number of children and views
- for (int i = 0; i < gf->n_nodes; i++) {
- struct ggml_tensor * node = gf->nodes[i];
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct ggml_tensor * node = graph->nodes[i];
if (ggml_is_view(node)) {
struct ggml_tensor * view_src = node->view_src;
- hash_get(galloc, view_src)->n_views += 1;
- if (node->buffer == NULL && node->data != NULL) {
- // view of a pre-allocated tensor, didn't call init_view() yet
- init_view(galloc, node, true);
- }
+ ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
}
for (int j = 0; j < GGML_MAX_SRC; j++) {
if (parent == NULL) {
break;
}
- hash_get(galloc, parent)->n_children += 1;
- if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
- init_view(galloc, parent, true);
- }
+ ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
}
}
// allocate tensors
- // if we have parse_seq then we allocate nodes following the list, and we only free nodes at barriers
- int last_barrier_pos = 0;
- int n_nodes = parse_seq_len ? parse_seq_len : gf->n_nodes;
-
- for (int ind = 0; ind < n_nodes; ind++) {
- // allocate a node if there is no parse_seq or this is not a barrier
- if (parse_seq_len == 0 || parse_seq[ind] != -1) {
- int i = parse_seq_len ? parse_seq[ind] : ind;
- struct ggml_tensor * node = gf->nodes[i];
-
- // allocate parents (leafs)
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- break;
- }
- allocate_node(galloc, parent);
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct ggml_tensor * node = graph->nodes[i];
+ int buffer_id = get_node_buffer_id(node_buffer_ids, i);
+
+ // allocate parents (only leafs need to be allocated at this point)
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * parent = node->src[j];
+ if (parent == NULL) {
+ break;
}
+ ggml_gallocr_allocate_node(galloc, parent, buffer_id);
+ }
- // allocate node
- allocate_node(galloc, node);
+ // allocate node
+ ggml_gallocr_allocate_node(galloc, node, buffer_id);
- AT_PRINTF("exec: %s (%s) <= ", ggml_op_name(node->op), node->name);
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- break;
- }
- AT_PRINTF("%s", parent->name);
- if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
- AT_PRINTF(", ");
- }
+ AT_PRINTF("exec: %s (%s) <= ", ggml_op_desc(node), node->name);
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * parent = node->src[j];
+ if (parent == NULL) {
+ break;
+ }
+ AT_PRINTF("%s", parent->name);
+ if (j < GGML_MAX_SRC - 1 && node->src[j + 1] != NULL) {
+ AT_PRINTF(", ");
}
- AT_PRINTF("\n");
}
+ AT_PRINTF("\n");
// update parents
- // update immediately if there is no parse_seq
- // update only at barriers if there is parse_seq
- if ((parse_seq_len == 0) || parse_seq[ind] == -1) {
- int update_start = parse_seq_len ? last_barrier_pos : ind;
- int update_end = parse_seq_len ? ind : ind + 1;
- for (int i = update_start; i < update_end; i++) {
- int node_i = parse_seq_len ? parse_seq[i] : i;
- struct ggml_tensor * node = gf->nodes[node_i];
-
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
- break;
- }
- struct hash_node * p_hn = hash_get(galloc, parent);
- p_hn->n_children -= 1;
-
- //AT_PRINTF("parent %s: %d children, %d views\n", parent->name, parent->n_children, parent->n_views);
-
- if (p_hn->n_children == 0 && p_hn->n_views == 0) {
- if (ggml_is_view(parent)) {
- struct ggml_tensor * view_src = parent->view_src;
- struct hash_node * view_src_hn = hash_get(galloc, view_src);
- view_src_hn->n_views -= 1;
- AT_PRINTF("view_src %s: %d children, %d views\n", view_src->name, view_src_hn->n_children, view_src_hn->n_views);
- if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0) {
- free_node(galloc, view_src);
- }
- }
- else {
- free_node(galloc, parent);
- }
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * parent = node->src[j];
+ if (parent == NULL) {
+ break;
+ }
+ struct hash_node * p_hn = ggml_gallocr_hash_get(galloc, parent);
+ p_hn->n_children -= 1;
+
+ AT_PRINTF("parent %s: %d children, %d views, allocated: %d\n",
+ parent->name, p_hn->n_children, p_hn->n_views, p_hn->allocated);
+
+ if (p_hn->n_children == 0 && p_hn->n_views == 0) {
+ if (ggml_is_view(parent)) {
+ struct ggml_tensor * view_src = parent->view_src;
+ struct hash_node * view_src_hn = ggml_gallocr_hash_get(galloc, view_src);
+ view_src_hn->n_views -= 1;
+ AT_PRINTF("view_src %s: %d children, %d views\n",
+ view_src->name, view_src_hn->n_children, view_src_hn->n_views);
+ if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
+ ggml_gallocr_free_node(galloc, view_src, buffer_id);
}
}
+ else if (p_hn->allocated) {
+ ggml_gallocr_free_node(galloc, parent, buffer_id);
+ }
}
AT_PRINTF("\n");
- if (parse_seq_len) {
- last_barrier_pos = ind + 1;
- }
}
}
}
-size_t ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, ggml_tallocr_t talloc, struct ggml_cgraph * graph) {
+bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, const int * node_buffer_ids) {
size_t hash_size = graph->visited_hash_table.size;
- // check if the hash table is initialized and large enough
+ // initialize hash table
if (galloc->hash_set.size < hash_size) {
- if (galloc->hash_set.keys != NULL) {
- free(galloc->hash_set.keys);
- }
- if (galloc->hash_values != NULL) {
- free(galloc->hash_values);
- }
- galloc->hash_set.keys = malloc(sizeof(struct ggml_tensor *) * hash_size);
+ free(galloc->hash_set.keys);
+ free(galloc->hash_values);
galloc->hash_set.size = hash_size;
- galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
+ galloc->hash_set.keys = calloc(sizeof(struct ggml_tensor *), hash_size);
+ galloc->hash_values = calloc(sizeof(struct hash_node), hash_size);
+ GGML_ASSERT(galloc->hash_set.keys != NULL);
+ GGML_ASSERT(galloc->hash_values != NULL);
+ } else {
+ // reset hash table
+ memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * galloc->hash_set.size);
+ memset(galloc->hash_values, 0, sizeof(struct hash_node) * galloc->hash_set.size);
}
- // reset hash table
- memset(galloc->hash_set.keys, 0, sizeof(struct ggml_tensor *) * hash_size);
- memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
-
- galloc->talloc = talloc;
- ggml_tallocr_alloc_graph_impl(galloc, graph);
- galloc->talloc = NULL;
-
- size_t max_size = ggml_tallocr_max_size(talloc);
-
- return max_size;
-}
-
-void ggml_gallocr_alloc_graph_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, struct ggml_hash_set hash_set, ggml_tallocr_t * hash_node_talloc) {
- const size_t hash_size = hash_set.size;
-
- GGML_ASSERT(hash_size >= (size_t)(graph->n_nodes + graph->n_leafs));
+ // reset allocators
+ for (int i = 0; i < galloc->n_buffers; i++) {
+ ggml_dyn_tallocr_reset(galloc->buf_tallocs[i]);
+ }
- galloc->talloc = NULL;
+ // allocate in hash table
+ ggml_gallocr_alloc_graph_impl(galloc, graph, node_buffer_ids);
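+
+    // the pass above only computes tensor offsets and buffer sizes inside the virtual
+    // dynamic allocators - the actual backend buffers are (re)allocated below if needed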
- // alloc hash_values if needed
- if (galloc->hash_values == NULL || galloc->hash_values_size < hash_size) {
- free(galloc->hash_values);
- galloc->hash_values = malloc(sizeof(struct hash_node) * hash_size);
- galloc->hash_values_size = hash_size;
+ // set the node_allocs from the hash table
+ if (galloc->n_nodes < graph->n_nodes) {
+ free(galloc->node_allocs);
+ galloc->node_allocs = calloc(sizeof(struct node_alloc), graph->n_nodes);
+ GGML_ASSERT(galloc->node_allocs != NULL);
}
-
- // free hash_set.keys if needed
- if (galloc->hash_set.keys != NULL) {
- free(galloc->hash_set.keys);
+ galloc->n_nodes = graph->n_nodes;
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct ggml_tensor * node = graph->nodes[i];
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];
+ node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
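+        // offset == SIZE_MAX is a sentinel for tensors that this allocator does not
+        // manage itself (views and pre-allocated tensors)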
+ if (node->view_src || node->data) {
+ node_alloc->dst.offset = SIZE_MAX;
+ node_alloc->dst.size_max = 0;
+ } else {
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
+ node_alloc->dst.offset = hn->offset;
+ node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+ }
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * src = node->src[j];
+ if (!src || src->view_src || src->data) {
+ node_alloc->src[j].offset = SIZE_MAX;
+ node_alloc->src[j].size_max = 0;
+ } else {
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
+ node_alloc->src[j].offset = hn->offset;
+ node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
+ }
+ }
}
- galloc->hash_set = hash_set;
- // reset hash values
- memset(galloc->hash_values, 0, sizeof(struct hash_node) * hash_size);
+ // reallocate buffers if needed
+ for (int i = 0; i < galloc->n_buffers; i++) {
+ size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
+ size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
- galloc->hash_allocs = hash_node_talloc;
-
- ggml_tallocr_alloc_graph_impl(galloc, graph);
+ if (new_size > cur_size) {
+#ifndef NDEBUG
+ fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
+#endif
+ ggml_backend_buffer_free(galloc->buffers[i]);
+ galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
+ if (galloc->buffers[i] == NULL) {
+ fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), new_size);
+ return false;
+ }
+ }
+ }
- // remove unowned resources
- galloc->hash_set.keys = NULL;
- galloc->hash_allocs = NULL;
+ return true;
}
-// legacy API wrapper
-
-struct ggml_allocr {
- ggml_tallocr_t talloc;
- ggml_gallocr_t galloc;
-};
-
-static ggml_allocr_t ggml_allocr_new_impl(ggml_tallocr_t talloc) {
- ggml_allocr_t alloc = (ggml_allocr_t)malloc(sizeof(struct ggml_allocr));
- *alloc = (struct ggml_allocr) {
- /*.talloc = */ talloc,
- /*.galloc = */ ggml_gallocr_new(),
- };
- return alloc;
+bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
+ return ggml_gallocr_reserve_n(galloc, graph, NULL);
}
-ggml_allocr_t ggml_allocr_new(void * data, size_t size, size_t alignment) {
- return ggml_allocr_new_impl(ggml_tallocr_new(data, size, alignment));
-}
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
+ assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
-ggml_allocr_t ggml_allocr_new_measure(size_t alignment) {
- return ggml_allocr_new_impl(ggml_tallocr_new_measure(alignment));
-}
+ if (node->view_src != NULL) {
+ if (node->buffer == NULL) {
+ assert(tensor_alloc->offset == SIZE_MAX);
+ if (node->view_src->buffer == NULL) {
+ // this tensor was allocated without ggml-backend
+ return;
+ }
+ ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
+ }
+ } else {
+ if (node->data == NULL) {
+ assert(tensor_alloc->offset != SIZE_MAX);
+ assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
+ void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
+ void * addr = (char *)base + tensor_alloc->offset;
+ ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
+ } else {
+ if (node->buffer == NULL) {
+ // this tensor was allocated without ggml-backend
+ return;
+ }
-ggml_allocr_t ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
- return ggml_allocr_new_impl(ggml_tallocr_new_from_buffer(buffer));
+#ifndef NDEBUG
+ size_t offset =
+ (char *)node->data -
+ (char *)ggml_backend_buffer_get_base(node->buffer);
+ size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
+ assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
+ assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
+#endif
+ }
+ }
}
-ggml_allocr_t ggml_allocr_new_from_backend(struct ggml_backend * backend, size_t size) {
- return ggml_allocr_new_impl(ggml_tallocr_new_from_backend(backend, size));
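+// note: despite its name, this helper returns true when the previous allocation is
+// still large enough for the node, i.e. when *no* reallocation is needed for it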
+static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
+ ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
+ size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
+ return talloc->size_max >= node_size;
}
-ggml_allocr_t ggml_allocr_new_measure_from_backend(struct ggml_backend * backend) {
- return ggml_allocr_new_impl(ggml_tallocr_new_measure_from_backend(backend));
-}
+static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
+ if (galloc->n_nodes != graph->n_nodes) {
+#ifndef NDEBUG
+ fprintf(stderr, "%s: graph has different number of nodes\n", __func__);
+#endif
+ return true;
+ }
-struct ggml_backend_buffer * ggml_allocr_get_buffer(ggml_allocr_t alloc) {
- return ggml_tallocr_get_buffer(alloc->talloc);
-}
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct ggml_tensor * node = graph->nodes[i];
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];
-void ggml_allocr_set_parse_seq(ggml_allocr_t alloc, const int * list, int n) {
- ggml_gallocr_set_parse_seq(alloc->galloc, list, n);
-}
+ if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
+#ifndef NDEBUG
+ fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
+#endif
+ return true;
+ }
-void ggml_allocr_free(ggml_allocr_t alloc) {
- if (alloc == NULL) {
- return;
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * src = node->src[j];
+ if (src == NULL) {
+ break;
+ }
+ if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
+#ifndef NDEBUG
+ fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
+#endif
+ return true;
+ }
+ }
}
- ggml_gallocr_free(alloc->galloc);
- ggml_tallocr_free(alloc->talloc);
- free(alloc);
+ return false;
}
-bool ggml_allocr_is_measure(ggml_allocr_t alloc) {
- return ggml_tallocr_is_measure(alloc->talloc);
-}
+bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph) {
+ if (ggml_gallocr_needs_realloc(galloc, graph)) {
+ if (galloc->n_buffers == 1) {
+#ifndef NDEBUG
+ fprintf(stderr, "%s: reallocating buffers automatically\n", __func__);
+#endif
+ if (!ggml_gallocr_reserve(galloc, graph)) {
+ return false;
+ }
+ } else {
+#ifndef NDEBUG
+ fprintf(stderr, "%s: cannot reallocate multi buffer graph automatically, call reserve\n", __func__);
+#endif
+ return false;
+ }
+ }
-void ggml_allocr_reset(ggml_allocr_t alloc) {
- ggml_tallocr_reset(alloc->talloc);
-}
+ // reset buffers
+ for (int i = 0; i < galloc->n_buffers; i++) {
+ // zero size buffers are not allocated
+ if (galloc->buffers[i] != NULL) {
+ ggml_backend_buffer_reset(galloc->buffers[i]);
+ }
+ }
-void ggml_allocr_alloc(ggml_allocr_t alloc, struct ggml_tensor * tensor) {
- ggml_tallocr_alloc(alloc->talloc, tensor);
-}
+ // allocate the graph tensors from the previous assignments
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct ggml_tensor * node = graph->nodes[i];
+ struct node_alloc * node_alloc = &galloc->node_allocs[i];
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * src = node->src[j];
+ if (src == NULL) {
+ break;
+ }
+ ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
+ }
+ ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
+ }
-size_t ggml_allocr_max_size(ggml_allocr_t alloc) {
- return ggml_tallocr_max_size(alloc->talloc);
+ return true;
}
-size_t ggml_allocr_alloc_graph(ggml_allocr_t alloc, struct ggml_cgraph * graph) {
- return ggml_gallocr_alloc_graph(alloc->galloc, alloc->talloc, graph);
+size_t ggml_gallocr_get_buffer_size(ggml_gallocr_t galloc, int buffer_id) {
+ GGML_ASSERT(buffer_id >= 0 && buffer_id < galloc->n_buffers);
+
+ if (galloc->buffers[buffer_id] == NULL) {
+ return 0;
+ }
+ return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
}
// utils
return false;
}
- ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
+ struct ggml_tallocr * tallocr = ggml_tallocr_new(buffer);
for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) {
if (t->view_src == NULL) {
ggml_tallocr_alloc(tallocr, t);
- } else {
+ } else if (t->buffer == NULL) {
ggml_backend_view_init(buffer, t);
}
} else {
- if (t->view_src != NULL) {
+ if (t->view_src != NULL && t->buffer == NULL) {
// view of a pre-allocated tensor
ggml_backend_view_init(buffer, t);
}
}
if (this_size > max_size) {
- // tensor is too large to fit in a single buffer
fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
__func__, t->name,
ggml_backend_buft_name(buft),
}
if (n_buffers == 0) {
- // all the tensors in the context are already allocated
#ifndef NDEBUG
fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
#endif
// backend CPU
+static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
+
GGML_CALL static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
return "CPU";
}
GGML_CALL static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
- return (void *)buffer->context;
+ uintptr_t data = (uintptr_t)buffer->context;
+
+ // align the buffer
+ if (data % TENSOR_ALIGNMENT != 0) {
+ data = GGML_PAD(data, TENSOR_ALIGNMENT);
+ }
+
+ return (void *)data;
}
GGML_CALL static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
/* .reset = */ NULL,
};
-static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
-
GGML_CALL static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "CPU";
GGML_CALL static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
- void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
-
- GGML_ASSERT(data != NULL && "failed to allocate buffer");
+ void * data = malloc(size); // TODO: use GGML_ALIGNED_MALLOC (move to ggml-impl.h)
+ if (data == NULL) {
+ fprintf(stderr, "%s: failed to allocate buffer of size %zu\n", __func__, size);
+ return NULL;
+ }
return ggml_backend_buffer_init(buft, cpu_backend_buffer_i, data, size);
}
ggml_backend_t ggml_backend_cpu_init(void) {
struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
+ if (ctx == NULL) {
+ return NULL;
+ }
ctx->n_threads = GGML_DEFAULT_N_THREADS;
ctx->work_data = NULL;
ctx->abort_callback_data = NULL;
ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
+ if (cpu_backend == NULL) {
+ free(ctx);
+ return NULL;
+ }
*cpu_backend = (struct ggml_backend) {
/* .interface = */ cpu_backend_i,
ctx->n_buffers = n_buffers;
ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
+ GGML_ASSERT(ctx->buffers != NULL);
+
size_t total_size = 0;
for (size_t i = 0; i < n_buffers; i++) {
ctx->buffers[i] = buffers[i];
}
}
+// creates a copy of the tensor with the same memory layout
+static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
+ struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ dup->nb[i] = tensor->nb[i];
+ }
+ return dup;
+}
+
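+// view ops reuse the memory of their source tensor, so they never need an
+// allocation of their own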
+static bool ggml_is_view_op(enum ggml_op op) {
+ return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
+}
// scheduler
#define GGML_MAX_SPLIT_INPUTS 16
struct ggml_backend_sched_split {
- ggml_tallocr_t tallocr;
+ int backend_id;
int i_start;
int i_end;
struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
int n_backends;
ggml_backend_t backends[GGML_MAX_BACKENDS];
ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
- ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
ggml_gallocr_t galloc;
// hash keys of the nodes in the graph
struct ggml_hash_set hash_set;
- // hash values (arrays of [hash_set.size])
- ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend)
- struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend
+ // hash values
+ int * tensor_backend_id;
+ struct ggml_tensor * (* tensor_copies)[GGML_MAX_BACKENDS];
+
+ int * node_backend_ids; // [n_nodes]
+ int n_nodes;
// copy of the graph with modified inputs
struct ggml_cgraph * graph;
struct ggml_context * ctx;
+ ggml_backend_sched_eval_callback callback_eval;
+ void * callback_eval_user_data;
+
// align context_buffer to GGML_MEM_ALIGN
#ifdef _MSC_VER
__declspec(align(GGML_MEM_ALIGN))
#else
__attribute__((aligned(GGML_MEM_ALIGN)))
#endif
- char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
-
- ggml_backend_sched_eval_callback callback_eval;
- void * callback_eval_user_data;
+ char context_buffer[GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS*2*sizeof(struct ggml_tensor) + sizeof(struct ggml_cgraph)];
};
#define hash_id(node) ggml_hash_find_or_insert(sched->hash_set, node)
-#define node_allocr(node) sched->node_talloc[hash_id(node)]
-
-static bool ggml_is_view_op(enum ggml_op op) {
- return op == GGML_OP_VIEW || op == GGML_OP_RESHAPE || op == GGML_OP_PERMUTE || op == GGML_OP_TRANSPOSE;
-}
+#define tensor_backend_id(node) sched->tensor_backend_id[hash_id(node)]
+#define tensor_backend(node) (tensor_backend_id(node) == -1 ? NULL : sched->backends[tensor_backend_id(node)])
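+
+// note: like hash_id(), these macros expand to expressions that read the local
+// variable `sched`, so they can only be used where one is in scope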
-// returns the priority of the backend, lower is better
-static int sched_backend_prio(ggml_backend_sched_t sched, ggml_backend_t backend) {
+// returns the id of the backend in the scheduler (a lower id means a higher priority), or -1 if it is not found
+static int ggml_backend_sched_backend_id(ggml_backend_sched_t sched, ggml_backend_t backend) {
for (int i = 0; i < sched->n_backends; i++) {
if (sched->backends[i] == backend) {
return i;
}
}
- return INT_MAX;
+ return -1;
}
-static int sched_allocr_prio(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
- for (int i = 0; i < sched->n_backends; i++) {
- if (sched->tallocs[i] == allocr) {
- return i;
- }
- }
- return INT_MAX;
-}
-
-static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
+static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
if (buffer == NULL) {
- return NULL;
- }
-
- // check if this is already allocate in a allocr buffer (from user manual allocations)
- for (int i = 0; i < sched->n_backends; i++) {
- if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) {
- return sched->tallocs[i];
- }
+ return -1;
}
// find highest prio backend that supports the buffer type
for (int i = 0; i < sched->n_backends; i++) {
if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
- return sched->tallocs[i];
+ return i;
}
}
GGML_ASSERT(false && "tensor buffer type not supported by any backend");
}
-static ggml_backend_t get_allocr_backend(ggml_backend_sched_t sched, ggml_tallocr_t allocr) {
- if (allocr == NULL) {
- return NULL;
- }
- for (int i = 0; i < sched->n_backends; i++) {
- if (sched->tallocs[i] == allocr) {
- return sched->backends[i];
- }
- }
- GGML_UNREACHABLE();
-}
-
#if 0
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
#endif
// returns the backend that should be used for the node based on the current locations
-static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+static int ggml_backend_sched_backend_id_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * tensor) {
+ // TODO: use supports_op to check if the backend supports the op
+
// assign pre-allocated nodes to their backend
// dst
- ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer);
- if (cur_allocr != NULL) {
+ int cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->buffer);
+ if (cur_backend != -1) {
SET_CAUSE(node, "1.dst");
- return cur_allocr;
+ return cur_backend;
}
// view_src
- if (node->view_src != NULL) {
- cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer);
- if (cur_allocr != NULL) {
+ if (tensor->view_src != NULL) {
+ cur_backend = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src->buffer);
+ if (cur_backend != -1) {
SET_CAUSE(node, "1.vsrc");
- return cur_allocr;
+ return cur_backend;
}
}
// assign nodes that use weights to the backend of the weights
for (int i = 0; i < GGML_MAX_SRC; i++) {
- const struct ggml_tensor * src = node->src[i];
+ const struct ggml_tensor * src = tensor->src[i];
if (src == NULL) {
break;
}
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
- ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer);
+ int src_backend = ggml_backend_sched_backend_from_buffer(sched, src->buffer);
// operations with weights are always run on the same backend as the weights
SET_CAUSE(node, "1.wgt%d", i);
- return src_allocr;
+ return src_backend;
}
}
- return NULL;
+ return -1;
}
static char * fmt_size(size_t size) {
return buffer;
}
-static void sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+static void ggml_backend_sched_print_assignments(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
int cur_split = 0;
for (int i = 0; i < graph->n_nodes; i++) {
if (cur_split < sched->n_splits && i == sched->splits[cur_split].i_start) {
- ggml_backend_t split_backend = get_allocr_backend(sched, sched->splits[cur_split].tallocr);
+ ggml_backend_t split_backend = sched->backends[sched->splits[cur_split].backend_id];
fprintf(stderr, "\n## SPLIT #%d: %s # %d inputs: ", cur_split, ggml_backend_name(split_backend),
sched->splits[cur_split].n_inputs);
for (int j = 0; j < sched->splits[cur_split].n_inputs; j++) {
if (ggml_is_view_op(node->op)) {
continue;
}
- ggml_tallocr_t node_allocr = node_allocr(node);
- ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
+ ggml_backend_t tensor_backend = tensor_backend(node);
fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
- fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
+ fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
break;
}
- ggml_tallocr_t src_allocr = node_allocr(src);
- ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
+ ggml_backend_t src_backend = tensor_backend(src);
fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
}
}
}
-// creates a copy of the tensor with the same memory layout
-static struct ggml_tensor * ggml_dup_tensor_layout(struct ggml_context * ctx, const struct ggml_tensor * tensor) {
- struct ggml_tensor * dup = ggml_dup_tensor(ctx, tensor);
- for (int i = 0; i < GGML_MAX_DIMS; i++) {
- dup->nb[i] = tensor->nb[i];
- }
- return dup;
-}
-
-
//#define DEBUG_PASS1
//#define DEBUG_PASS2
//#define DEBUG_PASS3
//#define DEBUG_PASS4
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
-static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
// reset splits
sched->n_splits = 0;
sched->is_reset = false;
// pass 1: assign backends to ops with pre-allocated inputs
for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i];
- if (node_allocr(leaf) != NULL) {
+ if (tensor_backend_id(leaf) != -1) {
// do not overwrite user assignments
continue;
}
- node_allocr(leaf) = sched_allocr_from_cur(sched, leaf);
+ tensor_backend_id(leaf) = ggml_backend_sched_backend_id_from_cur(sched, leaf);
}
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
- if (node_allocr(node) != NULL) {
+ if (tensor_backend_id(node) != -1) {
// do not overwrite user assignments
continue;
}
- node_allocr(node) = sched_allocr_from_cur(sched, node);
+ tensor_backend_id(node) = ggml_backend_sched_backend_id_from_cur(sched, node);
// src
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
break;
}
- if (node_allocr(src) == NULL) {
- node_allocr(src) = sched_allocr_from_cur(sched, src);
+ if (tensor_backend_id(src) == -1) {
+ tensor_backend_id(src) = ggml_backend_sched_backend_id_from_cur(sched, src);
}
}
}
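+    // pass 2: expand the existing assignments to neighboring unassigned nodes;
+    // passes 2.1/2.2 skip the CPU backend (lowest priority), passes 2.3/2.4 then
+    // expand whatever assignments remain, so the CPU is only used when necessary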
// pass 2.1 expand gpu up
{
- ggml_tallocr_t cur_allocr = NULL;
+ int cur_backend_id = -1;
for (int i = graph->n_nodes - 1; i >= 0; i--) {
struct ggml_tensor * node = graph->nodes[i];
if (ggml_is_view_op(node->op)) {
continue;
}
- ggml_tallocr_t node_allocr = node_allocr(node);
- if (node_allocr != NULL) {
- if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+ int tensor_backend_id = tensor_backend_id(node);
+ if (tensor_backend_id != -1) {
+ if (tensor_backend_id == sched->n_backends - 1) {
// skip cpu (lowest prio backend)
- cur_allocr = NULL;
+ cur_backend_id = -1;
} else {
- cur_allocr = node_allocr;
+ cur_backend_id = tensor_backend_id;
}
} else {
- node_allocr(node) = cur_allocr;
+ tensor_backend_id(node) = cur_backend_id;
SET_CAUSE(node, "2.1");
}
}
// pass 2.2 expand gpu down
{
- ggml_tallocr_t cur_allocr = NULL;
+ int cur_backend_id = -1;
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
if (ggml_is_view_op(node->op)) {
continue;
}
- ggml_tallocr_t node_allocr = node_allocr(node);
- if (node_allocr != NULL) {
- if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+ int tensor_backend_id = tensor_backend_id(node);
+ if (tensor_backend_id != -1) {
+ if (tensor_backend_id == sched->n_backends - 1) {
// skip cpu (lowest prio backend)
- cur_allocr = NULL;
+ cur_backend_id = -1;
} else {
- cur_allocr = node_allocr;
+ cur_backend_id = tensor_backend_id;
}
} else {
- node_allocr(node) = cur_allocr;
+ tensor_backend_id(node) = cur_backend_id;
SET_CAUSE(node, "2.2");
}
}
// pass 2.3 expand rest up
{
- ggml_tallocr_t cur_allocr = NULL;
+ int cur_backend_id = -1;
for (int i = graph->n_nodes - 1; i >= 0; i--) {
struct ggml_tensor * node = graph->nodes[i];
if (ggml_is_view_op(node->op)) {
continue;
}
- ggml_tallocr_t node_allocr = node_allocr(node);
- if (node_allocr != NULL) {
- cur_allocr = node_allocr;
+ int tensor_backend_id = tensor_backend_id(node);
+ if (tensor_backend_id != -1) {
+ cur_backend_id = tensor_backend_id;
} else {
- node_allocr(node) = cur_allocr;
+ tensor_backend_id(node) = cur_backend_id;
SET_CAUSE(node, "2.3");
}
}
// pass 2.4 expand rest down
{
- ggml_tallocr_t cur_allocr = NULL;
+ int cur_backend_id = -1;
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
if (ggml_is_view_op(node->op)) {
continue;
}
- ggml_tallocr_t node_allocr = node_allocr(node);
- if (node_allocr != NULL) {
- cur_allocr = node_allocr;
+ int tensor_backend_id = tensor_backend_id(node);
+ if (tensor_backend_id != -1) {
+ cur_backend_id = tensor_backend_id;
} else {
- node_allocr(node) = cur_allocr;
+ tensor_backend_id(node) = cur_backend_id;
SET_CAUSE(node, "2.4");
}
}
// pass 3: assign backends to remaining src from dst and view_src
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
- ggml_tallocr_t cur_allocr = node_allocr(node);
- if (node->view_src != NULL && cur_allocr == NULL) {
- cur_allocr = node_allocr(node) = node_allocr(node->view_src);
+ int cur_backend_id = tensor_backend_id(node);
+ if (node->view_src != NULL && cur_backend_id == -1) {
+ cur_backend_id = tensor_backend_id(node) = tensor_backend_id(node->view_src);
SET_CAUSE(node, "3.vsrc");
}
for (int j = 0; j < GGML_MAX_SRC; j++) {
if (src == NULL) {
break;
}
- ggml_tallocr_t src_allocr = node_allocr(src);
- if (src_allocr == NULL) {
+ int src_backend_id = tensor_backend_id(src);
+ if (src_backend_id == -1) {
if (src->view_src != NULL) {
// views are always on the same backend as the source
- node_allocr(src) = node_allocr(src->view_src);
+ tensor_backend_id(src) = tensor_backend_id(src->view_src);
SET_CAUSE(src, "3.vsrc");
} else {
- node_allocr(src) = cur_allocr;
+ tensor_backend_id(src) = cur_backend_id;
SET_CAUSE(src, "3.cur");
}
}
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
if (!ggml_is_view_op(node->op)) {
- sched->splits[0].tallocr = node_allocr(node);
+ sched->splits[0].backend_id = tensor_backend_id(node);
break;
}
}
sched->splits[0].i_start = 0;
sched->splits[0].n_inputs = 0;
memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
- ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
- size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+ int cur_backend_id = sched->splits[0].backend_id;
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
continue;
}
- ggml_tallocr_t node_allocr = node_allocr(node);
+ int tensor_backend_id = tensor_backend_id(node);
- GGML_ASSERT(node_allocr != NULL); // all nodes should be assigned by now
+ GGML_ASSERT(tensor_backend_id != -1); // all nodes should be assigned by now
- if (node_allocr != cur_allocr) {
+ if (tensor_backend_id != cur_backend_id) {
sched->splits[cur_split].i_end = i;
cur_split++;
GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
- sched->splits[cur_split].tallocr = node_allocr;
+ sched->splits[cur_split].backend_id = tensor_backend_id;
sched->splits[cur_split].i_start = i;
sched->splits[cur_split].n_inputs = 0;
- cur_allocr = node_allocr;
- cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+ cur_backend_id = tensor_backend_id;
}
// find inputs that are not on the same backend
if (src == NULL) {
break;
}
- ggml_tallocr_t src_allocr = node_allocr(src);
- GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
- if (src_allocr != node_allocr) {
+ int src_backend_id = tensor_backend_id(src);
+ assert(src_backend_id != -1); // all inputs should be assigned by now
+ if (src_backend_id != tensor_backend_id) {
// create a copy of the input in the split's backend
size_t id = hash_id(src);
- if (sched->node_copies[id][cur_backend_id] == NULL) {
- ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
+ if (sched->tensor_copies[id][cur_backend_id] == NULL) {
+ ggml_backend_t backend = sched->backends[cur_backend_id];
struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
- sched->node_copies[id][cur_backend_id] = tensor_copy;
- node_allocr(tensor_copy) = cur_allocr;
+ sched->tensor_copies[id][cur_backend_id] = tensor_copy;
+ tensor_backend_id(tensor_copy) = cur_backend_id;
SET_CAUSE(tensor_copy, "4.cpy");
int n_inputs = sched->splits[cur_split].n_inputs++;
GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
sched->splits[cur_split].inputs[n_inputs] = src;
}
- node->src[j] = sched->node_copies[id][cur_backend_id];
-
-#if 0
- // check if the input is already in the split
- bool found = false;
- for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
- if (sched->splits[cur_split].inputs[k] == src) {
- found = true;
- break;
- }
- }
-
- if (!found) {
- int n_inputs = sched->splits[cur_split].n_inputs++;
- //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
- GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
- sched->splits[cur_split].inputs[n_inputs] = src;
- }
-#endif
+ node->src[j] = sched->tensor_copies[id][cur_backend_id];
}
}
}
// sanity check: all sources should have the same backend as the node
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
- ggml_tallocr_t node_allocr = node_allocr(node);
- if (node_allocr == NULL) {
+ ggml_backend_t tensor_backend = tensor_backend(node);
+ if (tensor_backend == NULL) {
fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
}
- if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
+ if (node->view_src != NULL && tensor_backend != tensor_backend(node->view_src)) {
fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
- node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
- node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
+ node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
+ node->view_src->name, tensor_backend(node->view_src) ? ggml_backend_name(tensor_backend(node->view_src)) : "NULL");
}
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
break;
}
- ggml_tallocr_t src_allocr = node_allocr(src);
- if (src_allocr != node_allocr /* && src_backend != NULL */) { // ignore nulls for now
+ ggml_backend_t src_backend = tensor_backend(src);
+ if (src_backend != tensor_backend /* && src_backend != NULL */) {
fprintf(stderr, "!!!! %s has backend %s, src %d (%s) has backend %s\n",
- node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
- j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
+ node->name, tensor_backend ? ggml_backend_name(tensor_backend) : "NULL",
+ j, src->name, src_backend ? ggml_backend_name(src_backend) : "NULL");
}
- if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
+ if (src->view_src != NULL && src_backend != tensor_backend(src->view_src)) {
fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
- src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
- src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
+ src->name, src_backend ? ggml_backend_name(src_backend) : "NULL",
+ src->view_src->name, tensor_backend(src->view_src) ? ggml_backend_name(tensor_backend(src->view_src)) : "NULL");
}
}
}
struct ggml_backend_sched_split * split = &sched->splits[i];
split->graph = ggml_graph_view(graph, split->i_start, split->i_end);
- // add inputs to the graph copy so that they are allocated by ggml-alloc at the start of the split
for (int j = 0; j < split->n_inputs; j++) {
struct ggml_tensor * input = split->inputs[j];
- struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
+ struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split->backend_id];
+
// add a dependency to the input source so that it is not freed before the copy is done
- GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input);
- input_cpy->src[0] = input;
+ struct ggml_tensor * input_dep = ggml_view_tensor(sched->ctx, input);
+ sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(input);
+ graph_copy->nodes[graph_copy->n_nodes++] = input_dep;
+
+ // add a dependency to the input copy so that it is allocated at the start of the split
+ sched->node_backend_ids[graph_copy->n_nodes] = split->backend_id;
graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
}
for (int j = split->i_start; j < split->i_end; j++) {
+ sched->node_backend_ids[graph_copy->n_nodes] = tensor_backend_id(graph->nodes[j]);
graph_copy->nodes[graph_copy->n_nodes++] = graph->nodes[j];
}
}
sched->graph = graph_copy;
}
-static void sched_alloc_splits(ggml_backend_sched_t sched) {
- ggml_gallocr_alloc_graph_n(
- sched->galloc,
- sched->graph,
- sched->hash_set,
- sched->node_talloc);
+static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
+ // ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+ if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+#ifndef NDEBUG
+ fprintf(stderr, "ggml_backend_sched: failed to allocate graph, reserving\n");
+#endif
+ ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids);
+ if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+ fprintf(stderr, "ggml_backend_sched: failed to allocate graph\n");
+ return false;
+ }
+    }
+
+    return true;
}
-static void sched_compute_splits(ggml_backend_sched_t sched) {
+static bool ggml_backend_sched_compute_splits(ggml_backend_sched_t sched) {
uint64_t copy_us[GGML_MAX_BACKENDS] = {0};
uint64_t compute_us[GGML_MAX_BACKENDS] = {0};
for (int i = 0; i < sched->n_splits; i++) {
struct ggml_backend_sched_split * split = &splits[i];
- ggml_backend_t split_backend = get_allocr_backend(sched, split->tallocr);
- int split_backend_id = sched_backend_prio(sched, split_backend);
+ int split_backend_id = split->backend_id;
+ ggml_backend_t split_backend = sched->backends[split_backend_id];
// copy the input tensors to the split backend
uint64_t copy_start_us = ggml_time_us();
for (int j = 0; j < split->n_inputs; j++) {
struct ggml_tensor * input = split->inputs[j];
- struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
+ struct ggml_tensor * input_cpy = sched->tensor_copies[hash_id(input)][split_backend_id];
GGML_ASSERT(input->buffer != NULL);
GGML_ASSERT(input_cpy->buffer != NULL);
- // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
- // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
}
//ggml_backend_synchronize(split_backend); // necessary to measure copy time
uint64_t compute_start_us = ggml_time_us();
if (!sched->callback_eval) {
- ggml_backend_graph_compute(split_backend, &split->graph);
+ if (!ggml_backend_graph_compute(split_backend, &split->graph)) {
+ return false;
+ }
//ggml_backend_synchronize(split_backend); // necessary to measure compute time
} else {
// similar to ggml_backend_compare_graph_backend
struct ggml_cgraph gv = ggml_graph_view(&split->graph, j0, j1 + 1);
- ggml_backend_graph_compute(split_backend, &gv);
+ if (!ggml_backend_graph_compute(split_backend, &gv)) {
+ return false;
+ }
if (need && !sched->callback_eval(t, false, sched->callback_eval_user_data)) {
break;
}
}
#endif
-}
-
-static void sched_reset(ggml_backend_sched_t sched) {
- for (int i = 0; i < sched->n_backends; i++) {
- ggml_tallocr_reset(sched->tallocs[i]);
- }
- // reset state for the next run
- size_t hash_size = sched->hash_set.size;
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
- memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
- memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
- sched->is_reset = true;
+ return true;
}
ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
// initialize hash table
- sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
- sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
- sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1);
+ sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+ sched->tensor_backend_id = calloc(sizeof(sched->tensor_backend_id[0]), sched->hash_set.size);
+ sched->tensor_copies = calloc(sizeof(sched->tensor_copies[0]), sched->hash_set.size);
+ sched->node_backend_ids = calloc(sizeof(sched->node_backend_ids[0]), graph_size);
sched->n_backends = n_backends;
for (int i = 0; i < n_backends; i++) {
sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
}
- sched->galloc = ggml_gallocr_new();
+ sched->galloc = ggml_gallocr_new_n(sched->bufts, n_backends);
- // init measure allocs for each backend
- for (int i = 0; i < n_backends; i++) {
- sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
- }
-
- sched_reset(sched);
+ ggml_backend_sched_reset(sched);
return sched;
}
if (sched == NULL) {
return;
}
- for (int i = 0; i < sched->n_backends; i++) {
- ggml_tallocr_free(sched->tallocs[i]);
- }
ggml_gallocr_free(sched->galloc);
ggml_free(sched->ctx);
free(sched->hash_set.keys);
- free(sched->node_talloc);
- free(sched->node_copies);
+ free(sched->tensor_backend_id);
+ free(sched->tensor_copies);
+ free(sched->node_backend_ids);
free(sched);
}
-void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
- GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once
+void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
+ // reset state for the next run
+ size_t hash_size = sched->hash_set.size;
+ memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size); // NOLINT
+ memset(sched->tensor_backend_id, -1, sizeof(sched->tensor_backend_id[0]) * hash_size);
+ memset(sched->tensor_copies, 0, sizeof(sched->tensor_copies[0]) * hash_size);
- sched_split_graph(sched, measure_graph);
- sched_alloc_splits(sched);
+ sched->is_reset = true;
+}
- // allocate buffers and reset allocators
- for (int i = 0; i < sched->n_backends; i++) {
- size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
- ggml_tallocr_free(sched->tallocs[i]);
- sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size);
+bool ggml_backend_sched_reserve(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
+ ggml_backend_sched_split_graph(sched, measure_graph);
+
+ if (!ggml_gallocr_reserve_n(sched->galloc, sched->graph, sched->node_backend_ids)) {
+ return false;
}
- sched_reset(sched);
+ ggml_backend_sched_reset(sched);
+ return true;
}
-void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
+bool ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
if (!sched->is_reset) {
- sched_reset(sched);
+ ggml_backend_sched_reset(sched);
}
- sched_split_graph(sched, graph);
- sched_alloc_splits(sched);
- sched_compute_splits(sched);
-}
+ ggml_backend_sched_split_graph(sched, graph);
+ if (!ggml_backend_sched_alloc_splits(sched)) {
+ return false;
+ }
-void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
- sched_reset(sched);
-}
+ if (!ggml_backend_sched_compute_splits(sched)) {
+ return false;
+ }
+ return true;
+}
void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data) {
sched->callback_eval = callback;
return sched->n_splits;
}
-ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
- int backend_index = sched_backend_prio(sched, backend);
- GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
- return sched->tallocs[backend_index];
-}
-
-ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
- int backend_index = sched_backend_prio(sched, backend);
+size_t ggml_backend_sched_get_buffer_size(ggml_backend_sched_t sched, ggml_backend_t backend) {
+ int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
- return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
+ return ggml_gallocr_get_buffer_size(sched->galloc, backend_index);
}
void ggml_backend_sched_set_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node, ggml_backend_t backend) {
- int backend_index = sched_backend_prio(sched, backend);
+ int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
- node_allocr(node) = sched->tallocs[backend_index];
+ tensor_backend_id(node) = backend_index;
}
ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
- ggml_tallocr_t allocr = node_allocr(node);
- if (allocr == NULL) {
+ int backend_index = tensor_backend_id(node);
+ if (backend_index == -1) {
return NULL;
}
- return get_allocr_backend(sched, allocr);
+ return sched->backends[backend_index];
}
// utils
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
GGML_ASSERT(tensor->buffer == NULL);
- //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
GGML_ASSERT(tensor->view_src != NULL);
GGML_ASSERT(tensor->view_src->buffer != NULL);
GGML_ASSERT(tensor->view_src->data != NULL);
ggml_backend_buffer_init_tensor(buffer, tensor);
}
-static struct ggml_tensor * graph_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
+static struct ggml_tensor * graph_copy_dup_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies,
struct ggml_context * ctx_allocated, struct ggml_context * ctx_unallocated, struct ggml_tensor * src) {
GGML_ASSERT(src != NULL);
struct ggml_tensor * dst = ggml_dup_tensor_layout(src->data && !src->view_src ? ctx_allocated : ctx_unallocated, src);
if (src->view_src != NULL) {
- dst->view_src = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
+ dst->view_src = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, src->view_src);
dst->view_offs = src->view_offs;
}
dst->op = src->op;
if (s == NULL) {
break;
}
- dst->src[i] = graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
+ dst->src[i] = graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, s);
}
node_copies[id] = dst;
return dst;
}
-static void graph_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
+static void graph_copy_init_tensor(struct ggml_hash_set hash_set, struct ggml_tensor ** node_copies, bool * node_init, struct ggml_tensor * src) {
size_t id = ggml_hash_find(hash_set, src);
if (node_init[id]) {
return;
struct ggml_tensor * dst = node_copies[id];
if (dst->view_src != NULL) {
- graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
+ graph_copy_init_tensor(hash_set, node_copies, node_init, src->view_src);
ggml_backend_view_init(dst->view_src->buffer, dst);
}
else {
if (s == NULL) {
break;
}
- graph_init_tensor(hash_set, node_copies, node_init, s);
+ graph_copy_init_tensor(hash_set, node_copies, node_init, s);
}
}
struct ggml_backend_graph_copy ggml_backend_graph_copy(ggml_backend_t backend, struct ggml_cgraph * graph) {
struct ggml_hash_set hash_set = {
/* .size = */ graph->visited_hash_table.size,
- /* .keys = */ calloc(sizeof(hash_set.keys[0]) * graph->visited_hash_table.size, 1)
+ /* .keys = */ calloc(sizeof(hash_set.keys[0]), graph->visited_hash_table.size) // NOLINT
};
- struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]) * hash_set.size, 1);
- bool * node_init = calloc(sizeof(node_init[0]) * hash_set.size, 1);
+ struct ggml_tensor ** node_copies = calloc(sizeof(node_copies[0]), hash_set.size); // NOLINT
+ bool * node_init = calloc(sizeof(node_init[0]), hash_set.size);
struct ggml_init_params params = {
/* .mem_size = */ ggml_tensor_overhead()*hash_set.size + ggml_graph_overhead_custom(graph->size, false),
// dup nodes
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
- graph_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
+ graph_copy_dup_tensor(hash_set, node_copies, ctx_allocated, ctx_unallocated, node);
}
// allocate nodes
// copy data and init views
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
- graph_init_tensor(hash_set, node_copies, node_init, node);
+ graph_copy_init_tensor(hash_set, node_copies, node_init, node);
}
// build graph copy
/*.nb =*/ { 0, 0, 0, 0 },
/*.op =*/ GGML_OP_NONE,
/*.op_params =*/ { 0 },
- /*.is_param =*/ false,
+ /*.flags =*/ 0,
/*.grad =*/ NULL,
/*.src =*/ { NULL },
/*.perf_runs =*/ 0,
void ggml_set_param(
struct ggml_context * ctx,
struct ggml_tensor * tensor) {
- tensor->is_param = true;
+ tensor->flags |= GGML_TENSOR_FLAG_PARAM;
GGML_ASSERT(tensor->grad == NULL);
tensor->grad = ggml_dup_tensor(ctx, tensor);
return NULL;
}
- if (node->is_param) {
+ if (node->flags & GGML_TENSOR_FLAG_PARAM) {
return node;
}
clone->op = node->op;
clone->grad = node->grad;
- clone->is_param = node->is_param;
+ clone->flags = node->flags;
clone->extra = node->extra;
for (int k = 0; k < GGML_MAX_DIMS; ++k) {
clone->nb[k] = node->nb[k];
for (int i = 0; i < gf->n_nodes; i++) {
struct ggml_tensor * node = gf->nodes[i];
- if (node->is_param) {
+ if (node->flags & GGML_TENSOR_FLAG_PARAM) {
GGML_PRINT_DEBUG("%s: found root node %p\n", __func__, (void *) node);
ggml_build_forward_expand(gb, node->grad);
}
GGML_PRINT(" - %3d: [ %5" PRId64 ", %5" PRId64 ", %5" PRId64 "] %16s %s (%3d) cpu = %7.3f / %7.3f ms, wall = %7.3f / %7.3f ms\n",
i,
node->ne[0], node->ne[1], node->ne[2],
- ggml_op_name(node->op), node->is_param ? "x" : node->grad ? "g" : " ", node->perf_runs,
+ ggml_op_name(node->op), (node->flags & GGML_TENSOR_FLAG_PARAM) ? "x" : node->grad ? "g" : " ", node->perf_runs,
(double) node->perf_cycles / (double) ggml_cycles_per_ms(),
(double) node->perf_cycles / (double) ggml_cycles_per_ms() / (double) node->perf_runs,
(double) node->perf_time_us / 1000.0,
continue;
}
- if (node->is_param) {
+ if (node->flags & GGML_TENSOR_FLAG_PARAM) {
snprintf(color, sizeof(color), "yellow");
} else if (node->grad) {
if (ggml_graph_find(gf, node)) {
int np = 0;
int64_t nx = 0;
for (int i = 0; i < gf->n_nodes; ++i) {
- if (gf->nodes[i]->is_param) {
+ if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
GGML_ASSERT(np < GGML_MAX_PARAMS);
int np = 0;
int nx = 0;
for (int i = 0; i < gf->n_nodes; ++i) {
- if (gf->nodes[i]->is_param) {
+ if (gf->nodes[i]->flags & GGML_TENSOR_FLAG_PARAM) {
GGML_PRINT_DEBUG("found param %d: grad->op = %d\n", np, gf->nodes[i]->grad->op);
GGML_ASSERT(np < GGML_MAX_PARAMS);
////////////////////////////////////////////////////////////////////////////////
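+// note: ggml-alloc reads these flags when assigning memory; in particular the OUTPUT
+// flag keeps a tensor's memory from being reused for other tensors, so that graph
+// outputs can still be read after the graph has been computed
+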
+void ggml_set_input(struct ggml_tensor * tensor) {
+ tensor->flags |= GGML_TENSOR_FLAG_INPUT;
+}
+
+void ggml_set_output(struct ggml_tensor * tensor) {
+ tensor->flags |= GGML_TENSOR_FLAG_OUTPUT;
+}
+
+////////////////////////////////////////////////////////////////////////////////
+
void ggml_quantize_init(enum ggml_type type) {
ggml_critical_section_start();
GGML_ASSERT(ggml_backend_buffer_get_alloc_size(buffer, tensor) >= n * sizeof(float));
- ggml_tallocr_t allocr = ggml_tallocr_new_from_buffer(buffer);
+ ggml_tallocr_t allocr = ggml_tallocr_new(buffer);
ggml_tallocr_alloc(allocr, tensor);
GGML_ASSERT(tensor->data != NULL);
model.b = ggml_new_tensor_3d(model.ctx, GGML_TYPE_F32, IL, IC, N);
    // create an allocator
- ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer);
+ ggml_tallocr_t alloc = ggml_tallocr_new(model.buffer);
// alloc memory
- ggml_allocr_alloc(alloc, model.a);
+ ggml_tallocr_alloc(alloc, model.a);
// load data to buffer
if(ggml_backend_is_cpu(model.backend)) {
}
// alloc memory
- ggml_allocr_alloc(alloc, model.b);
+ ggml_tallocr_alloc(alloc, model.b);
if(ggml_backend_is_cpu(model.backend)
#ifdef GGML_USE_METAL
ggml_backend_tensor_set(model.b, bdata, 0, ggml_nbytes(model.b));
}
- ggml_allocr_free(alloc);
+ ggml_tallocr_free(alloc);
}
-struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * allocr) {
+struct ggml_cgraph * build_graph(const test_model& model) {
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params0 = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf.data(),
- /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
+ /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
};
    // create a temporary context to build the graph
return gf;
}
-struct ggml_cgraph* compute_graph(const test_model & model, struct ggml_allocr * allocr) {
- // reset the allocator to free all the memory allocated during the previous inference
- ggml_allocr_reset(allocr);
-
- struct ggml_cgraph * gf = build_graph(model, allocr);
+struct ggml_cgraph* compute_graph(const test_model & model, ggml_gallocr_t allocr) {
+ struct ggml_cgraph * gf = build_graph(model);
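+
+    // note: ggml_gallocr_alloc_graph() resets the buffer internally, so the explicit
+    // reset that the old allocator API required is no longer needed here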
// allocate tensors
- ggml_allocr_alloc_graph(allocr, gf);
+ ggml_gallocr_alloc_graph(allocr, gf);
int n_threads = 1;
if (ggml_backend_is_cpu(model.backend)) {
test_model model;
load_model(model, true);
- ggml_backend_buffer_t buf_compute; // for compute
- struct ggml_allocr * allocr = NULL;
+ ggml_gallocr_t allocr = NULL;
{
- allocr = ggml_allocr_new_measure_from_backend(model.backend);
+ allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
//create the worst case graph for memory usage estimation
- struct ggml_cgraph * gf = build_graph(model, allocr);
- size_t mem_size = ggml_allocr_alloc_graph(allocr, gf);
- ggml_allocr_free(allocr);
+ struct ggml_cgraph * gf = build_graph(model);
// compute the required memory
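+    // reserving with the worst-case graph pre-allocates the compute buffer, so later
+    // calls to ggml_gallocr_alloc_graph() should not need to reallocate it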
- buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size);
- allocr = ggml_allocr_new_from_buffer(buf_compute);
+ ggml_gallocr_reserve(allocr, gf);
+ size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);
fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f);
}
ggml_free(model.ctx);
ggml_backend_buffer_free(model.buffer);
- ggml_backend_buffer_free(buf_compute);
ggml_backend_free(model.backend);
+ ggml_gallocr_free(allocr);
return 0;
}
model.b = ggml_new_tensor_4d(model.ctx, GGML_TYPE_F32, IW, IH, IC, N);
    // create an allocator
- ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer);
+ ggml_tallocr_t alloc = ggml_tallocr_new(model.buffer);
// alloc memory
- ggml_allocr_alloc(alloc, model.a);
+ ggml_tallocr_alloc(alloc, model.a);
// load data to buffer
if(ggml_backend_is_cpu(model.backend)) {
}
// alloc memory
- ggml_allocr_alloc(alloc, model.b);
+ ggml_tallocr_alloc(alloc, model.b);
if(ggml_backend_is_cpu(model.backend)
#ifdef GGML_USE_METAL
ggml_backend_tensor_set(model.b, bdata, 0, ggml_nbytes(model.b));
}
- ggml_allocr_free(alloc);
+ ggml_tallocr_free(alloc);
}
-struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * allocr) {
+struct ggml_cgraph * build_graph(const test_model& model) {
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params0 = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf.data(),
- /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
+ /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
};
    // create a temporary context to build the graph
return gf;
}
-struct ggml_cgraph * compute_graph(const test_model & model, struct ggml_allocr * allocr) {
- // reset the allocator to free all the memory allocated during the previous inference
- ggml_allocr_reset(allocr);
-
- struct ggml_cgraph * gf = build_graph(model, allocr);
+struct ggml_cgraph * compute_graph(const test_model & model, ggml_gallocr_t allocr) {
+ struct ggml_cgraph * gf = build_graph(model);
// allocate tensors
- ggml_allocr_alloc_graph(allocr, gf);
+ ggml_gallocr_alloc_graph(allocr, gf);
int n_threads = 1;
if (ggml_backend_is_cpu(model.backend)) {
test_model model;
load_model(model, true);
- ggml_backend_buffer_t buf_compute; // for compute
- struct ggml_allocr * allocr = NULL;
+ ggml_gallocr_t allocr = NULL;
{
- allocr = ggml_allocr_new_measure_from_backend(model.backend);
+ allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
//create the worst case graph for memory usage estimation
- struct ggml_cgraph * gf = build_graph(model, allocr);
- size_t mem_size = ggml_allocr_alloc_graph(allocr, gf);
- ggml_allocr_free(allocr);
+ struct ggml_cgraph * gf = build_graph(model);
// compute the required memory
- buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size);
- allocr = ggml_allocr_new_from_buffer(buf_compute);
+ ggml_gallocr_reserve(allocr, gf);
+ size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);
fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f);
}
ggml_free(model.ctx);
ggml_backend_buffer_free(model.buffer);
- ggml_backend_buffer_free(buf_compute);
ggml_backend_free(model.backend);
+ ggml_gallocr_free(allocr);
return 0;
}
#include <string>
#include <vector>
-static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
- (void) level;
- (void) user_data;
- fputs(text, stderr);
- fflush(stderr);
-}
-
struct test_model {
struct ggml_tensor * a;
struct ggml_tensor * b;
#ifdef GGML_USE_METAL
if (use_gpu) {
fprintf(stderr, "%s: using Metal backend\n", __func__);
- ggml_backend_metal_log_set_callback(ggml_log_callback_default, nullptr);
model.backend = ggml_backend_metal_init();
if (!model.backend) {
fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
printf("Matrix B: [%i, %i]\n", K, N);
// create an allocator
- ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer);
+ ggml_tallocr_t alloc = ggml_tallocr_new(model.buffer);
// alloc memory
- ggml_allocr_alloc(alloc, model.a);
+ ggml_tallocr_alloc(alloc, model.a);
// load data to buffer
if(ggml_backend_is_cpu(model.backend)
}
// alloc memory
- ggml_allocr_alloc(alloc, model.b);
+ ggml_tallocr_alloc(alloc, model.b);
if(ggml_backend_is_cpu(model.backend)
#ifdef GGML_USE_METAL
ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b)); // CUDA requires copying the data directly to the device
}
- ggml_allocr_free(alloc);
+ ggml_tallocr_free(alloc);
}
-struct ggml_cgraph * build_graph(const test_model& model, struct ggml_allocr * allocr) {
+struct ggml_cgraph * build_graph(const test_model& model) {
static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
static std::vector<uint8_t> buf(buf_size);
struct ggml_init_params params0 = {
/*.mem_size =*/ buf_size,
/*.mem_buffer =*/ buf.data(),
- /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_allocr_alloc_graph()
+ /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
};
// create a temporary context to build the graph
return gf;
}
-struct ggml_tensor* compute(const test_model & model, struct ggml_allocr * allocr) {
- // reset the allocator to free all the memory allocated during the previous inference
- ggml_allocr_reset(allocr);
-
- struct ggml_cgraph * gf = build_graph(model, allocr);
+struct ggml_tensor* compute(const test_model & model, ggml_gallocr_t allocr) {
+ struct ggml_cgraph * gf = build_graph(model);
// allocate tensors
- ggml_allocr_alloc_graph(allocr, gf);
+ ggml_gallocr_alloc_graph(allocr, gf);
int n_threads = 1;
if (ggml_backend_is_cpu(model.backend)) {
test_model model;
load_model(model, matrixA, matrixB, M, N, K, true);
- ggml_backend_buffer_t buf_compute; // for compute
- struct ggml_allocr * allocr = NULL;
+ ggml_gallocr_t allocr = NULL;
{
- allocr = ggml_allocr_new_measure_from_backend(model.backend);
+ allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
// create the worst-case graph for memory usage estimation
- struct ggml_cgraph * gf = build_graph(model, allocr);
- size_t mem_size = ggml_allocr_alloc_graph(allocr, gf);
- ggml_allocr_free(allocr);
+ struct ggml_cgraph * gf = build_graph(model);
// compute the required memory
- buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size);
- allocr = ggml_allocr_new_from_buffer(buf_compute);
- fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0);
+ ggml_gallocr_reserve(allocr, gf);
+ size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);
+ fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0f/1024.0f);
}
struct ggml_tensor * result = compute(model, allocr);
ggml_free(model.ctx);
ggml_backend_buffer_free(model.buffer);
- ggml_backend_buffer_free(buf_compute);
ggml_backend_free(model.backend);
+ ggml_gallocr_free(allocr);
return 0;
}