/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
};
- struct ggml_context * ctx0 = ggml_init(params);
+ struct ggml_context * ctx = ggml_init(params);
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph(ctx);
- struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ struct ggml_tensor * embd = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
// at this point, the tensor data is not allocated yet and cannot be set
// after the graph is allocated, we will look up the tensor by its name and set its data
ggml_set_name(embd, "embd");
// this is important to ensure that the input tensors are not overwritten before they are used
ggml_set_input(embd);
- struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ struct ggml_tensor * position = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
ggml_set_name(position, "position");
ggml_set_input(position);
// wte + wpe
struct ggml_tensor * inpL =
- ggml_add(ctx0,
- ggml_get_rows(ctx0, model.wte, embd),
- ggml_get_rows(ctx0, model.wpe, position));
+ ggml_add(ctx,
+ ggml_get_rows(ctx, model.wte, embd),
+ ggml_get_rows(ctx, model.wpe, position));
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur;
// norm
{
// [ 768, N]
- cur = ggml_norm(ctx0, inpL, hparams.eps);
+ cur = ggml_norm(ctx, inpL, hparams.eps);
// cur = ln_1_g*cur + ln_1_b
// [ 768, N]
- cur = ggml_add(ctx0,
- ggml_mul(ctx0,
- ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
+ cur = ggml_add(ctx,
+ ggml_mul(ctx,
+ ggml_repeat(ctx, model.layers[il].ln_1_g, cur),
cur),
- ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
+ ggml_repeat(ctx, model.layers[il].ln_1_b, cur));
}
// attn
// cur = attn_w*cur + attn_b
// [2304, N]
{
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_attn_attn_w,
cur);
- cur = ggml_add(ctx0,
- ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
+ cur = ggml_add(ctx,
+ ggml_repeat(ctx, model.layers[il].c_attn_attn_b, cur),
cur);
}
// self-attention
{
- struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
- struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
- struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
+ struct ggml_tensor * Qcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
+ struct ggml_tensor * Kcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
+ struct ggml_tensor * Vcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
// store key and value to memory
if (N >= 1) {
- struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
- struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
+ struct ggml_tensor * k = ggml_view_1d(ctx, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
+ struct ggml_tensor * v = ggml_view_1d(ctx, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx, Kcur, k));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx, Vcur, v));
}
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
// [64, N, 12]
struct ggml_tensor * Q =
- ggml_permute(ctx0,
- ggml_cpy(ctx0,
- Qcur,
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
+ ggml_permute(ctx,
+ ggml_cont_3d(ctx, Qcur, n_embd/n_head, n_head, N),
0, 2, 1, 3);
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
// [64, n_past + N, 12]
struct ggml_tensor * K =
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
+ ggml_permute(ctx,
+ ggml_reshape_3d(ctx,
+ ggml_view_1d(ctx, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
n_embd/n_head, n_head, n_past + N),
0, 2, 1, 3);
// K * Q
// [n_past + N, N, 12]
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx, K, Q);
// KQ_scaled = KQ / sqrt(n_embd/n_head)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_scaled =
- ggml_scale(ctx0,
+ ggml_scale(ctx,
KQ,
1.0f/sqrtf(float(n_embd)/n_head));
// KQ_masked = mask_past(KQ_scaled)
// [n_past + N, N, 12]
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx, KQ_scaled, n_past);
// KQ = soft_max(KQ_masked)
// [n_past + N, N, 12]
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx, KQ_masked);
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
// [n_past + N, 64, 12]
struct ggml_tensor * V_trans =
- ggml_cpy(ctx0,
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+ ggml_cont_3d(ctx,
+ ggml_permute(ctx,
+ ggml_reshape_3d(ctx,
+ ggml_view_1d(ctx, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
n_embd/n_head, n_head, n_past + N),
1, 2, 0, 3),
- ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
+ n_past + N, n_embd/n_head, n_head);
// KQV = transpose(V) * KQ_soft_max
// [64, N, 12]
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx, V_trans, KQ_soft_max);
// KQV_merged = KQV.permute(0, 2, 1, 3)
// [64, 12, N]
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx, KQV, 0, 2, 1, 3);
// cur = KQV_merged.contiguous().view(n_embd, N)
// [768, N]
- cur = ggml_cpy(ctx0,
- KQV_merged,
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+ cur = ggml_cont_2d(ctx, KQV_merged, n_embd, N);
}
// projection
// cur = proj_w*cur + proj_b
// [768, N]
{
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_attn_proj_w,
cur);
- cur = ggml_add(ctx0,
- ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
+ cur = ggml_add(ctx,
+ ggml_repeat(ctx, model.layers[il].c_attn_proj_b, cur),
cur);
}
// add the input
- cur = ggml_add(ctx0, cur, inpL);
+ cur = ggml_add(ctx, cur, inpL);
struct ggml_tensor * inpFF = cur;
{
// norm
{
- cur = ggml_norm(ctx0, inpFF, hparams.eps);
+ cur = ggml_norm(ctx, inpFF, hparams.eps);
// cur = ln_2_g*cur + ln_2_b
// [ 768, N]
- cur = ggml_add(ctx0,
- ggml_mul(ctx0,
- ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
+ cur = ggml_add(ctx,
+ ggml_mul(ctx,
+ ggml_repeat(ctx, model.layers[il].ln_2_g, cur),
cur),
- ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
+ ggml_repeat(ctx, model.layers[il].ln_2_b, cur));
}
// fully connected
//
// cur = fc_w*cur + fc_b
// [3072, N]
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_mlp_fc_w,
cur);
- cur = ggml_add(ctx0,
- ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
+ cur = ggml_add(ctx,
+ ggml_repeat(ctx, model.layers[il].c_mlp_fc_b, cur),
cur);
// GELU activation
// [3072, N]
- cur = ggml_gelu(ctx0, cur);
+ cur = ggml_gelu(ctx, cur);
// projection
// [ 768, 3072] - model.layers[il].c_mlp_proj_w
//
// cur = proj_w*cur + proj_b
// [768, N]
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_mlp_proj_w,
cur);
- cur = ggml_add(ctx0,
- ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
+ cur = ggml_add(ctx,
+ ggml_repeat(ctx, model.layers[il].c_mlp_proj_b, cur),
cur);
}
// input for next layer
- inpL = ggml_add(ctx0, cur, inpFF);
+ inpL = ggml_add(ctx, cur, inpFF);
}
// norm
{
// [ 768, N]
- inpL = ggml_norm(ctx0, inpL, hparams.eps);
+ inpL = ggml_norm(ctx, inpL, hparams.eps);
// inpL = ln_f_g*inpL + ln_f_b
// [ 768, N]
- inpL = ggml_add(ctx0,
- ggml_mul(ctx0,
- ggml_repeat(ctx0, model.ln_f_g, inpL),
+ inpL = ggml_add(ctx,
+ ggml_mul(ctx,
+ ggml_repeat(ctx, model.ln_f_g, inpL),
inpL),
- ggml_repeat(ctx0, model.ln_f_b, inpL));
+ ggml_repeat(ctx, model.ln_f_b, inpL));
}
// inpL = WTE * inpL
// [ 768, 50257] - model.lm_head
// [ 768, N] - inpL
- inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
+ inpL = ggml_mul_mat(ctx, model.lm_head, inpL);
ggml_set_name(inpL, "logits");
// setting a tensor as the output will ensure that it is not overwritten by subsequent operations
ggml_set_output(inpL);
ggml_build_forward_expand(gf, inpL);
- ggml_free(ctx0);
+ ggml_free(ctx);
return gf;
}
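// Hypothetical usage sketch (not from the original example): a minimal look at how a
// graph built with no_alloc contexts is typically consumed. Allocate it with a graph
// allocator, look up the named inputs, upload their data, then compute. `allocr` and
// `backend` are assumed to have been created elsewhere; `embd_inp` holds the input
// token ids. Requires ggml.h, ggml-alloc.h, ggml-backend.h and <vector>.
static void gpt2_graph_eval_sketch(ggml_gallocr_t allocr, ggml_backend_t backend,
        struct ggml_cgraph * gf, const std::vector<int32_t> & embd_inp, int n_past) {
    const int N = (int) embd_inp.size();

    // allocate the tensor data of the graph in the backend buffers
    ggml_gallocr_alloc_graph(allocr, gf);

    // the inputs were only named while building the graph; now that they have memory,
    // find them by name and set the data
    struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");
    ggml_backend_tensor_set(embd, embd_inp.data(), 0, N*ggml_element_size(embd));

    struct ggml_tensor * position = ggml_graph_get_tensor(gf, "position");
    for (int i = 0; i < N; ++i) {
        const int32_t v = n_past + i;
        ggml_backend_tensor_set(position, &v, i*sizeof(v), sizeof(v));
    }

    ggml_backend_graph_compute(backend, gf);
}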
// [64, N, 12]
struct ggml_tensor * Q =
ggml_permute(ctx,
- ggml_cpy(ctx,
- Qcur,
- ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
+ ggml_cont_3d(ctx, Qcur, n_embd/n_head, n_head, N),
0, 2, 1, 3);
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
// [n_past + N, 64, 12]
struct ggml_tensor * V_trans =
- ggml_cpy(ctx,
+ ggml_cont_3d(ctx,
ggml_permute(ctx,
ggml_reshape_3d(ctx,
ggml_view_1d(ctx, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
n_embd/n_head, n_head, n_past + N),
1, 2, 0, 3),
- ggml_new_tensor_3d(ctx, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
+ n_past + N, n_embd/n_head, n_head);
// KQV = transpose(V) * KQ_soft_max
// [64, N, 12]
// cur = KQV_merged.contiguous().view(n_embd, N)
// [768, N]
- cur = ggml_cpy(ctx,
- KQV_merged,
- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, N));
+ cur = ggml_cont_2d(ctx, KQV_merged, n_embd, N);
}
// projection
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
};
- struct ggml_context * ctx0 = ggml_init(params);
+ struct ggml_context * ctx = ggml_init(params);
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GPT2_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, GPT2_MAX_NODES, false);
struct ggml_tensor * inpL;
if (batch.token) {
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
ggml_set_name(inp_tokens, "inp_tokens");
ggml_set_input(inp_tokens);
- struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ struct ggml_tensor * position = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
ggml_set_name(position, "position");
ggml_set_input(position);
// wte + wpe
inpL =
- ggml_add(ctx0,
- ggml_get_rows(ctx0, model.wte, inp_tokens),
- ggml_get_rows(ctx0, model.wpe, position));
+ ggml_add(ctx,
+ ggml_get_rows(ctx, model.wte, inp_tokens),
+ ggml_get_rows(ctx, model.wpe, position));
} else {
GGML_ASSERT(batch.embd);
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+ inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
ggml_set_name(inpL, "embd");
ggml_set_input(inpL);
}
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_kv, n_tokens, 1);
ggml_set_name(KQ_mask, "KQ_mask");
ggml_set_input(KQ_mask);
// norm
{
// [ 768, N]
- cur = ggml_norm(ctx0, inpL, hparams.eps);
+ cur = ggml_norm(ctx, inpL, hparams.eps);
// cur = ln_1_g*cur + ln_1_b
// [ 768, N]
- cur = ggml_add(ctx0,
- ggml_mul(ctx0,
+ cur = ggml_add(ctx,
+ ggml_mul(ctx,
cur,
model.layers[il].ln_1_g),
model.layers[il].ln_1_b);
// cur = attn_w*cur + attn_b
// [2304, n_tokens]
{
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_attn_attn_w,
cur);
- cur = ggml_add(ctx0,
+ cur = ggml_add(ctx,
cur,
model.layers[il].c_attn_attn_b);
}
// self-attention
{
- struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
- struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*n_embd);
- struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*n_embd);
+ struct ggml_tensor * Qcur = ggml_view_2d(ctx, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
+ struct ggml_tensor * Kcur = ggml_view_2d(ctx, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*n_embd);
+ struct ggml_tensor * Vcur = ggml_view_2d(ctx, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*n_embd);
// store key and value to memory
if (n_tokens >= 1) {
- struct ggml_tensor * k = ggml_view_1d(ctx0, model.kv_cache.k, n_tokens*n_embd, (ggml_element_size(model.kv_cache.k)*n_embd)*(il*n_ctx + kv_head));
- struct ggml_tensor * v = ggml_view_1d(ctx0, model.kv_cache.v, n_tokens*n_embd, (ggml_element_size(model.kv_cache.v)*n_embd)*(il*n_ctx + kv_head));
+ struct ggml_tensor * k = ggml_view_1d(ctx, model.kv_cache.k, n_tokens*n_embd, (ggml_element_size(model.kv_cache.k)*n_embd)*(il*n_ctx + kv_head));
+ struct ggml_tensor * v = ggml_view_1d(ctx, model.kv_cache.v, n_tokens*n_embd, (ggml_element_size(model.kv_cache.v)*n_embd)*(il*n_ctx + kv_head));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx, Kcur, k));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx, Vcur, v));
}
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
// [64, N, 12]
struct ggml_tensor * Q =
- ggml_permute(ctx0,
- ggml_cpy(ctx0,
+ ggml_permute(ctx,
+ ggml_cont_3d(ctx,
Qcur,
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, n_tokens)),
+ n_embd/n_head, n_head, n_tokens),
0, 2, 1, 3);
// K = Kmem.view(n_embd/n_head, n_head, n_kv).permute(0, 2, 1, 3)
// [64, n_kv, 12]
struct ggml_tensor * K =
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, model.kv_cache.k, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.k)*n_embd),
+ ggml_permute(ctx,
+ ggml_reshape_3d(ctx,
+ ggml_view_1d(ctx, model.kv_cache.k, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.k)*n_embd),
n_embd/n_head, n_head, n_kv),
0, 2, 1, 3);
// K * Q
// [n_kv, n_tokens, 12]
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx, K, Q);
// KQ_scaled = KQ / sqrt(n_embd/n_head)
// [n_kv, n_tokens, 12]
struct ggml_tensor * KQ_scaled =
- ggml_scale(ctx0,
+ ggml_scale(ctx,
KQ,
1.0f/sqrtf(float(n_embd)/n_head));
// KQ_masked = mask_past(KQ_scaled)
// [n_kv, n_tokens, 12]
- struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
+ struct ggml_tensor * KQ_masked = ggml_add(ctx, KQ_scaled, KQ_mask);
// KQ = soft_max(KQ_masked)
// [n_kv, N, 12]
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx, KQ_masked);
// V_trans = Vmem.view(n_embd/n_head, n_head, n_kv).permute(1, 2, 0, 3).contiguous()
// [n_kv, 64, 12]
struct ggml_tensor * V_trans =
- ggml_cpy(ctx0,
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, model.kv_cache.v, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.v)*n_embd),
+ ggml_cont_3d(ctx,
+ ggml_permute(ctx,
+ ggml_reshape_3d(ctx,
+ ggml_view_1d(ctx, model.kv_cache.v, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.v)*n_embd),
n_embd/n_head, n_head, n_kv),
1, 2, 0, 3),
- ggml_new_tensor_3d(ctx0, model.kv_cache.v->type, n_kv, n_embd/n_head, n_head));
+ n_kv, n_embd/n_head, n_head);
// KQV = transpose(V) * KQ_soft_max
// [64, n_tokens, 12]
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx, V_trans, KQ_soft_max);
// KQV_merged = KQV.permute(0, 2, 1, 3)
// [64, 12, n_tokens]
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx, KQV, 0, 2, 1, 3);
// cur = KQV_merged.contiguous().view(n_embd, N)
// [768, n_tokens]
- cur = ggml_cpy(ctx0,
- KQV_merged,
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens));
+ cur = ggml_cont_2d(ctx, KQV_merged, n_embd, n_tokens);
}
// projection
// cur = proj_w*cur + proj_b
// [768, N]
{
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_attn_proj_w,
cur);
- cur = ggml_add(ctx0,
+ cur = ggml_add(ctx,
cur,
model.layers[il].c_attn_proj_b);
}
// add the input
- cur = ggml_add(ctx0, cur, inpL);
+ cur = ggml_add(ctx, cur, inpL);
struct ggml_tensor * inpFF = cur;
{
// norm
{
- cur = ggml_norm(ctx0, inpFF, hparams.eps);
+ cur = ggml_norm(ctx, inpFF, hparams.eps);
// cur = ln_2_g*cur + ln_2_b
// [ 768, N]
- cur = ggml_add(ctx0,
- ggml_mul(ctx0,
+ cur = ggml_add(ctx,
+ ggml_mul(ctx,
cur,
model.layers[il].ln_2_g),
model.layers[il].ln_2_b);
//
// cur = fc_w*cur + fc_b
// [3072, N]
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_mlp_fc_w,
cur);
- cur = ggml_add(ctx0,
+ cur = ggml_add(ctx,
cur,
model.layers[il].c_mlp_fc_b);
// GELU activation
// [3072, N]
- cur = ggml_gelu(ctx0, cur);
+ cur = ggml_gelu(ctx, cur);
// projection
// [ 768, 3072] - model.layers[il].c_mlp_proj_w
//
// cur = proj_w*cur + proj_b
// [768, N]
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_mlp_proj_w,
cur);
- cur = ggml_add(ctx0,
+ cur = ggml_add(ctx,
cur,
model.layers[il].c_mlp_proj_b);
}
// input for next layer
- inpL = ggml_add(ctx0, cur, inpFF);
+ inpL = ggml_add(ctx, cur, inpFF);
}
// norm
{
// [ 768, N]
- inpL = ggml_norm(ctx0, inpL, hparams.eps);
+ inpL = ggml_norm(ctx, inpL, hparams.eps);
// inpL = ln_f_g*inpL + ln_f_b
// [ 768, N]
- inpL = ggml_add(ctx0,
- ggml_mul(ctx0,
+ inpL = ggml_add(ctx,
+ ggml_mul(ctx,
inpL,
model.ln_f_g),
model.ln_f_b);
// inpL = WTE * inpL
// [ 768, 50257] - model.lm_head
// [ 768, N] - inpL
- inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
+ inpL = ggml_mul_mat(ctx, model.lm_head, inpL);
// logits -> probs
//inpL = ggml_soft_max(ctx, inpL);
ggml_build_forward_expand(gf, inpL);
- ggml_free(ctx0);
+ ggml_free(ctx);
return gf;
}
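// Hypothetical sketch (not from the original example) of how the "KQ_mask" input of
// the graph above is typically filled before computing: since the mask is added to the
// scaled KQ matrix right before the soft_max, allowed positions get 0.0f and masked
// positions get -INFINITY. The causal rule below assumes the n_tokens new tokens are
// stored contiguously in the cache starting at kv_head; adapt it to the application.
// Requires <vector> and <cmath> in addition to the ggml headers.
static void gpt2_set_kq_mask_sketch(struct ggml_cgraph * gf, int n_kv, int n_tokens, int kv_head) {
    std::vector<float> mask(n_kv*n_tokens, -INFINITY);

    for (int i = 0; i < n_tokens; ++i) {
        // token i may attend to every cache position up to and including its own
        for (int j = 0; j <= kv_head + i && j < n_kv; ++j) {
            mask[i*n_kv + j] = 0.0f;
        }
    }

    struct ggml_tensor * KQ_mask = ggml_graph_get_tensor(gf, "KQ_mask");
    ggml_backend_tensor_set(KQ_mask, mask.data(), 0, ggml_nbytes(KQ_mask));
}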
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
};
- struct ggml_context * ctx0 = ggml_init(params);
+ struct ggml_context * ctx = ggml_init(params);
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GPT2_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, GPT2_MAX_NODES, false);
- struct ggml_tensor * embd = ggml_view_1d(ctx0, model.embd, N, 0);
+ struct ggml_tensor * embd = ggml_view_1d(ctx, model.embd, N, 0);
// set inputs
// TODO: move to gpt2_eval
ggml_backend_tensor_set(model.embd, embd_inp.data(), 0, N*ggml_element_size(embd));
- struct ggml_tensor * position = ggml_view_1d(ctx0, model.position, N, 0);
+ struct ggml_tensor * position = ggml_view_1d(ctx, model.position, N, 0);
for (int i = 0; i < N; ++i) {
int32_t v = n_past + i;
ggml_backend_tensor_set(model.position, &v, i*sizeof(int32_t), sizeof(v));
}
// wte + wpe
struct ggml_tensor * inpL =
- ggml_add(ctx0,
- ggml_get_rows(ctx0, model.wte, embd),
- ggml_get_rows(ctx0, model.wpe, position));
+ ggml_add(ctx,
+ ggml_get_rows(ctx, model.wte, embd),
+ ggml_get_rows(ctx, model.wpe, position));
ggml_set_name(inpL, "inpL");
ggml_set_name(inpL->src[0], "wte");
ggml_set_name(inpL->src[1], "wpe");
// norm
{
// [ 768, N]
- cur = ggml_norm(ctx0, inpL, hparams.eps);
+ cur = ggml_norm(ctx, inpL, hparams.eps);
ggml_format_name(cur, "l%d.norm", il);
// cur = ln_1_g*cur + ln_1_b
// [ 768, N]
- cur = ggml_add(ctx0,
- ggml_mul(ctx0,
+ cur = ggml_add(ctx,
+ ggml_mul(ctx,
cur,
model.layers[il].ln_1_g),
model.layers[il].ln_1_b);
// cur = attn_w*cur + attn_b
// [2304, N]
{
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_attn_attn_w,
cur);
ggml_format_name(cur, "l%d.attn_w", il);
- cur = ggml_add(ctx0,
+ cur = ggml_add(ctx,
cur,
model.layers[il].c_attn_attn_b);
ggml_format_name(cur, "l%d.attn_b", il);
// self-attention
{
- struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
- struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
- struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
+ struct ggml_tensor * Qcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
+ struct ggml_tensor * Kcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
+ struct ggml_tensor * Vcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
ggml_format_name(Qcur, "l%d.Qcur", il);
ggml_format_name(Kcur, "l%d.Kcur", il);
// store key and value to memory
if (N >= 1) {
- struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
- struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
+ struct ggml_tensor * k = ggml_view_1d(ctx, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
+ struct ggml_tensor * v = ggml_view_1d(ctx, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx, Kcur, k));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx, Vcur, v));
}
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
// [64, N, 12]
struct ggml_tensor * Q =
- ggml_permute(ctx0,
- ggml_cpy(ctx0,
- Qcur,
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
+ ggml_permute(ctx,
+ ggml_cont_3d(ctx, Qcur, n_embd/n_head, n_head, N),
0, 2, 1, 3);
ggml_format_name(Q, "l%d.Q", il);
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
// [64, n_past + N, 12]
struct ggml_tensor * K =
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
+ ggml_permute(ctx,
+ ggml_reshape_3d(ctx,
+ ggml_view_1d(ctx, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
n_embd/n_head, n_head, n_past + N),
0, 2, 1, 3);
ggml_format_name(K, "l%d.K", il);
// K * Q
// [n_past + N, N, 12]
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx, K, Q);
ggml_format_name(KQ, "l%d.KQ", il);
// KQ_scaled = KQ / sqrt(n_embd/n_head)
// [n_past + N, N, 12]
- struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx, KQ, KQ_scale);
ggml_format_name(KQ_scaled, "l%d.KQ_scaled", il);
// KQ_masked = mask_past(KQ_scaled)
// [n_past + N, N, 12]
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx, KQ_scaled, n_past);
ggml_format_name(KQ_masked, "l%d.KQ_masked", il);
// KQ = soft_max(KQ_masked)
// [n_past + N, N, 12]
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx, KQ_masked);
ggml_format_name(KQ_soft_max, "l%d.KQ_soft_max", il);
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
// [n_past + N, 64, 12]
struct ggml_tensor * V_trans =
- ggml_cpy(ctx0,
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+ ggml_cont_3d(ctx,
+ ggml_permute(ctx,
+ ggml_reshape_3d(ctx,
+ ggml_view_1d(ctx, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
n_embd/n_head, n_head, n_past + N),
1, 2, 0, 3),
- ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
+ n_past + N, n_embd/n_head, n_head);
ggml_format_name(V_trans, "l%d.V_trans", il);
// KQV = transpose(V) * KQ_soft_max
// [64, N, 12]
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx, V_trans, KQ_soft_max);
ggml_format_name(KQV, "l%d.KQV", il);
// KQV_merged = KQV.permute(0, 2, 1, 3)
// [64, 12, N]
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx, KQV, 0, 2, 1, 3);
ggml_format_name(KQV_merged, "l%d.KQV_merged", il);
// cur = KQV_merged.contiguous().view(n_embd, N)
// [768, N]
- cur = ggml_cpy(ctx0,
- KQV_merged,
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+ cur = ggml_cont_2d(ctx, KQV_merged, n_embd, N);
ggml_format_name(cur, "l%d.KQV_merged_contiguous", il);
}
// cur = proj_w*cur + proj_b
// [768, N]
{
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_attn_proj_w,
cur);
ggml_format_name(cur, "l%d.attn_proj_w", il);
- cur = ggml_add(ctx0,
+ cur = ggml_add(ctx,
cur,
model.layers[il].c_attn_proj_b);
ggml_format_name(cur, "l%d.attn_proj_b", il);
}
// add the input
- cur = ggml_add(ctx0, cur, inpL);
+ cur = ggml_add(ctx, cur, inpL);
ggml_format_name(cur, "l%d.add", il);
struct ggml_tensor * inpFF = cur;
{
// norm
{
- cur = ggml_norm(ctx0, inpFF, hparams.eps);
+ cur = ggml_norm(ctx, inpFF, hparams.eps);
ggml_format_name(cur, "l%d.FFnorm", il);
// cur = ln_2_g*cur + ln_2_b
// [ 768, N]
- cur = ggml_add(ctx0,
- ggml_mul(ctx0,
+ cur = ggml_add(ctx,
+ ggml_mul(ctx,
cur,
model.layers[il].ln_2_g),
model.layers[il].ln_2_b);
//
// cur = fc_w*cur + fc_b
// [3072, N]
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_mlp_fc_w,
cur);
ggml_format_name(cur, "l%d.mlp_fc_w", il);
- cur = ggml_add(ctx0,
+ cur = ggml_add(ctx,
cur,
model.layers[il].c_mlp_fc_b);
ggml_format_name(cur, "l%d.mlp_fc_b", il);
// GELU activation
// [3072, N]
- cur = ggml_gelu(ctx0, cur);
+ cur = ggml_gelu(ctx, cur);
ggml_format_name(cur, "l%d.gelu", il);
// projection
//
// cur = proj_w*cur + proj_b
// [768, N]
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_mlp_proj_w,
cur);
ggml_format_name(cur, "l%d.mlp_proj_w", il);
- cur = ggml_add(ctx0,
+ cur = ggml_add(ctx,
cur,
model.layers[il].c_mlp_proj_b);
ggml_format_name(cur, "l%d.mlp_proj_b", il);
}
// input for next layer
- inpL = ggml_add(ctx0, cur, inpFF);
+ inpL = ggml_add(ctx, cur, inpFF);
ggml_format_name(inpL, "l%d.add2", il);
}
// norm
{
// [ 768, N]
- inpL = ggml_norm(ctx0, inpL, hparams.eps);
+ inpL = ggml_norm(ctx, inpL, hparams.eps);
ggml_format_name(inpL, "out_norm");
// inpL = ln_f_g*inpL + ln_f_b
// [ 768, N]
- inpL = ggml_add(ctx0,
- ggml_mul(ctx0,
+ inpL = ggml_add(ctx,
+ ggml_mul(ctx,
inpL,
model.ln_f_g),
model.ln_f_b);
// inpL = WTE * inpL
// [ 768, 50257] - model.lm_head
// [ 768, N] - inpL
- inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
+ inpL = ggml_mul_mat(ctx, model.lm_head, inpL);
ggml_format_name(inpL, "out_lm_head");
// logits -> probs
ggml_build_forward_expand(gf, inpL);
- ggml_free(ctx0);
+ ggml_free(ctx);
return gf;
}
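// Hypothetical debugging sketch (not from the original example): the per-layer names
// set with ggml_format_name above make it possible to inspect intermediate results
// after the graph has been computed. The tensor may live in device memory, so the data
// is read back with ggml_backend_tensor_get. Assumes an F32 tensor; requires <vector>
// and <cstdio> in addition to the ggml headers.
static void gpt2_dump_tensor_sketch(struct ggml_cgraph * gf, const char * name) {
    struct ggml_tensor * t = ggml_graph_get_tensor(gf, name);
    if (t == NULL) {
        fprintf(stderr, "tensor '%s' not found in the graph\n", name);
        return;
    }

    std::vector<float> data(ggml_nelements(t));
    ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));

    fprintf(stderr, "%s: ne = [%d, %d, %d], first value = %f\n",
            name, (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], data[0]);
}

// e.g. after ggml_backend_graph_compute():
//   gpt2_dump_tensor_sketch(gf, "l0.KQ_soft_max");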
struct node_alloc * node_allocs; // [n_nodes]
int n_nodes;
+
+ struct tensor_alloc * leaf_allocs; // [n_leafs]
+ int n_leafs;
};
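// For reference, each tensor_alloc entry records where a tensor was placed during
// reserve so that ggml_gallocr_alloc_graph() can later assign the same address.
// A sketch of the fields, inferred from their uses further down in this diff (the
// actual definition lives earlier in ggml-alloc.c and may differ slightly):
//
//   struct tensor_alloc {
//       size_t offset;   // offset within the backend buffer, SIZE_MAX if not allocated by the gallocr
//       size_t size_max; // maximum size the tensor may need in that buffer
//   };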
ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
free(galloc->buffers);
free(galloc->buf_tallocs);
free(galloc->node_allocs);
+ free(galloc->leaf_allocs);
free(galloc);
}
memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
- // allocate all graph inputs first to avoid overwriting them
- for (int i = 0; i < graph->n_nodes; i++) {
- if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
- ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
- }
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- if (graph->nodes[i]->src[j] == NULL) {
- break;
- }
- if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
- ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
- }
- }
- }
-
// count number of children and views
+ // allocate all graph inputs and leafs first to avoid overwriting them
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
}
+ if (node->flags & GGML_TENSOR_FLAG_INPUT) {
+ ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
+ }
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
+ struct ggml_tensor * src = node->src[j];
+ if (src == NULL) {
break;
}
- ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
+
+ ggml_gallocr_hash_get(galloc, src)->n_children += 1;
+
+ // allocate explicit inputs and leafs
+ if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+ ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
+ }
}
- }
+ }
+
+ // allocate the remaining leafs that are unused on the graph
+ // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
+ for (int i = 0; i < graph->n_leafs; i++) {
+ struct ggml_tensor * leaf = graph->leafs[i];
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+
+ if (hn->n_children == 0) {
+ assert(!hn->allocated);
+ // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
+ ggml_gallocr_allocate_node(galloc, leaf, 0);
+ }
+ }
// allocate tensors
for (int i = 0; i < graph->n_nodes; i++) {
}
}
}
+ if (galloc->n_leafs < graph->n_leafs) {
+ free(galloc->leaf_allocs);
+ galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
+ GGML_ASSERT(galloc->leaf_allocs != NULL);
+ }
+ galloc->n_leafs = graph->n_leafs;
+ for (int i = 0; i < graph->n_leafs; i++) {
+ struct ggml_tensor * leaf = graph->leafs[i];
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+ galloc->leaf_allocs[i].offset = hn->offset;
+ galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+ }
// reallocate buffers if needed
for (int i = 0; i < galloc->n_buffers; i++) {
return ggml_gallocr_reserve_n(galloc, graph, NULL);
}
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
- assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
+ assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
if (node->view_src != NULL) {
if (node->buffer == NULL) {
// this tensor was allocated without ggml-backend
return;
}
- ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
+ ggml_backend_view_init(galloc->buffers[buffer_id], node);
}
} else {
if (node->data == NULL) {
assert(tensor_alloc->offset != SIZE_MAX);
- assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
- void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
+ assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+ void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
void * addr = (char *)base + tensor_alloc->offset;
- ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
+ ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
} else {
if (node->buffer == NULL) {
// this tensor was allocated without ggml-backend
return;
}
-
-#ifndef NDEBUG
- size_t offset =
- (char *)node->data -
- (char *)ggml_backend_buffer_get_base(node->buffer);
- size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
- assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
- assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
-#endif
}
}
}
return true;
}
+ if (galloc->n_leafs != graph->n_leafs) {
+#ifndef NDEBUG
+ fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+#endif
+ return true;
+ }
+
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
struct node_alloc * node_alloc = &galloc->node_allocs[i];
}
// allocate the graph tensors from the previous assignments
+ // nodes
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
struct node_alloc * node_alloc = &galloc->node_allocs[i];
if (src == NULL) {
break;
}
- ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
+ ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
}
- ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
+ ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+ }
+ // leafs
+ for (int i = 0; i < graph->n_leafs; i++) {
+ struct ggml_tensor * leaf = graph->leafs[i];
+ struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+ ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
}
return true;
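// Hypothetical usage sketch (not part of the diff): the typical lifecycle of a graph
// allocator with the leaf support added above. build_graph() stands in for the
// gpt2-style graph builders shown earlier; `backend` is assumed to exist.
//
//   ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
//
//   // reserve once with a worst-case graph so later calls do not need to grow the buffers
//   ggml_gallocr_reserve(galloc, build_graph(/*worst-case batch*/));
//
//   // per evaluation: build the actual graph and allocate it from the reserved buffers;
//   // with this change, unused graph leafs are allocated as well
//   struct ggml_cgraph * gf = build_graph(/*current batch*/);
//   ggml_gallocr_alloc_graph(galloc, gf);
//   ggml_backend_graph_compute(backend, gf);
//
//   ggml_gallocr_free(galloc);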