static size_t buf_size = 256u*1024*1024;
static void * buf = malloc(buf_size);
+ // use 2 scratch buffers
+ // TODO: very hacky solution - reimplement in a more elegant way
+ static size_t scr0_size = 256u*1024*1024;
+ static void * scr0 = malloc(scr0_size);
+
+ static size_t scr1_size = 256u*1024*1024;
+ static void * scr1 = malloc(scr1_size);
+
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur;
+ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
+
// self-attention
{
{
}
}
+ ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
+
if (hparams.par_res == 0) {
struct ggml_tensor * inpFF = ggml_add(ctx0, cur, inpL);
}
}
+ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
+
// norm
{
inpL = ggml_norm(ctx0, inpL);
ggml_repeat(ctx0, model.ln_f_b, inpL));
}
+ ggml_set_scratch(ctx0, { 0, 0, nullptr, });
+
// lm_head
{
inpL = ggml_mul_mat(ctx0, model.lmh_g, inpL);
static size_t buf_size = 256u * 1024 * 1024;
static void * buf = malloc(buf_size);
+ // use 2 scratch buffers
+ // TODO: very hacky solution - reimplement in a more elegant way
+ static size_t scr0_size = 256u*1024*1024;
+ static void * scr0 = malloc(scr0_size);
+
+ static size_t scr1_size = 256u*1024*1024;
+ static void * scr1 = malloc(scr1_size);
+
if (mem_per_token > 0 && mem_per_token * N > buf_size) {
const size_t buf_size_new = 1.1 * (mem_per_token * N); // add 10% to account for ggml object overhead
// printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__,
struct ggml_tensor * cur;
+ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
+
// a = self.ln_1(x)
{
cur = ggml_norm(ctx0, inpL);
// attn_bias=attn_bias, attention_mask=attention_mask,
// is_causal=is_causal)
{
-
// compute QKV
cur = ggml_mul_mat(ctx0, model.layers[il].c_attn_wqkv_weight, cur);
inpL = ggml_add(ctx0, inpL, cur);
+ ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
+
// m = self.ln_2(x)
{
cur = ggml_norm(ctx0, inpL);
inpL = ggml_add(ctx0, inpL, cur);
}
+ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
+
// norm
{
inpL = ggml_norm(ctx0, inpL);
inpL = ggml_mul(ctx0, ggml_repeat(ctx0, model.norm_f_weight, inpL), inpL);
}
+ ggml_set_scratch(ctx0, { 0, 0, nullptr, });
+
// output embedding weight tied to input embedding
inpL = ggml_mul_mat(ctx0, model.wte_weight, inpL);
static size_t buf_size = 256u*1024*1024;
static void * buf = malloc(buf_size);
+ // use 2 scratch buffers
+ // TODO: very hacky solution - reimplement in a more elegant way
+ static size_t scr0_size = 256u*1024*1024;
+ static void * scr0 = malloc(scr0_size);
+
+ static size_t scr1_size = 256u*1024*1024;
+ static void * scr1 = malloc(scr1_size);
+
if (mem_per_token > 0 && mem_per_token*N > buf_size) {
const size_t buf_size_new = 1.1*(mem_per_token*N); // add 10% to account for ggml object overhead
//printf("\n%s: reallocating buffer from %zu to %zu bytes\n", __func__, buf_size, buf_size_new);
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur;
+ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
+
// norm
{
// [ 768, N]
ggml_reshape_3d(ctx0,
ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
n_embd/n_head, n_head, n_past + N),
- 0, 2, 1, 3); //TODO: need to be tiled
+ 0, 2, 1, 3); //TODO: need to be tiled
// GG: flash attention
//struct ggml_tensor * V =
struct ggml_tensor * inpFF = cur;
+ ggml_set_scratch(ctx0, { 0, scr1_size, scr1, });
+
// feed-forward network
{
// norm
inpL = ggml_add(ctx0, cur, inpFF);
}
+ ggml_set_scratch(ctx0, { 0, scr0_size, scr0, });
+
// norm
{
// [ 768, N]
ggml_repeat(ctx0, model.ln_f_b, inpL));
}
+ ggml_set_scratch(ctx0, { 0, 0, nullptr, });
+
// inpL = WTE * inpL
// [ 768, 50257] - model.lm_head
// [ 768, N] - inpL
if (mem_per_token == 0) {
mem_per_token = ggml_used_mem(ctx0)/N;
}
- //printf("used_mem = %zu\n", ggml_used_mem(ctx0));
+ //printf("used_mem = %zu MB\n", ggml_used_mem(ctx0)/(1024*1024));
ggml_free(ctx0);
};
} else {
if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
- GGML_PRINT("%s: not enough space in the scratch memory\n", __func__);
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+ __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
assert(false);
return NULL;
}