From: slaren
Date: Mon, 12 Feb 2024 17:07:14 +0000 (+0100)
Subject: ggml-alloc : allocate all leafs as if they were inputs (#731)
X-Git-Tag: upstream/0.0.1642~961
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=849215767b689727818888158332ae9172e719f9;p=pkg%2Fggml%2Fsources%2Fggml

ggml-alloc : allocate all leafs as if they were inputs (#731)

* ggml-alloc : allocate all leafs as if they were inputs

* ensure static leafs are allocated

* gpt-2-backend : remove unnecessary ggml_new_tensor

* update other gpt-2 examples to remove ggml_new_tensor calls in the graph
---

diff --git a/examples/gpt-2/main-alloc.cpp b/examples/gpt-2/main-alloc.cpp
index b0ddb52a..7a3197e6 100644
--- a/examples/gpt-2/main-alloc.cpp
+++ b/examples/gpt-2/main-alloc.cpp
@@ -407,11 +407,11 @@ struct ggml_cgraph * gpt2_graph(
         /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
     };

-    struct ggml_context * ctx0 = ggml_init(params);
+    struct ggml_context * ctx = ggml_init(params);

-    struct ggml_cgraph  * gf = ggml_new_graph(ctx0);
+    struct ggml_cgraph  * gf = ggml_new_graph(ctx);

-    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    struct ggml_tensor * embd = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
     // at this point, the tensor data is not allocated yet and cannot be set
     // we will find the tensor after the graph is allocated by its name, and set the data then
     ggml_set_name(embd, "embd");
@@ -419,15 +419,15 @@ struct ggml_cgraph * gpt2_graph(
     // this is important to ensure that the input tensors are not overwritten before they are used
     ggml_set_input(embd);

-    struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+    struct ggml_tensor * position = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
     ggml_set_name(position, "position");
     ggml_set_input(position);

     // wte + wpe
     struct ggml_tensor * inpL =
-        ggml_add(ctx0,
-                ggml_get_rows(ctx0, model.wte, embd),
-                ggml_get_rows(ctx0, model.wpe, position));
+        ggml_add(ctx,
+                ggml_get_rows(ctx, model.wte, embd),
+                ggml_get_rows(ctx, model.wpe, position));

     for (int il = 0; il < n_layer; ++il) {
         struct ggml_tensor * cur;
@@ -435,15 +435,15 @@ struct ggml_cgraph * gpt2_graph(
         // norm
         {
             // [ 768, N]
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
+            cur = ggml_norm(ctx, inpL, hparams.eps);

             // cur = ln_1_g*cur + ln_1_b
             // [ 768, N]
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
-                        ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
+            cur = ggml_add(ctx,
+                    ggml_mul(ctx,
+                        ggml_repeat(ctx, model.layers[il].ln_1_g, cur),
                         cur),
-                    ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
+                    ggml_repeat(ctx, model.layers[il].ln_1_b, cur));
         }

         // attn
@@ -455,45 +455,43 @@ struct ggml_cgraph * gpt2_graph(
         // cur = attn_w*cur + attn_b
         // [2304, N]
         {
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                     model.layers[il].c_attn_attn_w,
                     cur);

-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
+            cur = ggml_add(ctx,
+                    ggml_repeat(ctx, model.layers[il].c_attn_attn_b, cur),
                     cur);
         }

         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
+            struct ggml_tensor * Qcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * Kcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
+            struct ggml_tensor * Vcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);

             // store key and value to memory
             if (N >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * k = ggml_view_1d(ctx, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_1d(ctx, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));

-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx, Vcur, v));
             }

             // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
             // [64, N, 12]
             struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
+                ggml_permute(ctx,
+                        ggml_cont_3d(ctx, Qcur, n_embd/n_head, n_head, N),
                         0, 2, 1, 3);

             // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
             // [64, n_past + N, 12]
             struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
+                ggml_permute(ctx,
+                        ggml_reshape_3d(ctx,
+                            ggml_view_1d(ctx, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
                             n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);
@@ -511,47 +509,45 @@ struct ggml_cgraph * gpt2_graph(

             // K * Q
             // [n_past + N, N, 12]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx, K, Q);

             // KQ_scaled = KQ / sqrt(n_embd/n_head)
             // [n_past + N, N, 12]
             struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0,
+                ggml_scale(ctx,
                         KQ,
                         1.0f/sqrtf(float(n_embd)/n_head));

             // KQ_masked = mask_past(KQ_scaled)
             // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx, KQ_scaled, n_past);

             // KQ = soft_max(KQ_masked)
             // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx, KQ_masked);

             // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
             // [n_past + N, 64, 12]
             struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                ggml_cont_3d(ctx,
+                        ggml_permute(ctx,
+                            ggml_reshape_3d(ctx,
+                                ggml_view_1d(ctx, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
                                 n_embd/n_head, n_head, n_past + N),
                             1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
+                        n_past + N, n_embd/n_head, n_head);

             // KQV = transpose(V) * KQ_soft_max
             // [64, N, 12]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx, V_trans, KQ_soft_max);

             // KQV_merged = KQV.permute(0, 2, 1, 3)
             // [64, 12, N]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx, KQV, 0, 2, 1, 3);

             // cur = KQV_merged.contiguous().view(n_embd, N)
             // [768, N]
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            cur = ggml_cont_2d(ctx, KQV_merged, n_embd, N);
         }

         // projection
@@ -563,17 +559,17 @@ struct ggml_cgraph * gpt2_graph(
         // cur = proj_w*cur + proj_b
         // [768, N]
         {
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                     model.layers[il].c_attn_proj_w,
                     cur);

-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
+            cur = ggml_add(ctx,
+                    ggml_repeat(ctx, model.layers[il].c_attn_proj_b, cur),
                     cur);
         }

         // add the input
-        cur = ggml_add(ctx0, cur, inpL);
+        cur = ggml_add(ctx, cur, inpL);

         struct ggml_tensor * inpFF = cur;
@@ -581,15 +577,15 @@ struct ggml_cgraph * gpt2_graph(
        {
            // norm
            {
-                cur = ggml_norm(ctx0, inpFF, hparams.eps);
+                cur = ggml_norm(ctx, inpFF, hparams.eps);

                 // cur = ln_2_g*cur + ln_2_b
                 // [ 768, N]
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
-                            ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
+                cur = ggml_add(ctx,
+                        ggml_mul(ctx,
+                            ggml_repeat(ctx, model.layers[il].ln_2_g, cur),
                             cur),
-                        ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
+                        ggml_repeat(ctx, model.layers[il].ln_2_b, cur));
            }

            // fully connected
@@ -600,17 +596,17 @@ struct ggml_cgraph * gpt2_graph(
            //
            // cur = fc_w*cur + fc_b
            // [3072, N]
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                    model.layers[il].c_mlp_fc_w,
                    cur);

-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
+            cur = ggml_add(ctx,
+                    ggml_repeat(ctx, model.layers[il].c_mlp_fc_b, cur),
                    cur);

            // GELU activation
            // [3072, N]
-            cur = ggml_gelu(ctx0, cur);
+            cur = ggml_gelu(ctx, cur);

            // projection
            // [ 768, 3072] - model.layers[il].c_mlp_proj_w
@@ -620,37 +616,37 @@ struct ggml_cgraph * gpt2_graph(
            //
            // cur = proj_w*cur + proj_b
            // [768, N]
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                    model.layers[il].c_mlp_proj_w,
                    cur);

-            cur = ggml_add(ctx0,
-                    ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
+            cur = ggml_add(ctx,
+                    ggml_repeat(ctx, model.layers[il].c_mlp_proj_b, cur),
                    cur);
        }

        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpFF);
+        inpL = ggml_add(ctx, cur, inpFF);
    }

    // norm
    {
        // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
+        inpL = ggml_norm(ctx, inpL, hparams.eps);

        // inpL = ln_f_g*inpL + ln_f_b
        // [ 768, N]
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
-                    ggml_repeat(ctx0, model.ln_f_g, inpL),
+        inpL = ggml_add(ctx,
+                ggml_mul(ctx,
+                    ggml_repeat(ctx, model.ln_f_g, inpL),
                     inpL),
-                ggml_repeat(ctx0, model.ln_f_b, inpL));
+                ggml_repeat(ctx, model.ln_f_b, inpL));
    }

    // inpL = WTE * inpL
    // [ 768, 50257] - model.lm_head
    // [ 768, N] - inpL
-    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
+    inpL = ggml_mul_mat(ctx, model.lm_head, inpL);
    ggml_set_name(inpL, "logits");

    // setting a tensor as the output will ensure that it is not overwritten by subsequent operations
    ggml_set_output(inpL);
@@ -660,7 +656,7 @@ struct ggml_cgraph * gpt2_graph(

    ggml_build_forward_expand(gf, inpL);

-    ggml_free(ctx0);
+    ggml_free(ctx);

    return gf;
}
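Note on the recurring change above: the `ggml_cpy` into an explicitly created destination tensor is replaced by `ggml_cont_2d`/`ggml_cont_3d`, which create the contiguous destination internally. This matters for this commit because an explicit `ggml_new_tensor_*` result is a leaf with no op, and ggml-alloc now allocates all such leafs up front as if they were inputs, so keeping them in the graph would reserve memory for the whole lifetime of the graph. A minimal sketch of the equivalence, assuming a hypothetical non-contiguous tensor `t` whose type already matches the destination (as it does in the hunks above, where `Qcur` is F32 and the V cache is copied into its own type):

    // old form: copy t into an explicitly created contiguous tensor;
    // the destination is a leaf (op == GGML_OP_NONE) that ggml-alloc now treats like an input
    struct ggml_tensor * dst     = ggml_new_tensor_3d(ctx, t->type, ne0, ne1, ne2);
    struct ggml_tensor * old_way = ggml_cpy(ctx, t, dst);

    // new form: ggml_cont_3d creates the destination internally and reshapes to (ne0, ne1, ne2);
    // no separate ggml_new_tensor call appears in the graph
    struct ggml_tensor * new_way = ggml_cont_3d(ctx, t, ne0, ne1, ne2);

`ggml_cont_3d` keeps the type of its source, so the rewrite is behavior-preserving in these examples.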
diff --git a/examples/gpt-2/main-backend.cpp b/examples/gpt-2/main-backend.cpp
index cfa618f7..bd4cb80b 100644
--- a/examples/gpt-2/main-backend.cpp
+++ b/examples/gpt-2/main-backend.cpp
@@ -538,9 +538,7 @@ struct ggml_cgraph * gpt2_graph(
             // [64, N, 12]
             struct ggml_tensor * Q =
                 ggml_permute(ctx,
-                        ggml_cpy(ctx,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
+                        ggml_cont_3d(ctx, Qcur, n_embd/n_head, n_head, N),
                         0, 2, 1, 3);

             // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
@@ -586,13 +584,13 @@ struct ggml_cgraph * gpt2_graph(
             // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
             // [n_past + N, 64, 12]
             struct ggml_tensor * V_trans =
-                ggml_cpy(ctx,
+                ggml_cont_3d(ctx,
                         ggml_permute(ctx,
                             ggml_reshape_3d(ctx,
                                 ggml_view_1d(ctx, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
                                 n_embd/n_head, n_head, n_past + N),
                             1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
+                        n_past + N, n_embd/n_head, n_head);

             // KQV = transpose(V) * KQ_soft_max
             // [64, N, 12]
@@ -604,9 +602,7 @@ struct ggml_cgraph * gpt2_graph(

             // cur = KQV_merged.contiguous().view(n_embd, N)
             // [768, N]
-            cur = ggml_cpy(ctx,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, N));
+            cur = ggml_cont_2d(ctx, KQV_merged, n_embd, N);
         }

         // projection
diff --git a/examples/gpt-2/main-batched.cpp b/examples/gpt-2/main-batched.cpp
index 51094467..6ad1838b 100644
--- a/examples/gpt-2/main-batched.cpp
+++ b/examples/gpt-2/main-batched.cpp
@@ -562,35 +562,35 @@ struct ggml_cgraph * gpt2_graph(
         /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
     };

-    struct ggml_context * ctx0 = ggml_init(params);
+    struct ggml_context * ctx = ggml_init(params);

-    struct ggml_cgraph  * gf = ggml_new_graph_custom(ctx0, GPT2_MAX_NODES, false);
+    struct ggml_cgraph  * gf = ggml_new_graph_custom(ctx, GPT2_MAX_NODES, false);

     struct ggml_tensor * inpL;
     if (batch.token) {
-        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
         ggml_set_name(inp_tokens, "inp_tokens");
         ggml_set_input(inp_tokens);

-        struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+        struct ggml_tensor * position = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
         ggml_set_name(position, "position");
         ggml_set_input(position);

         // wte + wpe
         inpL =
-            ggml_add(ctx0,
-                    ggml_get_rows(ctx0, model.wte, inp_tokens),
-                    ggml_get_rows(ctx0, model.wpe, position));
+            ggml_add(ctx,
+                    ggml_get_rows(ctx, model.wte, inp_tokens),
+                    ggml_get_rows(ctx, model.wpe, position));
     } else {
         GGML_ASSERT(batch.embd);

-        inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+        inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
         ggml_set_name(inpL, "embd");
         ggml_set_input(inpL);
     }

     // KQ_mask (mask for 1 head, it will be broadcasted to all heads)
-    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+    struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_kv, n_tokens, 1);
     ggml_set_name(KQ_mask, "KQ_mask");
     ggml_set_input(KQ_mask);
@@ -601,12 +601,12 @@ struct ggml_cgraph * gpt2_graph(
         // norm
         {
             // [ 768, N]
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
+            cur = ggml_norm(ctx, inpL, hparams.eps);

             // cur = ln_1_g*cur + ln_1_b
             // [ 768, N]
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
+            cur = ggml_add(ctx,
+                    ggml_mul(ctx,
                         cur,
                         model.layers[il].ln_1_g),
                     model.layers[il].ln_1_b);
@@ -621,45 +621,45 @@ struct ggml_cgraph * gpt2_graph(
         // cur = attn_w*cur + attn_b
         // [2304, n_tokens]
         {
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                     model.layers[il].c_attn_attn_w,
                     cur);

-            cur = ggml_add(ctx0,
+            cur = ggml_add(ctx,
                     cur,
                     model.layers[il].c_attn_attn_b);
         }

         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*n_embd);
+            struct ggml_tensor * Qcur = ggml_view_2d(ctx, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * Kcur = ggml_view_2d(ctx, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*n_embd);
+            struct ggml_tensor * Vcur = ggml_view_2d(ctx, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*n_embd);

             // store key and value to memory
             if (n_tokens >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.kv_cache.k, n_tokens*n_embd, (ggml_element_size(model.kv_cache.k)*n_embd)*(il*n_ctx + kv_head));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, model.kv_cache.v, n_tokens*n_embd, (ggml_element_size(model.kv_cache.v)*n_embd)*(il*n_ctx + kv_head));
+                struct ggml_tensor * k = ggml_view_1d(ctx, model.kv_cache.k, n_tokens*n_embd, (ggml_element_size(model.kv_cache.k)*n_embd)*(il*n_ctx + kv_head));
+                struct ggml_tensor * v = ggml_view_1d(ctx, model.kv_cache.v, n_tokens*n_embd, (ggml_element_size(model.kv_cache.v)*n_embd)*(il*n_ctx + kv_head));

-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx, Vcur, v));
             }

             // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
             // [64, N, 12]
             struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
+                ggml_permute(ctx,
+                        ggml_cont_3d(ctx,
                             Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, n_tokens)),
+                            n_embd/n_head, n_head, n_tokens),
                         0, 2, 1, 3);

             // K = Kmem.view(n_embd/n_head, n_head, n_kv).permute(0, 2, 1, 3)
             // [64, n_kv, 12]
             struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.kv_cache.k, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.k)*n_embd),
+                ggml_permute(ctx,
+                        ggml_reshape_3d(ctx,
+                            ggml_view_1d(ctx, model.kv_cache.k, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.k)*n_embd),
                             n_embd/n_head, n_head, n_kv),
                         0, 2, 1, 3);
@@ -677,47 +677,45 @@ struct ggml_cgraph * gpt2_graph(

             // K * Q
             // [n_kv, n_tokens, 12]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx, K, Q);

             // KQ_scaled = KQ / sqrt(n_embd/n_head)
             // [n_kv, n_tokens, 12]
             struct ggml_tensor * KQ_scaled =
-                ggml_scale(ctx0,
+                ggml_scale(ctx,
                         KQ,
                         1.0f/sqrtf(float(n_embd)/n_head));

             // KQ_masked = mask_past(KQ_scaled)
             // [n_kv, n_tokens, 12]
-            struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
+            struct ggml_tensor * KQ_masked = ggml_add(ctx, KQ_scaled, KQ_mask);

             // KQ = soft_max(KQ_masked)
             // [n_kv, N, 12]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx, KQ_masked);

             // V_trans = Vmem.view(n_embd/n_head, n_head, n_kv).permute(1, 2, 0, 3).contiguous()
             // [n_kv, 64, 12]
             struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, model.kv_cache.v, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.v)*n_embd),
+                ggml_cont_3d(ctx,
+                        ggml_permute(ctx,
+                            ggml_reshape_3d(ctx,
+                                ggml_view_1d(ctx, model.kv_cache.v, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.v)*n_embd),
                                 n_embd/n_head, n_head, n_kv),
                             1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, model.kv_cache.v->type, n_kv, n_embd/n_head, n_head));
+                        n_kv, n_embd/n_head, n_head);

             // KQV = transpose(V) * KQ_soft_max
             // [64, n_tokens, 12]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx, V_trans, KQ_soft_max);

             // KQV_merged = KQV.permute(0, 2, 1, 3)
             // [64, 12, n_tokens]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx, KQV, 0, 2, 1, 3);

             // cur = KQV_merged.contiguous().view(n_embd, N)
             // [768, n_tokens]
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens));
+            cur = ggml_cont_2d(ctx, KQV_merged, n_embd, n_tokens);
         }

         // projection
@@ -729,17 +727,17 @@ struct ggml_cgraph * gpt2_graph(
         // cur = proj_w*cur + proj_b
         // [768, N]
         {
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                     model.layers[il].c_attn_proj_w,
                     cur);

-            cur = ggml_add(ctx0,
+            cur = ggml_add(ctx,
                     cur,
                     model.layers[il].c_attn_proj_b);
         }

         // add the input
-        cur = ggml_add(ctx0, cur, inpL);
+        cur = ggml_add(ctx, cur, inpL);

         struct ggml_tensor * inpFF = cur;
@@ -747,12 +745,12 @@ struct ggml_cgraph * gpt2_graph(
        {
            // norm
            {
-                cur = ggml_norm(ctx0, inpFF, hparams.eps);
+                cur = ggml_norm(ctx, inpFF, hparams.eps);

                 // cur = ln_2_g*cur + ln_2_b
                 // [ 768, N]
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
+                cur = ggml_add(ctx,
+                        ggml_mul(ctx,
                             cur,
                             model.layers[il].ln_2_g),
                         model.layers[il].ln_2_b);
@@ -766,17 +764,17 @@ struct ggml_cgraph * gpt2_graph(
            //
            // cur = fc_w*cur + fc_b
            // [3072, N]
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                    model.layers[il].c_mlp_fc_w,
                    cur);

-            cur = ggml_add(ctx0,
+            cur = ggml_add(ctx,
                    cur,
                    model.layers[il].c_mlp_fc_b);

            // GELU activation
            // [3072, N]
-            cur = ggml_gelu(ctx0, cur);
+            cur = ggml_gelu(ctx, cur);

            // projection
            // [ 768, 3072] - model.layers[il].c_mlp_proj_w
@@ -786,28 +784,28 @@ struct ggml_cgraph * gpt2_graph(
            //
            // cur = proj_w*cur + proj_b
            // [768, N]
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                    model.layers[il].c_mlp_proj_w,
                    cur);

-            cur = ggml_add(ctx0,
+            cur = ggml_add(ctx,
                    cur,
                    model.layers[il].c_mlp_proj_b);
        }

        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpFF);
+        inpL = ggml_add(ctx, cur, inpFF);
    }

    // norm
    {
        // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
+        inpL = ggml_norm(ctx, inpL, hparams.eps);

        // inpL = ln_f_g*inpL + ln_f_b
        // [ 768, N]
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
+        inpL = ggml_add(ctx,
+                ggml_mul(ctx,
                    inpL,
                    model.ln_f_g),
                model.ln_f_b);
@@ -816,14 +814,14 @@ struct ggml_cgraph * gpt2_graph(
    // inpL = WTE * inpL
    // [ 768, 50257] - model.lm_head
    // [ 768, N] - inpL
-    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
+    inpL = ggml_mul_mat(ctx, model.lm_head, inpL);

    // logits -> probs
    //inpL = ggml_soft_max(ctx0, inpL);

    ggml_build_forward_expand(gf, inpL);

-    ggml_free(ctx0);
+    ggml_free(ctx);

    return gf;
}
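Since the graphs above are built in a no_alloc context, tensors such as `inp_tokens`, `position`, and `KQ_mask` have no data at graph-build time; they are named with `ggml_set_name` and flagged with `ggml_set_input` so the allocator places them first and never reuses their memory for intermediate results. A sketch of the caller side, assuming the graph is allocated with a `ggml_gallocr` as in these examples (`tokens`, `n_tokens`, and `galloc` are hypothetical caller state):

    struct ggml_cgraph * gf = gpt2_graph(/* ... */);  // build the graph; no tensor data exists yet
    ggml_gallocr_alloc_graph(galloc, gf);             // assign addresses to all graph tensors

    // locate the input tensors by the names set during graph construction, then fill them
    struct ggml_tensor * inp_tokens = ggml_graph_get_tensor(gf, "inp_tokens");
    ggml_backend_tensor_set(inp_tokens, tokens, 0, n_tokens*ggml_element_size(inp_tokens));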
diff --git a/examples/gpt-2/main-sched.cpp b/examples/gpt-2/main-sched.cpp
index b5b8af61..e753d5fb 100644
--- a/examples/gpt-2/main-sched.cpp
+++ b/examples/gpt-2/main-sched.cpp
@@ -557,17 +557,17 @@ struct ggml_cgraph * gpt2_graph(
         /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
     };

-    struct ggml_context * ctx0 = ggml_init(params);
+    struct ggml_context * ctx = ggml_init(params);

-    struct ggml_cgraph  * gf = ggml_new_graph_custom(ctx0, GPT2_MAX_NODES, false);
+    struct ggml_cgraph  * gf = ggml_new_graph_custom(ctx, GPT2_MAX_NODES, false);

-    struct ggml_tensor * embd = ggml_view_1d(ctx0, model.embd, N, 0);
+    struct ggml_tensor * embd = ggml_view_1d(ctx, model.embd, N, 0);
     // set inputs
     // TODO: move to gpt2_eval
     ggml_backend_tensor_set(model.embd, embd_inp.data(), 0, N*ggml_element_size(embd));

-    struct ggml_tensor * position = ggml_view_1d(ctx0, model.position, N, 0);
+    struct ggml_tensor * position = ggml_view_1d(ctx, model.position, N, 0);
     for (int i = 0; i < N; ++i) {
         int32_t v = n_past + i;
         ggml_backend_tensor_set(model.position, &v, i*sizeof(int32_t), sizeof(v));
     }
@@ -577,9 +577,9 @@ struct ggml_cgraph * gpt2_graph(

     // wte + wpe
     struct ggml_tensor * inpL =
-        ggml_add(ctx0,
-                ggml_get_rows(ctx0, model.wte, embd),
-                ggml_get_rows(ctx0, model.wpe, position));
+        ggml_add(ctx,
+                ggml_get_rows(ctx, model.wte, embd),
+                ggml_get_rows(ctx, model.wpe, position));
     ggml_set_name(inpL, "inpL");
     ggml_set_name(inpL->src[0], "wte");
     ggml_set_name(inpL->src[1], "wpe");
@@ -590,13 +590,13 @@ struct ggml_cgraph * gpt2_graph(
         // norm
         {
             // [ 768, N]
-            cur = ggml_norm(ctx0, inpL, hparams.eps);
+            cur = ggml_norm(ctx, inpL, hparams.eps);
             ggml_format_name(cur, "l%d.norm", il);

             // cur = ln_1_g*cur + ln_1_b
             // [ 768, N]
-            cur = ggml_add(ctx0,
-                    ggml_mul(ctx0,
+            cur = ggml_add(ctx,
+                    ggml_mul(ctx,
                         cur,
                         model.layers[il].ln_1_g),
                     model.layers[il].ln_1_b);
@@ -613,12 +613,12 @@ struct ggml_cgraph * gpt2_graph(
         // cur = attn_w*cur + attn_b
         // [2304, N]
         {
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                     model.layers[il].c_attn_attn_w,
                     cur);
             ggml_format_name(cur, "l%d.attn_w", il);

-            cur = ggml_add(ctx0,
+            cur = ggml_add(ctx,
                     cur,
                     model.layers[il].c_attn_attn_b);
             ggml_format_name(cur, "l%d.attn_b", il);
@@ -626,9 +626,9 @@ struct ggml_cgraph * gpt2_graph(

         // self-attention
         {
-            struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
-            struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
-            struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
+            struct ggml_tensor * Qcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
+            struct ggml_tensor * Kcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
+            struct ggml_tensor * Vcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);

             ggml_format_name(Qcur, "l%d.Qcur", il);
             ggml_format_name(Kcur, "l%d.Kcur", il);
@@ -636,29 +636,27 @@ struct ggml_cgraph * gpt2_graph(

             // store key and value to memory
             if (N >= 1) {
-                struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
-                struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * k = ggml_view_1d(ctx, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
+                struct ggml_tensor * v = ggml_view_1d(ctx, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));

-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
-                ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx, Kcur, k));
+                ggml_build_forward_expand(gf, ggml_cpy(ctx, Vcur, v));
             }

             // Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
             // [64, N, 12]
             struct ggml_tensor * Q =
-                ggml_permute(ctx0,
-                        ggml_cpy(ctx0,
-                            Qcur,
-                            ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
+                ggml_permute(ctx,
+                        ggml_cont_3d(ctx, Qcur, n_embd/n_head, n_head, N),
                         0, 2, 1, 3);
             ggml_format_name(Q, "l%d.Q", il);

             // K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
             // [64, n_past + N, 12]
             struct ggml_tensor * K =
-                ggml_permute(ctx0,
-                        ggml_reshape_3d(ctx0,
-                            ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
+                ggml_permute(ctx,
+                        ggml_reshape_3d(ctx,
+                            ggml_view_1d(ctx, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
                             n_embd/n_head, n_head, n_past + N),
                         0, 2, 1, 3);
             ggml_format_name(K, "l%d.K", il);
@@ -677,51 +675,48 @@ struct ggml_cgraph * gpt2_graph(

             // K * Q
             // [n_past + N, N, 12]
-            struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+            struct ggml_tensor * KQ = ggml_mul_mat(ctx, K, Q);
             ggml_format_name(KQ, "l%d.KQ", il);

             // KQ_scaled = KQ / sqrt(n_embd/n_head)
             // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+            struct ggml_tensor * KQ_scaled = ggml_scale(ctx, KQ, KQ_scale);
             ggml_format_name(KQ_scaled, "l%d.KQ_scaled", il);

             // KQ_masked = mask_past(KQ_scaled)
             // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+            struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx, KQ_scaled, n_past);
             ggml_format_name(KQ_masked, "l%d.KQ_masked", il);

             // KQ = soft_max(KQ_masked)
             // [n_past + N, N, 12]
-            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+            struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx, KQ_masked);
             ggml_format_name(KQ_soft_max, "l%d.KQ_soft_max", il);

             // V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
             // [n_past + N, 64, 12]
             struct ggml_tensor * V_trans =
-                ggml_cpy(ctx0,
-                        ggml_permute(ctx0,
-                            ggml_reshape_3d(ctx0,
-                                ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+                ggml_cont_3d(ctx,
+                        ggml_permute(ctx,
+                            ggml_reshape_3d(ctx,
+                                ggml_view_1d(ctx, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
                                 n_embd/n_head, n_head, n_past + N),
                             1, 2, 0, 3),
-                        ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
-            ggml_format_name(V_trans, "l%d.V_trans", il);
+                        n_past + N, n_embd/n_head, n_head);

             // KQV = transpose(V) * KQ_soft_max
             // [64, N, 12]
-            struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+            struct ggml_tensor * KQV = ggml_mul_mat(ctx, V_trans, KQ_soft_max);
             ggml_format_name(KQV, "l%d.KQV", il);

             // KQV_merged = KQV.permute(0, 2, 1, 3)
             // [64, 12, N]
-            struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+            struct ggml_tensor * KQV_merged = ggml_permute(ctx, KQV, 0, 2, 1, 3);
             ggml_format_name(KQV_merged, "l%d.KQV_merged", il);

             // cur = KQV_merged.contiguous().view(n_embd, N)
             // [768, N]
-            cur = ggml_cpy(ctx0,
-                    KQV_merged,
-                    ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+            cur = ggml_cont_2d(ctx, KQV_merged, n_embd, N);
             ggml_format_name(cur, "l%d.KQV_merged_contiguous", il);
         }
@@ -734,19 +729,19 @@ struct ggml_cgraph * gpt2_graph(
         // cur = proj_w*cur + proj_b
         // [768, N]
         {
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                     model.layers[il].c_attn_proj_w,
                     cur);
             ggml_format_name(cur, "l%d.attn_proj_w", il);

-            cur = ggml_add(ctx0,
+            cur = ggml_add(ctx,
                     cur,
                     model.layers[il].c_attn_proj_b);
             ggml_format_name(cur, "l%d.attn_proj_b", il);
         }

         // add the input
-        cur = ggml_add(ctx0, cur, inpL);
+        cur = ggml_add(ctx, cur, inpL);
         ggml_format_name(cur, "l%d.add", il);

         struct ggml_tensor * inpFF = cur;
@@ -755,13 +750,13 @@ struct ggml_cgraph * gpt2_graph(
        {
            // norm
            {
-                cur = ggml_norm(ctx0, inpFF, hparams.eps);
+                cur = ggml_norm(ctx, inpFF, hparams.eps);
                 ggml_format_name(cur, "l%d.FFnorm", il);

                 // cur = ln_2_g*cur + ln_2_b
                 // [ 768, N]
-                cur = ggml_add(ctx0,
-                        ggml_mul(ctx0,
+                cur = ggml_add(ctx,
+                        ggml_mul(ctx,
                             cur,
                             model.layers[il].ln_2_g),
                         model.layers[il].ln_2_b);
@@ -777,19 +772,19 @@ struct ggml_cgraph * gpt2_graph(
            //
            // cur = fc_w*cur + fc_b
            // [3072, N]
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                    model.layers[il].c_mlp_fc_w,
                    cur);
            ggml_format_name(cur, "l%d.mlp_fc_w", il);

-            cur = ggml_add(ctx0,
+            cur = ggml_add(ctx,
                    cur,
                    model.layers[il].c_mlp_fc_b);
            ggml_format_name(cur, "l%d.mlp_fc_b", il);

            // GELU activation
            // [3072, N]
-            cur = ggml_gelu(ctx0, cur);
+            cur = ggml_gelu(ctx, cur);
            ggml_format_name(cur, "l%d.gelu", il);

            // projection
@@ -800,32 +795,32 @@ struct ggml_cgraph * gpt2_graph(
            //
            // cur = proj_w*cur + proj_b
            // [768, N]
-            cur = ggml_mul_mat(ctx0,
+            cur = ggml_mul_mat(ctx,
                    model.layers[il].c_mlp_proj_w,
                    cur);
            ggml_format_name(cur, "l%d.mlp_proj_w", il);

-            cur = ggml_add(ctx0,
+            cur = ggml_add(ctx,
                    cur,
                    model.layers[il].c_mlp_proj_b);
            ggml_format_name(cur, "l%d.mlp_proj_b", il);
        }

        // input for next layer
-        inpL = ggml_add(ctx0, cur, inpFF);
+        inpL = ggml_add(ctx, cur, inpFF);
        ggml_format_name(inpL, "l%d.add2", il);
    }

    // norm
    {
        // [ 768, N]
-        inpL = ggml_norm(ctx0, inpL, hparams.eps);
+        inpL = ggml_norm(ctx, inpL, hparams.eps);
        ggml_format_name(inpL, "out_norm");

        // inpL = ln_f_g*inpL + ln_f_b
        // [ 768, N]
-        inpL = ggml_add(ctx0,
-                ggml_mul(ctx0,
+        inpL = ggml_add(ctx,
+                ggml_mul(ctx,
                    inpL,
                    model.ln_f_g),
                model.ln_f_b);
@@ -836,7 +831,7 @@ struct ggml_cgraph * gpt2_graph(
    // inpL = WTE * inpL
    // [ 768, 50257] - model.lm_head
    // [ 768, N] - inpL
-    inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
+    inpL = ggml_mul_mat(ctx, model.lm_head, inpL);
    ggml_format_name(inpL, "out_lm_head");

    // logits -> probs
@@ -844,7 +839,7 @@ struct ggml_cgraph * gpt2_graph(

    ggml_build_forward_expand(gf, inpL);

-    ggml_free(ctx0);
+    ggml_free(ctx);

    return gf;
}
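A detail shared by all four examples that is easy to miss: each of the K and V caches is a single 1D tensor laid out as n_layer consecutive blocks of n_ctx slots of n_embd elements, and the `ggml_view_1d` offsets above are byte offsets into that layout. Spelled out for the write path (the same expression as in the hunks, just unpacked; `model`, `il`, `N`, and `n_past` as in the surrounding code):

    // layer il, starting at slot n_past, writing N slots of n_embd elements each:
    // element offset = (il*n_ctx + n_past)*n_embd, scaled to bytes by the element size
    const size_t offs = ggml_element_size(model.memory_k)*n_embd*(il*n_ctx + n_past);
    struct ggml_tensor * k = ggml_view_1d(ctx, model.memory_k, N*n_embd, offs);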
diff --git a/src/ggml-alloc.c b/src/ggml-alloc.c
index c28c37c4..56d59669 100644
--- a/src/ggml-alloc.c
+++ b/src/ggml-alloc.c
@@ -377,6 +377,9 @@ struct ggml_gallocr {

     struct node_alloc * node_allocs; // [n_nodes]
     int n_nodes;
+
+    struct tensor_alloc * leaf_allocs; // [n_leafs]
+    int n_leafs;
 };

 ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
@@ -427,6 +430,7 @@ void ggml_gallocr_free(ggml_gallocr_t galloc) {
     free(galloc->buffers);
     free(galloc->buf_tallocs);
     free(galloc->node_allocs);
+    free(galloc->leaf_allocs);
     free(galloc);
 }

@@ -544,22 +548,8 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
     memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
     memset(galloc->hash_values,   0, galloc->hash_set.size * sizeof(struct hash_node));

-    // allocate all graph inputs first to avoid overwriting them
-    for (int i = 0; i < graph->n_nodes; i++) {
-        if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
-            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
-        }
-        for (int j = 0; j < GGML_MAX_SRC; j++) {
-            if (graph->nodes[i]->src[j] == NULL) {
-                break;
-            }
-            if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
-                ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
-            }
-        }
-    }
-
     // count number of children and views
+    // allocate all graph inputs and leafs first to avoid overwriting them
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
@@ -568,14 +558,37 @@ static void ggml_gallocr_alloc_graph_impl(ggml_gallocr_t galloc, struct ggml_cgr
             ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
         }

+        if (node->flags & GGML_TENSOR_FLAG_INPUT) {
+            ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
+        }
+
         for (int j = 0; j < GGML_MAX_SRC; j++) {
-            struct ggml_tensor * parent = node->src[j];
-            if (parent == NULL) {
+            struct ggml_tensor * src = node->src[j];
+            if (src == NULL) {
                 break;
             }
-            ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
+
+            ggml_gallocr_hash_get(galloc, src)->n_children += 1;
+
+            // allocate explicit inputs and leafs
+            if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+                ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
+            }
         }
-    }
+    }
+
+    // allocate the remaining leafs that are unused on the graph
+    // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+
+        if (hn->n_children == 0) {
+            assert(!hn->allocated);
+            // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
+            ggml_gallocr_allocate_node(galloc, leaf, 0);
+        }
+    }

     // allocate tensors
     for (int i = 0; i < graph->n_nodes; i++) {
@@ -696,6 +709,18 @@ bool ggml_gallocr_reserve_n(ggml_gallocr_t galloc, struct ggml_cgraph * graph, c
             }
         }
     }
+    if (galloc->n_leafs < graph->n_leafs) {
+        free(galloc->leaf_allocs);
+        galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
+        GGML_ASSERT(galloc->leaf_allocs != NULL);
+    }
+    galloc->n_leafs = graph->n_leafs;
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+        galloc->leaf_allocs[i].offset = hn->offset;
+        galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+    }

     // reallocate buffers if needed
     for (int i = 0; i < galloc->n_buffers; i++) {
@@ -722,8 +747,8 @@ bool ggml_gallocr_reserve(ggml_gallocr_t galloc, struct ggml_cgraph *graph) {
     return ggml_gallocr_reserve_n(galloc, graph, NULL);
 }

-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
-    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
+    assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);

     if (node->view_src != NULL) {
         if (node->buffer == NULL) {
@@ -732,29 +757,20 @@ static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor *
                 // this tensor was allocated without ggml-backend
                 return;
             }
-            ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
+            ggml_backend_view_init(galloc->buffers[buffer_id], node);
         }
     } else {
         if (node->data == NULL) {
             assert(tensor_alloc->offset != SIZE_MAX);
-            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
-            void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
+            assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+            void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
             void * addr = (char *)base + tensor_alloc->offset;
-            ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
+            ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
         } else {
             if (node->buffer == NULL) {
                 // this tensor was allocated without ggml-backend
                 return;
             }
-
-#ifndef NDEBUG
-            size_t offset =
-                (char *)node->data -
-                (char *)ggml_backend_buffer_get_base(node->buffer);
-            size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
-            assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
-            assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
-#endif
         }
     }
 }
@@ -773,6 +789,13 @@ static bool ggml_gallocr_needs_realloc(ggml_gallocr_t galloc, struct ggml_cgraph
         return true;
     }

+    if (galloc->n_leafs != graph->n_leafs) {
+#ifndef NDEBUG
+        fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+#endif
+        return true;
+    }
+
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
@@ -827,6 +850,7 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
     }

     // allocate the graph tensors from the previous assignments
+    // nodes
     for (int i = 0; i < graph->n_nodes; i++) {
         struct ggml_tensor * node = graph->nodes[i];
         struct node_alloc * node_alloc = &galloc->node_allocs[i];
@@ -835,9 +859,15 @@ bool ggml_gallocr_alloc_graph(ggml_gallocr_t galloc, struct ggml_cgraph * graph)
             if (src == NULL) {
                 break;
             }
-            ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
+            ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
         }
-        ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
+        ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+    }
+    // leafs
+    for (int i = 0; i < graph->n_leafs; i++) {
+        struct ggml_tensor * leaf = graph->leafs[i];
+        struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+        ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
     }

     return true;
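The net effect of the ggml-alloc changes: any leaf (a tensor with op == GGML_OP_NONE) reachable from a node is now reserved up front together with the flagged inputs, and leafs that appear in graph->leafs without any consumer are still given an address in the first buffer, so applications can allocate static tensors through the graph allocator. A minimal end-to-end sketch of the allocation flow the updated examples rely on (a hypothetical standalone program, CPU backend buffer type assumed):

    #include "ggml.h"
    #include "ggml-alloc.h"
    #include "ggml-backend.h"

    int main(void) {
        struct ggml_init_params params = {
            /*.mem_size   =*/ ggml_tensor_overhead()*8 + ggml_graph_overhead(),
            /*.mem_buffer =*/ NULL,
            /*.no_alloc   =*/ true, // tensor data is allocated later by ggml_gallocr_alloc_graph()
        };
        struct ggml_context * ctx = ggml_init(params);
        struct ggml_cgraph  * gf  = ggml_new_graph(ctx);

        struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        struct ggml_tensor * b = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4);
        ggml_set_name(a, "a");
        ggml_set_name(b, "b");
        ggml_set_input(a); // inputs (and, after this commit, all leafs) are allocated first
        ggml_set_input(b); // so they are not overwritten by intermediate results

        struct ggml_tensor * out = ggml_add(ctx, a, b);
        ggml_set_output(out); // outputs are protected from being reused by later operations
        ggml_build_forward_expand(gf, out);

        ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_cpu_buffer_type());
        ggml_gallocr_alloc_graph(galloc, gf); // after this call, a and b have addresses

        const float a_data[4] = {1, 2, 3, 4};
        const float b_data[4] = {5, 6, 7, 8};
        ggml_backend_tensor_set(a, a_data, 0, sizeof(a_data));
        ggml_backend_tensor_set(b, b_data, 0, sizeof(b_data));

        // ... compute the graph with a backend and read back "out" ...

        ggml_gallocr_free(galloc);
        ggml_free(ctx);
        return 0;
    }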