/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
};
- struct ggml_context * ctx0 = ggml_init(params);
+ struct ggml_context * ctx = ggml_init(params);
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph(ctx);
- struct ggml_tensor * embd = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ struct ggml_tensor * embd = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
// at this point, the tensor data is not allocated yet and cannot be set
// after the graph is allocated, we will look up the tensor by its name and set its data
ggml_set_name(embd, "embd");
// this is important to ensure that the input tensors are not overwritten before they are used
ggml_set_input(embd);
- struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
+ struct ggml_tensor * position = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, N);
ggml_set_name(position, "position");
ggml_set_input(position);
// wte + wpe
struct ggml_tensor * inpL =
- ggml_add(ctx0,
- ggml_get_rows(ctx0, model.wte, embd),
- ggml_get_rows(ctx0, model.wpe, position));
+ ggml_add(ctx,
+ ggml_get_rows(ctx, model.wte, embd),
+ ggml_get_rows(ctx, model.wpe, position));
for (int il = 0; il < n_layer; ++il) {
struct ggml_tensor * cur;
// norm
{
// [ 768, N]
- cur = ggml_norm(ctx0, inpL, hparams.eps);
+ cur = ggml_norm(ctx, inpL, hparams.eps);
// cur = ln_1_g*cur + ln_1_b
// [ 768, N]
- cur = ggml_add(ctx0,
- ggml_mul(ctx0,
- ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
+ cur = ggml_add(ctx,
+ ggml_mul(ctx,
+ ggml_repeat(ctx, model.layers[il].ln_1_g, cur),
cur),
- ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
+ ggml_repeat(ctx, model.layers[il].ln_1_b, cur));
}
// attn
// cur = attn_w*cur + attn_b
// [2304, N]
{
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_attn_attn_w,
cur);
- cur = ggml_add(ctx0,
- ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
+ cur = ggml_add(ctx,
+ ggml_repeat(ctx, model.layers[il].c_attn_attn_b, cur),
cur);
}
// self-attention
{
- struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
- struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
- struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
+ struct ggml_tensor * Qcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
+ struct ggml_tensor * Kcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
+ struct ggml_tensor * Vcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
// store key and value to memory
if (N >= 1) {
- struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
- struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
+ struct ggml_tensor * k = ggml_view_1d(ctx, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
+ struct ggml_tensor * v = ggml_view_1d(ctx, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx, Kcur, k));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx, Vcur, v));
}
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
// [64, N, 12]
struct ggml_tensor * Q =
- ggml_permute(ctx0,
- ggml_cpy(ctx0,
- Qcur,
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
+ ggml_permute(ctx,
+ ggml_cont_3d(ctx, Qcur, n_embd/n_head, n_head, N),
0, 2, 1, 3);
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
// [64, n_past + N, 12]
struct ggml_tensor * K =
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
+ ggml_permute(ctx,
+ ggml_reshape_3d(ctx,
+ ggml_view_1d(ctx, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
n_embd/n_head, n_head, n_past + N),
0, 2, 1, 3);
// K * Q
// [n_past + N, N, 12]
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx, K, Q);
// KQ_scaled = KQ / sqrt(n_embd/n_head)
// [n_past + N, N, 12]
struct ggml_tensor * KQ_scaled =
- ggml_scale(ctx0,
+ ggml_scale(ctx,
KQ,
1.0f/sqrtf(float(n_embd)/n_head));
// KQ_masked = mask_past(KQ_scaled)
// [n_past + N, N, 12]
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx, KQ_scaled, n_past);
// KQ = soft_max(KQ_masked)
// [n_past + N, N, 12]
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx, KQ_masked);
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
// [n_past + N, 64, 12]
struct ggml_tensor * V_trans =
- ggml_cpy(ctx0,
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+ ggml_cont_3d(ctx,
+ ggml_permute(ctx,
+ ggml_reshape_3d(ctx,
+ ggml_view_1d(ctx, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
n_embd/n_head, n_head, n_past + N),
1, 2, 0, 3),
- ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
+ n_past + N, n_embd/n_head, n_head);
// KQV = transpose(V) * KQ_soft_max
// [64, N, 12]
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx, V_trans, KQ_soft_max);
// KQV_merged = KQV.permute(0, 2, 1, 3)
// [64, 12, N]
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx, KQV, 0, 2, 1, 3);
// cur = KQV_merged.contiguous().view(n_embd, N)
// [768, N]
- cur = ggml_cpy(ctx0,
- KQV_merged,
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+ cur = ggml_cont_2d(ctx, KQV_merged, n_embd, N);
}
// projection
// cur = proj_w*cur + proj_b
// [768, N]
{
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_attn_proj_w,
cur);
- cur = ggml_add(ctx0,
- ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
+ cur = ggml_add(ctx,
+ ggml_repeat(ctx, model.layers[il].c_attn_proj_b, cur),
cur);
}
// add the input
- cur = ggml_add(ctx0, cur, inpL);
+ cur = ggml_add(ctx, cur, inpL);
struct ggml_tensor * inpFF = cur;
{
// norm
{
- cur = ggml_norm(ctx0, inpFF, hparams.eps);
+ cur = ggml_norm(ctx, inpFF, hparams.eps);
// cur = ln_2_g*cur + ln_2_b
// [ 768, N]
- cur = ggml_add(ctx0,
- ggml_mul(ctx0,
- ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
+ cur = ggml_add(ctx,
+ ggml_mul(ctx,
+ ggml_repeat(ctx, model.layers[il].ln_2_g, cur),
cur),
- ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
+ ggml_repeat(ctx, model.layers[il].ln_2_b, cur));
}
// fully connected
//
// cur = fc_w*cur + fc_b
// [3072, N]
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_mlp_fc_w,
cur);
- cur = ggml_add(ctx0,
- ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
+ cur = ggml_add(ctx,
+ ggml_repeat(ctx, model.layers[il].c_mlp_fc_b, cur),
cur);
// GELU activation
// [3072, N]
- cur = ggml_gelu(ctx0, cur);
+ cur = ggml_gelu(ctx, cur);
// projection
// [ 768, 3072] - model.layers[il].c_mlp_proj_w
//
// cur = proj_w*cur + proj_b
// [768, N]
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_mlp_proj_w,
cur);
- cur = ggml_add(ctx0,
- ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
+ cur = ggml_add(ctx,
+ ggml_repeat(ctx, model.layers[il].c_mlp_proj_b, cur),
cur);
}
// input for next layer
- inpL = ggml_add(ctx0, cur, inpFF);
+ inpL = ggml_add(ctx, cur, inpFF);
}
// norm
{
// [ 768, N]
- inpL = ggml_norm(ctx0, inpL, hparams.eps);
+ inpL = ggml_norm(ctx, inpL, hparams.eps);
// inpL = ln_f_g*inpL + ln_f_b
// [ 768, N]
- inpL = ggml_add(ctx0,
- ggml_mul(ctx0,
- ggml_repeat(ctx0, model.ln_f_g, inpL),
+ inpL = ggml_add(ctx,
+ ggml_mul(ctx,
+ ggml_repeat(ctx, model.ln_f_g, inpL),
inpL),
- ggml_repeat(ctx0, model.ln_f_b, inpL));
+ ggml_repeat(ctx, model.ln_f_b, inpL));
}
// inpL = WTE * inpL
// [ 768, 50257] - model.lm_head
// [ 768, N] - inpL
- inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
+ inpL = ggml_mul_mat(ctx, model.lm_head, inpL);
ggml_set_name(inpL, "logits");
// setting a tensor as the output will ensure that it is not overwritten by subsequent operations
ggml_set_output(inpL);
ggml_build_forward_expand(gf, inpL);
- ggml_free(ctx0);
+ ggml_free(ctx);
return gf;
}
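// Hypothetical usage sketch (not from the original example): a minimal look at how a
// graph built with no_alloc contexts is typically consumed. Allocate it with a graph
// allocator, look up the named inputs, upload their data, then compute. `allocr` and
// `backend` are assumed to have been created elsewhere; `embd_inp` holds the input
// token ids. Requires ggml.h, ggml-alloc.h, ggml-backend.h and <vector>.
static void gpt2_graph_eval_sketch(ggml_gallocr_t allocr, ggml_backend_t backend,
        struct ggml_cgraph * gf, const std::vector<int32_t> & embd_inp, int n_past) {
    const int N = (int) embd_inp.size();

    // allocate the tensor data of the graph in the backend buffers
    ggml_gallocr_alloc_graph(allocr, gf);

    // the inputs were only named while building the graph; now that they have memory,
    // find them by name and set the data
    struct ggml_tensor * embd = ggml_graph_get_tensor(gf, "embd");
    ggml_backend_tensor_set(embd, embd_inp.data(), 0, N*ggml_element_size(embd));

    struct ggml_tensor * position = ggml_graph_get_tensor(gf, "position");
    for (int i = 0; i < N; ++i) {
        const int32_t v = n_past + i;
        ggml_backend_tensor_set(position, &v, i*sizeof(v), sizeof(v));
    }

    ggml_backend_graph_compute(backend, gf);
}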
// [64, N, 12]
struct ggml_tensor * Q =
ggml_permute(ctx,
- ggml_cpy(ctx,
- Qcur,
- ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
+ ggml_cont_3d(ctx, Qcur, n_embd/n_head, n_head, N),
0, 2, 1, 3);
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
// [n_past + N, 64, 12]
struct ggml_tensor * V_trans =
- ggml_cpy(ctx,
+ ggml_cont_3d(ctx,
ggml_permute(ctx,
ggml_reshape_3d(ctx,
ggml_view_1d(ctx, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
n_embd/n_head, n_head, n_past + N),
1, 2, 0, 3),
- ggml_new_tensor_3d(ctx, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
+ n_past + N, n_embd/n_head, n_head);
// KQV = transpose(V) * KQ_soft_max
// [64, N, 12]
// cur = KQV_merged.contiguous().view(n_embd, N)
// [768, N]
- cur = ggml_cpy(ctx,
- KQV_merged,
- ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, N));
+ cur = ggml_cont_2d(ctx, KQV_merged, n_embd, N);
}
// projection
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
};
- struct ggml_context * ctx0 = ggml_init(params);
+ struct ggml_context * ctx = ggml_init(params);
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GPT2_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, GPT2_MAX_NODES, false);
struct ggml_tensor * inpL;
if (batch.token) {
- struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ struct ggml_tensor * inp_tokens = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
ggml_set_name(inp_tokens, "inp_tokens");
ggml_set_input(inp_tokens);
- struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, n_tokens);
+ struct ggml_tensor * position = ggml_new_tensor_1d(ctx, GGML_TYPE_I32, n_tokens);
ggml_set_name(position, "position");
ggml_set_input(position);
// wte + wpe
inpL =
- ggml_add(ctx0,
- ggml_get_rows(ctx0, model.wte, inp_tokens),
- ggml_get_rows(ctx0, model.wpe, position));
+ ggml_add(ctx,
+ ggml_get_rows(ctx, model.wte, inp_tokens),
+ ggml_get_rows(ctx, model.wpe, position));
} else {
GGML_ASSERT(batch.embd);
- inpL = ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens);
+ inpL = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, n_embd, n_tokens);
ggml_set_name(inpL, "embd");
ggml_set_input(inpL);
}
// KQ_mask (mask for 1 head, it will be broadcasted to all heads)
- struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_kv, n_tokens, 1);
+ struct ggml_tensor * KQ_mask = ggml_new_tensor_3d(ctx, GGML_TYPE_F32, n_kv, n_tokens, 1);
ggml_set_name(KQ_mask, "KQ_mask");
ggml_set_input(KQ_mask);
// norm
{
// [ 768, N]
- cur = ggml_norm(ctx0, inpL, hparams.eps);
+ cur = ggml_norm(ctx, inpL, hparams.eps);
// cur = ln_1_g*cur + ln_1_b
// [ 768, N]
- cur = ggml_add(ctx0,
- ggml_mul(ctx0,
+ cur = ggml_add(ctx,
+ ggml_mul(ctx,
cur,
model.layers[il].ln_1_g),
model.layers[il].ln_1_b);
// cur = attn_w*cur + attn_b
// [2304, n_tokens]
{
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_attn_attn_w,
cur);
- cur = ggml_add(ctx0,
+ cur = ggml_add(ctx,
cur,
model.layers[il].c_attn_attn_b);
}
// self-attention
{
- struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
- struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*n_embd);
- struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*n_embd);
+ struct ggml_tensor * Qcur = ggml_view_2d(ctx, cur, n_embd, n_tokens, cur->nb[1], 0*sizeof(float)*n_embd);
+ struct ggml_tensor * Kcur = ggml_view_2d(ctx, cur, n_embd, n_tokens, cur->nb[1], 1*sizeof(float)*n_embd);
+ struct ggml_tensor * Vcur = ggml_view_2d(ctx, cur, n_embd, n_tokens, cur->nb[1], 2*sizeof(float)*n_embd);
// store key and value to memory
if (n_tokens >= 1) {
- struct ggml_tensor * k = ggml_view_1d(ctx0, model.kv_cache.k, n_tokens*n_embd, (ggml_element_size(model.kv_cache.k)*n_embd)*(il*n_ctx + kv_head));
- struct ggml_tensor * v = ggml_view_1d(ctx0, model.kv_cache.v, n_tokens*n_embd, (ggml_element_size(model.kv_cache.v)*n_embd)*(il*n_ctx + kv_head));
+ struct ggml_tensor * k = ggml_view_1d(ctx, model.kv_cache.k, n_tokens*n_embd, (ggml_element_size(model.kv_cache.k)*n_embd)*(il*n_ctx + kv_head));
+ struct ggml_tensor * v = ggml_view_1d(ctx, model.kv_cache.v, n_tokens*n_embd, (ggml_element_size(model.kv_cache.v)*n_embd)*(il*n_ctx + kv_head));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx, Kcur, k));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx, Vcur, v));
}
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
// [64, N, 12]
struct ggml_tensor * Q =
- ggml_permute(ctx0,
- ggml_cpy(ctx0,
+ ggml_permute(ctx,
+ ggml_cont_3d(ctx,
Qcur,
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, n_tokens)),
+ n_embd/n_head, n_head, n_tokens),
0, 2, 1, 3);
// K = Kmem.view(n_embd/n_head, n_head, n_kv).permute(0, 2, 1, 3)
// [64, n_kv, 12]
struct ggml_tensor * K =
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, model.kv_cache.k, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.k)*n_embd),
+ ggml_permute(ctx,
+ ggml_reshape_3d(ctx,
+ ggml_view_1d(ctx, model.kv_cache.k, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.k)*n_embd),
n_embd/n_head, n_head, n_kv),
0, 2, 1, 3);
// K * Q
// [n_kv, n_tokens, 12]
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx, K, Q);
// KQ_scaled = KQ / sqrt(n_embd/n_head)
// [n_kv, n_tokens, 12]
struct ggml_tensor * KQ_scaled =
- ggml_scale(ctx0,
+ ggml_scale(ctx,
KQ,
1.0f/sqrtf(float(n_embd)/n_head));
// KQ_masked = mask_past(KQ_scaled)
// [n_kv, n_tokens, 12]
- struct ggml_tensor * KQ_masked = ggml_add(ctx0, KQ_scaled, KQ_mask);
+ struct ggml_tensor * KQ_masked = ggml_add(ctx, KQ_scaled, KQ_mask);
// KQ = soft_max(KQ_masked)
// [n_kv, N, 12]
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx, KQ_masked);
// V_trans = Vmem.view(n_embd/n_head, n_head, n_kv).permute(1, 2, 0, 3).contiguous()
// [n_kv, 64, 12]
struct ggml_tensor * V_trans =
- ggml_cpy(ctx0,
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, model.kv_cache.v, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.v)*n_embd),
+ ggml_cont_3d(ctx,
+ ggml_permute(ctx,
+ ggml_reshape_3d(ctx,
+ ggml_view_1d(ctx, model.kv_cache.v, n_kv*n_embd, il*n_ctx*ggml_element_size(model.kv_cache.v)*n_embd),
n_embd/n_head, n_head, n_kv),
1, 2, 0, 3),
- ggml_new_tensor_3d(ctx0, model.kv_cache.v->type, n_kv, n_embd/n_head, n_head));
+ n_kv, n_embd/n_head, n_head);
// KQV = transpose(V) * KQ_soft_max
// [64, n_tokens, 12]
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx, V_trans, KQ_soft_max);
// KQV_merged = KQV.permute(0, 2, 1, 3)
// [64, 12, n_tokens]
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx, KQV, 0, 2, 1, 3);
// cur = KQV_merged.contiguous().view(n_embd, N)
// [768, n_tokens]
- cur = ggml_cpy(ctx0,
- KQV_merged,
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, n_tokens));
+ cur = ggml_cont_2d(ctx, KQV_merged, n_embd, n_tokens);
}
// projection
// cur = proj_w*cur + proj_b
// [768, N]
{
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_attn_proj_w,
cur);
- cur = ggml_add(ctx0,
+ cur = ggml_add(ctx,
cur,
model.layers[il].c_attn_proj_b);
}
// add the input
- cur = ggml_add(ctx0, cur, inpL);
+ cur = ggml_add(ctx, cur, inpL);
struct ggml_tensor * inpFF = cur;
{
// norm
{
- cur = ggml_norm(ctx0, inpFF, hparams.eps);
+ cur = ggml_norm(ctx, inpFF, hparams.eps);
// cur = ln_2_g*cur + ln_2_b
// [ 768, N]
- cur = ggml_add(ctx0,
- ggml_mul(ctx0,
+ cur = ggml_add(ctx,
+ ggml_mul(ctx,
cur,
model.layers[il].ln_2_g),
model.layers[il].ln_2_b);
//
// cur = fc_w*cur + fc_b
// [3072, N]
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_mlp_fc_w,
cur);
- cur = ggml_add(ctx0,
+ cur = ggml_add(ctx,
cur,
model.layers[il].c_mlp_fc_b);
// GELU activation
// [3072, N]
- cur = ggml_gelu(ctx0, cur);
+ cur = ggml_gelu(ctx, cur);
// projection
// [ 768, 3072] - model.layers[il].c_mlp_proj_w
//
// cur = proj_w*cur + proj_b
// [768, N]
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_mlp_proj_w,
cur);
- cur = ggml_add(ctx0,
+ cur = ggml_add(ctx,
cur,
model.layers[il].c_mlp_proj_b);
}
// input for next layer
- inpL = ggml_add(ctx0, cur, inpFF);
+ inpL = ggml_add(ctx, cur, inpFF);
}
// norm
{
// [ 768, N]
- inpL = ggml_norm(ctx0, inpL, hparams.eps);
+ inpL = ggml_norm(ctx, inpL, hparams.eps);
// inpL = ln_f_g*inpL + ln_f_b
// [ 768, N]
- inpL = ggml_add(ctx0,
- ggml_mul(ctx0,
+ inpL = ggml_add(ctx,
+ ggml_mul(ctx,
inpL,
model.ln_f_g),
model.ln_f_b);
// inpL = WTE * inpL
// [ 768, 50257] - model.lm_head
// [ 768, N] - inpL
- inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
+ inpL = ggml_mul_mat(ctx, model.lm_head, inpL);
// logits -> probs
//inpL = ggml_soft_max(ctx, inpL);
ggml_build_forward_expand(gf, inpL);
- ggml_free(ctx0);
+ ggml_free(ctx);
return gf;
}
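// Hypothetical sketch (not from the original example) of how the "KQ_mask" input of
// the graph above is typically filled before computing: since the mask is added to the
// scaled KQ matrix right before the soft_max, allowed positions get 0.0f and masked
// positions get -INFINITY. The causal rule below assumes the n_tokens new tokens are
// stored contiguously in the cache starting at kv_head; adapt it to the application.
// Requires <vector> and <cmath> in addition to the ggml headers.
static void gpt2_set_kq_mask_sketch(struct ggml_cgraph * gf, int n_kv, int n_tokens, int kv_head) {
    std::vector<float> mask(n_kv*n_tokens, -INFINITY);

    for (int i = 0; i < n_tokens; ++i) {
        // token i may attend to every cache position up to and including its own
        for (int j = 0; j <= kv_head + i && j < n_kv; ++j) {
            mask[i*n_kv + j] = 0.0f;
        }
    }

    struct ggml_tensor * KQ_mask = ggml_graph_get_tensor(gf, "KQ_mask");
    ggml_backend_tensor_set(KQ_mask, mask.data(), 0, ggml_nbytes(KQ_mask));
}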
/*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
};
- struct ggml_context * ctx0 = ggml_init(params);
+ struct ggml_context * ctx = ggml_init(params);
- struct ggml_cgraph * gf = ggml_new_graph_custom(ctx0, GPT2_MAX_NODES, false);
+ struct ggml_cgraph * gf = ggml_new_graph_custom(ctx, GPT2_MAX_NODES, false);
- struct ggml_tensor * embd = ggml_view_1d(ctx0, model.embd, N, 0);
+ struct ggml_tensor * embd = ggml_view_1d(ctx, model.embd, N, 0);
// set inputs
// TODO: move to gpt2_eval
ggml_backend_tensor_set(model.embd, embd_inp.data(), 0, N*ggml_element_size(embd));
- struct ggml_tensor * position = ggml_view_1d(ctx0, model.position, N, 0);
+ struct ggml_tensor * position = ggml_view_1d(ctx, model.position, N, 0);
for (int i = 0; i < N; ++i) {
int32_t v = n_past + i;
ggml_backend_tensor_set(model.position, &v, i*sizeof(int32_t), sizeof(v));
}
// wte + wpe
struct ggml_tensor * inpL =
- ggml_add(ctx0,
- ggml_get_rows(ctx0, model.wte, embd),
- ggml_get_rows(ctx0, model.wpe, position));
+ ggml_add(ctx,
+ ggml_get_rows(ctx, model.wte, embd),
+ ggml_get_rows(ctx, model.wpe, position));
ggml_set_name(inpL, "inpL");
ggml_set_name(inpL->src[0], "wte");
ggml_set_name(inpL->src[1], "wpe");
// norm
{
// [ 768, N]
- cur = ggml_norm(ctx0, inpL, hparams.eps);
+ cur = ggml_norm(ctx, inpL, hparams.eps);
ggml_format_name(cur, "l%d.norm", il);
// cur = ln_1_g*cur + ln_1_b
// [ 768, N]
- cur = ggml_add(ctx0,
- ggml_mul(ctx0,
+ cur = ggml_add(ctx,
+ ggml_mul(ctx,
cur,
model.layers[il].ln_1_g),
model.layers[il].ln_1_b);
// cur = attn_w*cur + attn_b
// [2304, N]
{
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_attn_attn_w,
cur);
ggml_format_name(cur, "l%d.attn_w", il);
- cur = ggml_add(ctx0,
+ cur = ggml_add(ctx,
cur,
model.layers[il].c_attn_attn_b);
ggml_format_name(cur, "l%d.attn_b", il);
// self-attention
{
- struct ggml_tensor * Qcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
- struct ggml_tensor * Kcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
- struct ggml_tensor * Vcur = ggml_view_2d(ctx0, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
+ struct ggml_tensor * Qcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 0*sizeof(float)*n_embd);
+ struct ggml_tensor * Kcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 1*sizeof(float)*n_embd);
+ struct ggml_tensor * Vcur = ggml_view_2d(ctx, cur, n_embd, N, cur->nb[1], 2*sizeof(float)*n_embd);
ggml_format_name(Qcur, "l%d.Qcur", il);
ggml_format_name(Kcur, "l%d.Kcur", il);
// store key and value to memory
if (N >= 1) {
- struct ggml_tensor * k = ggml_view_1d(ctx0, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
- struct ggml_tensor * v = ggml_view_1d(ctx0, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
+ struct ggml_tensor * k = ggml_view_1d(ctx, model.memory_k, N*n_embd, (ggml_element_size(model.memory_k)*n_embd)*(il*n_ctx + n_past));
+ struct ggml_tensor * v = ggml_view_1d(ctx, model.memory_v, N*n_embd, (ggml_element_size(model.memory_v)*n_embd)*(il*n_ctx + n_past));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
- ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx, Kcur, k));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx, Vcur, v));
}
// Q = Qcur.contiguous().view(n_embd/n_head, n_head, N).permute(0, 2, 1, 3)
// [64, N, 12]
struct ggml_tensor * Q =
- ggml_permute(ctx0,
- ggml_cpy(ctx0,
- Qcur,
- ggml_new_tensor_3d(ctx0, GGML_TYPE_F32, n_embd/n_head, n_head, N)),
+ ggml_permute(ctx,
+ ggml_cont_3d(ctx, Qcur, n_embd/n_head, n_head, N),
0, 2, 1, 3);
ggml_format_name(Q, "l%d.Q", il);
// K = Kmem.view(n_embd/n_head, n_head, n_past + N).permute(0, 2, 1, 3)
// [64, n_past + N, 12]
struct ggml_tensor * K =
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
+ ggml_permute(ctx,
+ ggml_reshape_3d(ctx,
+ ggml_view_1d(ctx, model.memory_k, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_k)*n_embd),
n_embd/n_head, n_head, n_past + N),
0, 2, 1, 3);
ggml_format_name(K, "l%d.K", il);
// K * Q
// [n_past + N, N, 12]
- struct ggml_tensor * KQ = ggml_mul_mat(ctx0, K, Q);
+ struct ggml_tensor * KQ = ggml_mul_mat(ctx, K, Q);
ggml_format_name(KQ, "l%d.KQ", il);
// KQ_scaled = KQ / sqrt(n_embd/n_head)
// [n_past + N, N, 12]
- struct ggml_tensor * KQ_scaled = ggml_scale(ctx0, KQ, KQ_scale);
+ struct ggml_tensor * KQ_scaled = ggml_scale(ctx, KQ, KQ_scale);
ggml_format_name(KQ_scaled, "l%d.KQ_scaled", il);
// KQ_masked = mask_past(KQ_scaled)
// [n_past + N, N, 12]
- struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx0, KQ_scaled, n_past);
+ struct ggml_tensor * KQ_masked = ggml_diag_mask_inf(ctx, KQ_scaled, n_past);
ggml_format_name(KQ_masked, "l%d.KQ_masked", il);
// KQ = soft_max(KQ_masked)
// [n_past + N, N, 12]
- struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx0, KQ_masked);
+ struct ggml_tensor * KQ_soft_max = ggml_soft_max(ctx, KQ_masked);
ggml_format_name(KQ_soft_max, "l%d.KQ_soft_max", il);
// V_trans = Vmem.view(n_embd/n_head, n_head, n_past + N).permute(1, 2, 0, 3).contiguous()
// [n_past + N, 64, 12]
struct ggml_tensor * V_trans =
- ggml_cpy(ctx0,
- ggml_permute(ctx0,
- ggml_reshape_3d(ctx0,
- ggml_view_1d(ctx0, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
+ ggml_cont_3d(ctx,
+ ggml_permute(ctx,
+ ggml_reshape_3d(ctx,
+ ggml_view_1d(ctx, model.memory_v, (n_past + N)*n_embd, il*n_ctx*ggml_element_size(model.memory_v)*n_embd),
n_embd/n_head, n_head, n_past + N),
1, 2, 0, 3),
- ggml_new_tensor_3d(ctx0, model.memory_v->type, n_past + N, n_embd/n_head, n_head));
+ n_past + N, n_embd/n_head, n_head);
ggml_format_name(V_trans, "l%d.V_trans", il);
// KQV = transpose(V) * KQ_soft_max
// [64, N, 12]
- struct ggml_tensor * KQV = ggml_mul_mat(ctx0, V_trans, KQ_soft_max);
+ struct ggml_tensor * KQV = ggml_mul_mat(ctx, V_trans, KQ_soft_max);
ggml_format_name(KQV, "l%d.KQV", il);
// KQV_merged = KQV.permute(0, 2, 1, 3)
// [64, 12, N]
- struct ggml_tensor * KQV_merged = ggml_permute(ctx0, KQV, 0, 2, 1, 3);
+ struct ggml_tensor * KQV_merged = ggml_permute(ctx, KQV, 0, 2, 1, 3);
ggml_format_name(KQV_merged, "l%d.KQV_merged", il);
// cur = KQV_merged.contiguous().view(n_embd, N)
// [768, N]
- cur = ggml_cpy(ctx0,
- KQV_merged,
- ggml_new_tensor_2d(ctx0, GGML_TYPE_F32, n_embd, N));
+ cur = ggml_cont_2d(ctx, KQV_merged, n_embd, N);
ggml_format_name(cur, "l%d.KQV_merged_contiguous", il);
}
// cur = proj_w*cur + proj_b
// [768, N]
{
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_attn_proj_w,
cur);
ggml_format_name(cur, "l%d.attn_proj_w", il);
- cur = ggml_add(ctx0,
+ cur = ggml_add(ctx,
cur,
model.layers[il].c_attn_proj_b);
ggml_format_name(cur, "l%d.attn_proj_b", il);
}
// add the input
- cur = ggml_add(ctx0, cur, inpL);
+ cur = ggml_add(ctx, cur, inpL);
ggml_format_name(cur, "l%d.add", il);
struct ggml_tensor * inpFF = cur;
{
// norm
{
- cur = ggml_norm(ctx0, inpFF, hparams.eps);
+ cur = ggml_norm(ctx, inpFF, hparams.eps);
ggml_format_name(cur, "l%d.FFnorm", il);
// cur = ln_2_g*cur + ln_2_b
// [ 768, N]
- cur = ggml_add(ctx0,
- ggml_mul(ctx0,
+ cur = ggml_add(ctx,
+ ggml_mul(ctx,
cur,
model.layers[il].ln_2_g),
model.layers[il].ln_2_b);
//
// cur = fc_w*cur + fc_b
// [3072, N]
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_mlp_fc_w,
cur);
ggml_format_name(cur, "l%d.mlp_fc_w", il);
- cur = ggml_add(ctx0,
+ cur = ggml_add(ctx,
cur,
model.layers[il].c_mlp_fc_b);
ggml_format_name(cur, "l%d.mlp_fc_b", il);
// GELU activation
// [3072, N]
- cur = ggml_gelu(ctx0, cur);
+ cur = ggml_gelu(ctx, cur);
ggml_format_name(cur, "l%d.gelu", il);
// projection
//
// cur = proj_w*cur + proj_b
// [768, N]
- cur = ggml_mul_mat(ctx0,
+ cur = ggml_mul_mat(ctx,
model.layers[il].c_mlp_proj_w,
cur);
ggml_format_name(cur, "l%d.mlp_proj_w", il);
- cur = ggml_add(ctx0,
+ cur = ggml_add(ctx,
cur,
model.layers[il].c_mlp_proj_b);
ggml_format_name(cur, "l%d.mlp_proj_b", il);
}
// input for next layer
- inpL = ggml_add(ctx0, cur, inpFF);
+ inpL = ggml_add(ctx, cur, inpFF);
ggml_format_name(inpL, "l%d.add2", il);
}
// norm
{
// [ 768, N]
- inpL = ggml_norm(ctx0, inpL, hparams.eps);
+ inpL = ggml_norm(ctx, inpL, hparams.eps);
ggml_format_name(inpL, "out_norm");
// inpL = ln_f_g*inpL + ln_f_b
// [ 768, N]
- inpL = ggml_add(ctx0,
- ggml_mul(ctx0,
+ inpL = ggml_add(ctx,
+ ggml_mul(ctx,
inpL,
model.ln_f_g),
model.ln_f_b);
// inpL = WTE * inpL
// [ 768, 50257] - model.lm_head
// [ 768, N] - inpL
- inpL = ggml_mul_mat(ctx0, model.lm_head, inpL);
+ inpL = ggml_mul_mat(ctx, model.lm_head, inpL);
ggml_format_name(inpL, "out_lm_head");
// logits -> probs
ggml_build_forward_expand(gf, inpL);
- ggml_free(ctx0);
+ ggml_free(ctx);
return gf;
}
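// Hypothetical debugging sketch (not from the original example): the per-layer names
// set with ggml_format_name above make it possible to inspect intermediate results
// after the graph has been computed. The tensor may live in device memory, so the data
// is read back with ggml_backend_tensor_get. Assumes an F32 tensor; requires <vector>
// and <cstdio> in addition to the ggml headers.
static void gpt2_dump_tensor_sketch(struct ggml_cgraph * gf, const char * name) {
    struct ggml_tensor * t = ggml_graph_get_tensor(gf, name);
    if (t == NULL) {
        fprintf(stderr, "tensor '%s' not found in the graph\n", name);
        return;
    }

    std::vector<float> data(ggml_nelements(t));
    ggml_backend_tensor_get(t, data.data(), 0, ggml_nbytes(t));

    fprintf(stderr, "%s: ne = [%d, %d, %d], first value = %f\n",
            name, (int) t->ne[0], (int) t->ne[1], (int) t->ne[2], data[0]);
}

// e.g. after ggml_backend_graph_compute():
//   gpt2_dump_tensor_sketch(gf, "l0.KQ_soft_max");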
struct node_alloc * node_allocs; // [n_nodes]
int n_nodes;
+
+ struct tensor_alloc * leaf_allocs; // [n_leafs]
+ int n_leafs;
};
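// For reference, each tensor_alloc entry records where a tensor was placed during
// reserve so that ggml_gallocr_alloc_graph() can later assign the same address.
// A sketch of the fields, inferred from their uses further down in this diff (the
// actual definition lives earlier in ggml-alloc.c and may differ slightly):
//
//   struct tensor_alloc {
//       size_t offset;   // offset within the backend buffer, SIZE_MAX if not allocated by the gallocr
//       size_t size_max; // maximum size the tensor may need in that buffer
//   };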
ggml_gallocr_t ggml_gallocr_new_n(ggml_backend_buffer_type_t * bufts, int n_bufs) {
free(galloc->buffers);
free(galloc->buf_tallocs);
free(galloc->node_allocs);
+ free(galloc->leaf_allocs);
free(galloc);
}
memset(galloc->hash_set.keys, 0, galloc->hash_set.size * sizeof(struct ggml_tensor *));
memset(galloc->hash_values, 0, galloc->hash_set.size * sizeof(struct hash_node));
- // allocate all graph inputs first to avoid overwriting them
- for (int i = 0; i < graph->n_nodes; i++) {
- if (graph->nodes[i]->flags & GGML_TENSOR_FLAG_INPUT) {
- ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
- }
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- if (graph->nodes[i]->src[j] == NULL) {
- break;
- }
- if (graph->nodes[i]->src[j]->flags & GGML_TENSOR_FLAG_INPUT) {
- ggml_gallocr_allocate_node(galloc, graph->nodes[i]->src[j], get_node_buffer_id(node_buffer_ids, i));
- }
- }
- }
-
// count number of children and views
+ // allocate all graph inputs and leafs first to avoid overwriting them
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
ggml_gallocr_hash_get(galloc, view_src)->n_views += 1;
}
+ if (node->flags & GGML_TENSOR_FLAG_INPUT) {
+ ggml_gallocr_allocate_node(galloc, graph->nodes[i], get_node_buffer_id(node_buffer_ids, i));
+ }
+
for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * parent = node->src[j];
- if (parent == NULL) {
+ struct ggml_tensor * src = node->src[j];
+ if (src == NULL) {
break;
}
- ggml_gallocr_hash_get(galloc, parent)->n_children += 1;
+
+ ggml_gallocr_hash_get(galloc, src)->n_children += 1;
+
+ // allocate explicit inputs and leafs
+ if (src->flags & GGML_TENSOR_FLAG_INPUT || src->op == GGML_OP_NONE) {
+ ggml_gallocr_allocate_node(galloc, src, get_node_buffer_id(node_buffer_ids, i));
+ }
}
- }
+ }
+
+ // allocate the remaining leafs that are unused on the graph
+ // these are effectively static tensors that the application is not using in the graph, but may still want to allocate for other purposes
+ for (int i = 0; i < graph->n_leafs; i++) {
+ struct ggml_tensor * leaf = graph->leafs[i];
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+
+ if (hn->n_children == 0) {
+ assert(!hn->allocated);
+ // since buffer ids are only given for nodes, these leafs are always allocated in the first buffer
+ ggml_gallocr_allocate_node(galloc, leaf, 0);
+ }
+ }
// allocate tensors
for (int i = 0; i < graph->n_nodes; i++) {
}
}
}
+ if (galloc->n_leafs < graph->n_leafs) {
+ free(galloc->leaf_allocs);
+ galloc->leaf_allocs = calloc(sizeof(struct tensor_alloc), graph->n_leafs);
+ GGML_ASSERT(galloc->leaf_allocs != NULL);
+ }
+ galloc->n_leafs = graph->n_leafs;
+ for (int i = 0; i < graph->n_leafs; i++) {
+ struct ggml_tensor * leaf = graph->leafs[i];
+ struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
+ galloc->leaf_allocs[i].offset = hn->offset;
+ galloc->leaf_allocs[i].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
+ }
// reallocate buffers if needed
for (int i = 0; i < galloc->n_buffers; i++) {
return ggml_gallocr_reserve_n(galloc, graph, NULL);
}
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * node_alloc, struct tensor_alloc * tensor_alloc) {
- assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id, struct tensor_alloc * tensor_alloc) {
+ assert(node->data || node->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
if (node->view_src != NULL) {
if (node->buffer == NULL) {
// this tensor was allocated without ggml-backend
return;
}
- ggml_backend_view_init(galloc->buffers[node_alloc->buffer_id], node);
+ ggml_backend_view_init(galloc->buffers[buffer_id], node);
}
} else {
if (node->data == NULL) {
assert(tensor_alloc->offset != SIZE_MAX);
- assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[node_alloc->buffer_id], node) <= tensor_alloc->size_max);
- void * base = ggml_backend_buffer_get_base(galloc->buffers[node_alloc->buffer_id]);
+ assert(ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], node) <= tensor_alloc->size_max);
+ void * base = ggml_backend_buffer_get_base(galloc->buffers[buffer_id]);
void * addr = (char *)base + tensor_alloc->offset;
- ggml_backend_tensor_alloc(galloc->buffers[node_alloc->buffer_id], node, addr);
+ ggml_backend_tensor_alloc(galloc->buffers[buffer_id], node, addr);
} else {
if (node->buffer == NULL) {
// this tensor was allocated without ggml-backend
return;
}
-
-#ifndef NDEBUG
- size_t offset =
- (char *)node->data -
- (char *)ggml_backend_buffer_get_base(node->buffer);
- size_t size = ggml_backend_buffer_get_alloc_size(node->buffer, node);
- assert(tensor_alloc->offset == SIZE_MAX || offset == tensor_alloc->offset);
- assert(tensor_alloc->offset == SIZE_MAX || size <= tensor_alloc->size_max);
-#endif
}
}
}
return true;
}
+ if (galloc->n_leafs != graph->n_leafs) {
+#ifndef NDEBUG
+ fprintf(stderr, "%s: graph has different number of leafs\n", __func__);
+#endif
+ return true;
+ }
+
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
struct node_alloc * node_alloc = &galloc->node_allocs[i];
}
// allocate the graph tensors from the previous assignments
+ // nodes
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
struct node_alloc * node_alloc = &galloc->node_allocs[i];
if (src == NULL) {
break;
}
- ggml_gallocr_init_tensor(galloc, src, node_alloc, &node_alloc->src[j]);
+ ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
}
- ggml_gallocr_init_tensor(galloc, node, node_alloc, &node_alloc->dst);
+ ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+ }
+ // leafs
+ for (int i = 0; i < graph->n_leafs; i++) {
+ struct ggml_tensor * leaf = graph->leafs[i];
+ struct tensor_alloc * leaf_alloc = &galloc->leaf_allocs[i];
+ ggml_gallocr_init_tensor(galloc, leaf, 0, leaf_alloc);
}
return true;
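// Hypothetical usage sketch (not part of the diff): the typical lifecycle of a graph
// allocator with the leaf support added above. build_graph() stands in for the
// gpt2-style graph builders shown earlier; `backend` is assumed to exist.
//
//   ggml_gallocr_t galloc = ggml_gallocr_new(ggml_backend_get_default_buffer_type(backend));
//
//   // reserve once with a worst-case graph so later calls do not need to grow the buffers
//   ggml_gallocr_reserve(galloc, build_graph(/*worst-case batch*/));
//
//   // per evaluation: build the actual graph and allocate it from the reserved buffers;
//   // with this change, unused graph leafs are allocated as well
//   struct ggml_cgraph * gf = build_graph(/*current batch*/);
//   ggml_gallocr_alloc_graph(galloc, gf);
//   ggml_backend_graph_compute(backend, gf);
//
//   ggml_gallocr_free(galloc);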