ggml : hide ggml_object, ggml_cgraph, ggml_hash_set (#9408)

author Georgi Gerganov <redacted>
Thu, 12 Sep 2024 11:23:49 +0000 (14:23 +0300)

committer GitHub <redacted>
Thu, 12 Sep 2024 11:23:49 +0000 (14:23 +0300)

diff --git a/examples/benchmark/benchmark-matmult.cpp b/examples/benchmark/benchmark-matmult.cpp

index 97622f4f4fd185ad7548ce3b19e31beda10a839c..922daf52849b5f3223d37b743736efc0dff871b2 100644 (file)
--- a/examples/benchmark/benchmark-matmult.cpp
+++ b/examples/benchmark/benchmark-matmult.cpp
@@ -183,7 +183,7 @@ int main(int argc, char ** argv)  {
  
      ggml_graph_compute_helper(work_buffer, gf, benchmark_params.n_threads);
  
-    TENSOR_DUMP(gf->nodes[0]);
+    TENSOR_DUMP(ggml_graph_node(gf, 0));
  
      printf("\n------ Test 2 - Matrix Mult via %s code\n", ggml_type_name(qtype));
  
@@ -224,7 +224,7 @@ int main(int argc, char ** argv)  {
  
  
      // Let's use the F32 result from above as a reference for the quantized multiplication
-    float sum_of_F32_reference = tensor_sum_elements(gf->nodes[0]);
+    float sum_of_F32_reference = tensor_sum_elements(ggml_graph_node(gf, 0));
  
      printf("Iteration;NThreads; SizeX; SizeY; SizeZ; Required_FLOPS; Elapsed_u_Seconds; gigaFLOPS\n");
      printf("=====================================================================================\n");
@@ -252,7 +252,7 @@ int main(int argc, char ** argv)  {
  
          // Check that the matrix multiplication result is in the right ballpark
          // We cannot use the exact value from the F32 multiplication because the quantizuation will be slightly different
-        float sum_of_Q4_result = tensor_sum_elements(gf31->nodes[0]);
+        float sum_of_Q4_result = tensor_sum_elements(ggml_graph_node(gf31, 0));
          float delta = std::abs(sum_of_Q4_result - sum_of_F32_reference);
          float allowed_delta = (sum_of_F32_reference) / 1000 / 1000; //  Let's accept an epsilon of 10^-6
  
diff --git a/examples/cvector-generator/pca.hpp b/examples/cvector-generator/pca.hpp

index 05c66856ca1079d77f7a573ccc268824bcc19c90..a969c486dc42f34aeff8bc494474895585890086 100644 (file)
--- a/examples/cvector-generator/pca.hpp
+++ b/examples/cvector-generator/pca.hpp
@@ -226,8 +226,8 @@ static ggml_status compute_piter(
          result.eigenvectors.resize(params.n_batch);
          result.distances.resize(params.n_batch);
          // get output nodes
-        for (int i = 0; i < gf->n_nodes; ++i) {
-            auto node = gf->nodes[i];
+        for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
+            auto node = ggml_graph_node(gf, i);
              int iter = -1;
              // find b_tensor (without copying data from device)
              if ((iter = extract_i("b_tensor_norm_", node->name)) > -1) {
diff --git a/examples/export-lora/export-lora.cpp b/examples/export-lora/export-lora.cpp

index ff324926a05e1e670b1a6086cfc25105a3121b37..90126ad1e9075b31b26b85b72e52f5207be8e36b 100644 (file)
--- a/examples/export-lora/export-lora.cpp
+++ b/examples/export-lora/export-lora.cpp
@@ -370,7 +370,7 @@ struct lora_merge_ctx {
  
          // write data to output file
          {
-            auto result = gf->nodes[gf->n_nodes - 1];
+            auto * result = ggml_graph_node(gf, -1);
              size_t len = ggml_nbytes(result);
              if (read_buf.size() < len) {
                  read_buf.resize(len);
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp

index 9b890571eee9c82bda96783f069dde92b28d2d6f..5dfb333d1be8c3f27b37f59ecd7bae7ac1155120 100644 (file)
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2449,7 +2449,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
      ggml_backend_graph_compute(ctx->backend, gf);
  
      // the last node is the embedding tensor
-    struct ggml_tensor * embeddings = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor * embeddings = ggml_graph_node(gf, -1);
  
      // copy the embeddings to the location passed by the user
      ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
diff --git a/examples/llava/llava.cpp b/examples/llava/llava.cpp

index 851af0f004a691902625a0bec11f4cbd490e5340..e162586ed88d23368bc4cc2722dfe56a048c7424 100644 (file)
--- a/examples/llava/llava.cpp
+++ b/examples/llava/llava.cpp
@@ -184,7 +184,7 @@ static bool clip_llava_handle_patches(clip_ctx * ctx_clip, std::vector<float *>
      // ggml_tensor_printf(flatten,"flatten",__LINE__,false,false);
      ggml_build_forward_expand(gf, flatten);
      ggml_graph_compute_with_ctx(model.ctx, gf, 1);
-    struct ggml_tensor* result = gf->nodes[gf->n_nodes - 1];
+    struct ggml_tensor* result = ggml_graph_node(gf, -1);
  
      memcpy(image_embd_out, image_embd_v[0], clip_embd_nbytes(ctx_clip)); // main image as global context
      // append without newline tokens (default behavior in llava_arch when not using unpad ):
diff --git a/ggml/include/ggml.h b/ggml/include/ggml.h

index 536018b669d3d7646ee67f88966427ce7ee7569e..86ad6fb6224d5113b8e9a38ddf50fdefcaf86617 100644 (file)
--- a/ggml/include/ggml.h
+++ b/ggml/include/ggml.h
@@ -358,6 +358,7 @@ extern "C" {
  
      struct ggml_object;
      struct ggml_context;
+    struct ggml_cgraph;
  
      // NOTE: always add types at the end of the enum to keep backward compatibility
      enum ggml_type {
@@ -575,23 +576,9 @@ extern "C" {
          GGML_TENSOR_FLAG_PARAM  = 4,
      };
  
-    // ggml object
-    struct ggml_object {
-        size_t offs;
-        size_t size;
-
-        struct ggml_object * next;
-
-        enum ggml_object_type type;
-
-        char padding[4];
-    };
-
-    static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
-
      // n-dimensional tensor
      struct ggml_tensor {
-        enum ggml_type         type;
+        enum ggml_type type;
  
          GGML_DEPRECATED(enum ggml_backend_type backend, "use the buffer type to find the storage location of the tensor");
  
@@ -655,7 +642,7 @@ extern "C" {
  
      struct ggml_threadpool;     // forward declaration, see ggml.c
  
-    typedef struct  ggml_threadpool * ggml_threadpool_t;
+    typedef struct ggml_threadpool * ggml_threadpool_t;
  
      // the compute plan that needs to be prepared for ggml_graph_compute()
      // since https://github.com/ggerganov/ggml/issues/287
@@ -671,35 +658,6 @@ extern "C" {
          void *              abort_callback_data;
      };
  
-    enum ggml_cgraph_eval_order {
-        GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
-        GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
-        GGML_CGRAPH_EVAL_ORDER_COUNT
-    };
-
-    typedef uint32_t ggml_bitset_t;
-
-    struct ggml_hash_set {
-        size_t size;
-        ggml_bitset_t * used;       // whether or not the keys are in use i.e. set
-        struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
-    };
-
-    // computation graph
-    struct ggml_cgraph {
-        int size;
-        int n_nodes;
-        int n_leafs;
-
-        struct ggml_tensor ** nodes;
-        struct ggml_tensor ** grads;
-        struct ggml_tensor ** leafs;
-
-        struct ggml_hash_set visited_hash_set;
-
-        enum ggml_cgraph_eval_order order;
-    };
-
      // scratch buffer
      struct ggml_scratch {
          size_t offs;
@@ -2017,8 +1975,6 @@ extern "C" {
      typedef void (*ggml_custom2_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, int ith, int nth, void * userdata);
      typedef void (*ggml_custom3_op_t)(struct ggml_tensor * dst , const struct ggml_tensor * a, const struct ggml_tensor * b, const struct ggml_tensor * c, int ith, int nth, void * userdata);
  
-    #define GGML_N_TASKS_MAX -1
-
      GGML_API struct ggml_tensor * ggml_map_custom1(
              struct ggml_context   * ctx,
              struct ggml_tensor    * a,
@@ -2088,30 +2044,35 @@ extern "C" {
              struct ggml_context * ctx,
              struct ggml_tensor  * tensor);
  
-
      GGML_API void ggml_build_forward_expand (struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
      GGML_API void ggml_build_backward_expand(struct ggml_context * ctx, struct ggml_cgraph * gf, struct ggml_cgraph * gb, bool keep);
  
      // graph allocation in a context
-    GGML_API struct ggml_cgraph * ggml_new_graph         (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
-    GGML_API struct ggml_cgraph * ggml_new_graph_custom  (struct ggml_context * ctx, size_t size, bool grads);
-    GGML_API struct ggml_cgraph * ggml_graph_dup         (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
-    GGML_API struct ggml_cgraph   ggml_graph_view        (struct ggml_cgraph * cgraph, int i0, int i1);
-    GGML_API void                 ggml_graph_cpy         (struct ggml_cgraph * src, struct ggml_cgraph * dst);
-    GGML_API void                 ggml_graph_reset       (struct ggml_cgraph * cgraph);  // zero grads
-    GGML_API void                 ggml_graph_clear       (struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_cgraph * ggml_new_graph       (struct ggml_context * ctx); // size = GGML_DEFAULT_GRAPH_SIZE, grads = false
+    GGML_API struct ggml_cgraph * ggml_new_graph_custom(struct ggml_context * ctx, size_t size, bool grads);
+    GGML_API struct ggml_cgraph * ggml_graph_dup       (struct ggml_context * ctx, struct ggml_cgraph * cgraph);
+    GGML_API void                 ggml_graph_cpy       (struct ggml_cgraph * src, struct ggml_cgraph * dst);
+    GGML_API void                 ggml_graph_reset     (struct ggml_cgraph * cgraph);  // zero grads
+    GGML_API void                 ggml_graph_clear     (struct ggml_cgraph * cgraph);
+
+    GGML_API int                   ggml_graph_size   (struct ggml_cgraph * cgraph);
+    GGML_API struct ggml_tensor *  ggml_graph_node   (struct ggml_cgraph * cgraph, int i); // if i < 0, returns nodes[n_nodes + i]
+    GGML_API struct ggml_tensor ** ggml_graph_nodes  (struct ggml_cgraph * cgraph);
+    GGML_API int                   ggml_graph_n_nodes(struct ggml_cgraph * cgraph);
+
+    GGML_API void   ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor);
  
      GGML_API size_t ggml_graph_overhead(void);
      GGML_API size_t ggml_graph_overhead_custom(size_t size, bool grads);
  
-    GGML_API struct ggml_threadpool_params   ggml_threadpool_params_default(int n_threads);
-    GGML_API void                            ggml_threadpool_params_init  (struct ggml_threadpool_params *p, int n_threads);
-    GGML_API bool                            ggml_threadpool_params_match (const struct ggml_threadpool_params *p0, const struct ggml_threadpool_params *p1);
-    GGML_API struct ggml_threadpool*         ggml_threadpool_new          (struct ggml_threadpool_params  * params);
-    GGML_API void                            ggml_threadpool_free         (struct ggml_threadpool * threadpool);
-    GGML_API int                             ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
-    GGML_API void                            ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
-    GGML_API void                            ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
+    GGML_API struct ggml_threadpool_params ggml_threadpool_params_default(int n_threads);
+    GGML_API void                          ggml_threadpool_params_init   (struct ggml_threadpool_params * p, int n_threads);
+    GGML_API bool                          ggml_threadpool_params_match  (const struct ggml_threadpool_params * p0, const struct ggml_threadpool_params * p1);
+    GGML_API struct ggml_threadpool *      ggml_threadpool_new          (struct ggml_threadpool_params  * params);
+    GGML_API void                          ggml_threadpool_free         (struct ggml_threadpool * threadpool);
+    GGML_API int                           ggml_threadpool_get_n_threads(struct ggml_threadpool * threadpool);
+    GGML_API void                          ggml_threadpool_pause        (struct ggml_threadpool * threadpool);
+    GGML_API void                          ggml_threadpool_resume       (struct ggml_threadpool * threadpool);
  
      // ggml_graph_plan() has to be called before ggml_graph_compute()
      // when plan.work_size > 0, caller must allocate memory for plan.work_data
diff --git a/ggml/src/ggml-blas.cpp b/ggml/src/ggml-blas.cpp

index 71373173598c7743cb6e2cf52c1f844dab8d0b39..6d99c6beaeeeaf1bec5b72df497105431e480f6c 100644 (file)
--- a/ggml/src/ggml-blas.cpp
+++ b/ggml/src/ggml-blas.cpp
@@ -1,3 +1,4 @@
+#include "ggml-impl.h"
  #include "ggml-blas.h"
  #include "ggml-backend-impl.h"
  
diff --git a/ggml/src/ggml-cann.cpp b/ggml/src/ggml-cann.cpp

index 24b8b752c957003c5cefe2dcb407dcd34673204e..e9c370b9b1b8da4410bebf540de8acf1eed436cd 100644 (file)
--- a/ggml/src/ggml-cann.cpp
+++ b/ggml/src/ggml-cann.cpp
@@ -30,6 +30,7 @@
  #include <cstring>
  #include <mutex>
  
+#include "ggml-impl.h"
  #include "ggml-backend-impl.h"
  #include "ggml-cann/aclnn_ops.h"
  #include "ggml-cann/common.h"
diff --git a/ggml/src/ggml-cuda.cu b/ggml/src/ggml-cuda.cu

index d53de4edd8098d1ff26f5d5b741e858d2350d022..54f1a7c2d3075a9e41aa3327360c1378cc91f1f7 100644 (file)
--- a/ggml/src/ggml-cuda.cu
+++ b/ggml/src/ggml-cuda.cu
@@ -1,5 +1,5 @@
  #include "ggml-cuda.h"
-#include "ggml.h"
+#include "ggml-impl.h"
  #include "ggml-backend-impl.h"
  
  #include "ggml-cuda/common.cuh"
diff --git a/ggml/src/ggml-impl.h b/ggml/src/ggml-impl.h

index 961f3c67bdbd924ea98a2dc6e3031c925209641f..cb7f7728bd98abfe518cd57e0e11cafed8cd191e 100644 (file)
--- a/ggml/src/ggml-impl.h
+++ b/ggml/src/ggml-impl.h
@@ -629,8 +629,16 @@ inline static float ggml_lookup_fp16_to_fp32(ggml_fp16_t f) {
  #define GGML_FP32_TO_FP16(x) GGML_COMPUTE_FP32_TO_FP16(x)
  #endif
  
+enum ggml_cgraph_eval_order {
+    GGML_CGRAPH_EVAL_ORDER_LEFT_TO_RIGHT = 0,
+    GGML_CGRAPH_EVAL_ORDER_RIGHT_TO_LEFT,
+    GGML_CGRAPH_EVAL_ORDER_COUNT
+};
+
  // bitset
  
+typedef uint32_t ggml_bitset_t;
+
  static_assert(sizeof(ggml_bitset_t) == 4, "bitset_t constants must be updated");
  #define BITSET_SHR 5 // log2(sizeof(ggml_bitset_t)*8)
  #define BITSET_MASK (sizeof(ggml_bitset_t)*8 - 1)
@@ -656,6 +664,12 @@ static inline void ggml_bitset_clear(ggml_bitset_t * bitset, size_t i) {
  #define GGML_HASHSET_FULL ((size_t)-1)
  #define GGML_HASHSET_ALREADY_EXISTS ((size_t)-2)
  
+struct ggml_hash_set {
+    size_t size;
+    ggml_bitset_t * used;       // whether or not the keys are in use i.e. set
+    struct ggml_tensor ** keys; // actual tensors in the set, keys[i] is only defined if ggml_bitset_get(used, i)
+};
+
  struct ggml_hash_set ggml_hash_set_new(size_t size);
  void                 ggml_hash_set_free(struct ggml_hash_set * hash_set);
  
@@ -745,6 +759,24 @@ static size_t ggml_hash_find_or_insert(struct ggml_hash_set * hash_set, struct g
      GGML_ABORT("fatal error");
  }
  
+// computation graph
+
+struct ggml_cgraph {
+    int size;
+    int n_nodes;
+    int n_leafs;
+
+    struct ggml_tensor ** nodes;
+    struct ggml_tensor ** grads;
+    struct ggml_tensor ** leafs;
+
+    struct ggml_hash_set visited_hash_set;
+
+    enum ggml_cgraph_eval_order order;
+};
+
+struct ggml_cgraph ggml_graph_view(struct ggml_cgraph * cgraph, int i0, int i1);
+
  #ifdef __cplusplus
  }
  #endif
diff --git a/ggml/src/ggml-kompute.cpp b/ggml/src/ggml-kompute.cpp

index 41ac63fa48e0fadc7565c409c9cffbf654804c59..7f0bd82d5de92db1651fd720200c7f2e6a50559f 100644 (file)
--- a/ggml/src/ggml-kompute.cpp
+++ b/ggml/src/ggml-kompute.cpp
@@ -1,4 +1,4 @@
-#include "ggml.h"
+#include "ggml-impl.h"
  #include "ggml-backend.h"
  #include "ggml-backend-impl.h"
  #include "ggml-kompute.h"
diff --git a/ggml/src/ggml-metal.m b/ggml/src/ggml-metal.m

index 6d8a7c898f94ef25a485f9b802f8584c3acecfaa..6c85acfecb2ce6f43f16e5593472ca682be8bfb1 100644 (file)
--- a/ggml/src/ggml-metal.m
+++ b/ggml/src/ggml-metal.m
@@ -1,7 +1,7 @@
  #import "ggml-metal.h"
  
+#import "ggml-impl.h"
  #import "ggml-backend-impl.h"
-#import "ggml.h"
  
  #import <Foundation/Foundation.h>
  
@@ -882,7 +882,7 @@ static enum ggml_status ggml_metal_graph_compute(
      // create multiple command buffers and enqueue them
      // then, we encode the graph into the command buffers in parallel
  
-    const int n_nodes  = gf->n_nodes;
+    const int n_nodes = gf->n_nodes;
      const int n_cb = ctx->n_cb;
      const int n_nodes_per_cb = (n_nodes + n_cb - 1) / n_cb;
  
diff --git a/ggml/src/ggml-rpc.cpp b/ggml/src/ggml-rpc.cpp

index 9c600c7cae4f9336a7bd92c9411040395756c5fc..a8a2eb85adc23bd8bae243adcfb863712907f13f 100644 (file)
--- a/ggml/src/ggml-rpc.cpp
+++ b/ggml/src/ggml-rpc.cpp
@@ -1,5 +1,5 @@
  #include "ggml-rpc.h"
-#include "ggml.h"
+#include "ggml-impl.h"
  #include "ggml-backend-impl.h"
  
  #include <cinttypes>
diff --git a/ggml/src/ggml-sycl.cpp b/ggml/src/ggml-sycl.cpp

index e603503996f2ea400be7c0f4c12aeb3cc56c5e70..acef7c6d4e1eaea92fe6fffaa9ff44668a8af68d 100644 (file)
--- a/ggml/src/ggml-sycl.cpp
+++ b/ggml/src/ggml-sycl.cpp
@@ -33,7 +33,7 @@
  #include <sycl/half_type.hpp>
  
  #include "ggml-sycl.h"
-#include "ggml.h"
+#include "ggml-impl.h"
  #include "ggml-backend-impl.h"
  
  #include "ggml-sycl/backend.hpp"
diff --git a/ggml/src/ggml-vulkan.cpp b/ggml/src/ggml-vulkan.cpp

index 83737c1d95e6edaaa6825f66f52ed237ffb4f553..bad960510850ef139f79576f73e5d21fe96ecd21 100644 (file)
--- a/ggml/src/ggml-vulkan.cpp
+++ b/ggml/src/ggml-vulkan.cpp
@@ -21,7 +21,7 @@
  #include <memory>
  #include <mutex>
  
-#include "ggml.h"
+#include "ggml-impl.h"
  #include "ggml-backend-impl.h"
  
  #include "ggml-vulkan-shaders.hpp"
diff --git a/ggml/src/ggml.c b/ggml/src/ggml.c

index d7157ca6d4b838e8f2823b6850727be0ae325589..47417c02413dba5c2a78cc7842c9be788553dbea 100644 (file)
--- a/ggml/src/ggml.c
+++ b/ggml/src/ggml.c
@@ -287,6 +287,7 @@ void ggml_abort(const char * file, int line, const char * fmt, ...) {
  #define GGML_DEBUG 0
  #define GGML_GELU_FP16
  #define GGML_GELU_QUICK_FP16
+#define GGML_N_TASKS_MAX (-1)
  
  #define GGML_SOFT_MAX_UNROLL 4
  #define GGML_VEC_DOT_UNROLL  2
@@ -1120,21 +1121,21 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
  #define GGML_F32x4_ADD          vaddq_f32
  #define GGML_F32x4_MUL          vmulq_f32
  #define GGML_F32x4_REDUCE_ONE(x) vaddvq_f32(x)
-#define GGML_F32x4_REDUCE(res, x)              \
-{                                              \
-    int offset = GGML_F32_ARR >> 1;            \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vaddq_f32(x[i], x[offset+i]);   \
-    }                                          \
-    offset >>= 1;                              \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vaddq_f32(x[i], x[offset+i]);   \
-    }                                          \
-    offset >>= 1;                              \
-    for (int i = 0; i < offset; ++i) {         \
-        x[i] = vaddq_f32(x[i], x[offset+i]);   \
-    }                                          \
-    res = GGML_F32x4_REDUCE_ONE(x[0]);         \
+#define GGML_F32x4_REDUCE(res, x)                  \
+{                                                  \
+    int offset = GGML_F32_ARR >> 1;                \
+    for (int i = 0; i < offset; ++i) {             \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
+    }                                              \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
+    }                                              \
+    offset >>= 1;                                  \
+    for (int i = 0; i < offset; ++i) {             \
+        (x)[i] = vaddq_f32((x)[i], (x)[offset+i]); \
+    }                                              \
+    (res) = GGML_F32x4_REDUCE_ONE((x)[0]);         \
  }
  
  #define GGML_F32_VEC        GGML_F32x4
@@ -1161,30 +1162,30 @@ ggml_type_traits_t ggml_internal_get_type_traits(enum ggml_type type) {
      #define GGML_F16x8_FMA(a, b, c) vfmaq_f16(a, b, c)
      #define GGML_F16x8_ADD          vaddq_f16
      #define GGML_F16x8_MUL          vmulq_f16
-    #define GGML_F16x8_REDUCE(res, x)                             \
-    do {                                                          \
-        int offset = GGML_F16_ARR >> 1;                           \
-        for (int i = 0; i < offset; ++i) {                        \
-            x[i] = vaddq_f16(x[i], x[offset+i]);                  \
-        }                                                         \
-        offset >>= 1;                                             \
-        for (int i = 0; i < offset; ++i) {                        \
-            x[i] = vaddq_f16(x[i], x[offset+i]);                  \
-        }                                                         \
-        offset >>= 1;                                             \
-        for (int i = 0; i < offset; ++i) {                        \
-            x[i] = vaddq_f16(x[i], x[offset+i]);                  \
-        }                                                         \
-        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 (x[0])); \
-        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16(x[0])); \
-        res = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
+    #define GGML_F16x8_REDUCE(res, x)                               \
+    do {                                                            \
+        int offset = GGML_F16_ARR >> 1;                             \
+        for (int i = 0; i < offset; ++i) {                          \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+        }                                                           \
+        offset >>= 1;                                               \
+        for (int i = 0; i < offset; ++i) {                          \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+        }                                                           \
+        offset >>= 1;                                               \
+        for (int i = 0; i < offset; ++i) {                          \
+            (x)[i] = vaddq_f16((x)[i], (x)[offset+i]);              \
+        }                                                           \
+        const float32x4_t t0 = vcvt_f32_f16(vget_low_f16 ((x)[0])); \
+        const float32x4_t t1 = vcvt_f32_f16(vget_high_f16((x)[0])); \
+        (res) = (ggml_float) vaddvq_f32(vaddq_f32(t0, t1));         \
      } while (0)
  
      #define GGML_F16_VEC                GGML_F16x8
      #define GGML_F16_VEC_ZERO           GGML_F16x8_ZERO
      #define GGML_F16_VEC_SET1           GGML_F16x8_SET1
      #define GGML_F16_VEC_LOAD(p, i)     GGML_F16x8_LOAD(p)
-    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), r[i])
+    #define GGML_F16_VEC_STORE(p, r, i) GGML_F16x8_STORE((ggml_fp16_internal_t *)(p), (r)[i])
      #define GGML_F16_VEC_FMA            GGML_F16x8_FMA
      #define GGML_F16_VEC_ADD            GGML_F16x8_ADD
      #define GGML_F16_VEC_MUL            GGML_F16x8_MUL
@@ -1893,6 +1894,23 @@ static inline void __lsx_f16x4_store(ggml_fp16_t * x, __m128 y) {
  #define GGML_F16_ARR (GGML_F16_STEP/GGML_F16_EPR)
  #endif
  
+//
+// ggml object
+//
+
+struct ggml_object {
+    size_t offs;
+    size_t size;
+
+    struct ggml_object * next;
+
+    enum ggml_object_type type;
+
+    char padding[4];
+};
+
+static const size_t GGML_OBJECT_SIZE = sizeof(struct ggml_object);
+
  //
  // ggml context
  //
@@ -19161,6 +19179,34 @@ void ggml_graph_clear(struct ggml_cgraph * cgraph) {
      ggml_hash_set_reset(&cgraph->visited_hash_set);
  }
  
+int ggml_graph_size(struct ggml_cgraph * cgraph) {
+    return cgraph->size;
+}
+
+struct ggml_tensor * ggml_graph_node(struct ggml_cgraph * cgraph, int i) {
+    if (i < 0) {
+        GGML_ASSERT(cgraph->n_nodes + i >= 0);
+        return cgraph->nodes[cgraph->n_nodes + i];
+    }
+
+    GGML_ASSERT(i < cgraph->n_nodes);
+    return cgraph->nodes[i];
+}
+
+struct ggml_tensor ** ggml_graph_nodes(struct ggml_cgraph * cgraph) {
+    return cgraph->nodes;
+}
+
+int ggml_graph_n_nodes(struct ggml_cgraph * cgraph) {
+    return cgraph->n_nodes;
+}
+
+void ggml_graph_add_node(struct ggml_cgraph * cgraph, struct ggml_tensor * tensor) {
+    GGML_ASSERT(cgraph->size > cgraph->n_nodes);
+    cgraph->nodes[cgraph->n_nodes] = tensor;
+    cgraph->n_nodes++;
+}
+
  // Android's libc implementation "bionic" does not support setting affinity
  #if defined(__gnu_linux__)
  static void set_numa_thread_affinity(int thread_n) {
diff --git a/src/llama.cpp b/src/llama.cpp

index f1a95b3a3d09fdcf30458483f1c6898b0e9e4e89..0f80b2402728e1898e10a04ae34e1b3c671b46ac 100644 (file)
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -9877,8 +9877,8 @@ struct llm_build_context {
      struct ggml_cgraph * append_pooling(struct ggml_cgraph * gf) {
          // find result_norm tensor for input
          struct ggml_tensor * inp = nullptr;
-        for (int i = gf->n_nodes - 1; i >= 0; --i) {
-            inp = gf->nodes[i];
+        for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+            inp = ggml_graph_node(gf, i);
              if (strcmp(inp->name, "result_norm") == 0 || strcmp(inp->name, "result_embd") == 0) {
                  break;
              } else {
@@ -16207,8 +16207,8 @@ static int llama_decode_internal(
          ggml_cgraph * gf = llama_build_graph(lctx, ubatch, false);
  
          // the output is always the last tensor in the graph
-        struct ggml_tensor * res  = gf->nodes[gf->n_nodes - 1];
-        struct ggml_tensor * embd = gf->nodes[gf->n_nodes - 2];
+        struct ggml_tensor * res  = ggml_graph_node(gf, -1);
+        struct ggml_tensor * embd = ggml_graph_node(gf, -2);
  
          if (lctx.n_outputs == 0) {
              // no output
@@ -16217,9 +16217,9 @@ static int llama_decode_internal(
          } else if (cparams.embeddings) {
              res  = nullptr; // do not extract logits for embedding case
              embd = nullptr;
-            for (int i = gf->n_nodes - 1; i >= 0; --i) {
-                if (strcmp(gf->nodes[i]->name, "result_embd_pooled") == 0) {
-                    embd = gf->nodes[i];
+            for (int i = ggml_graph_n_nodes(gf) - 1; i >= 0; --i) {
+                if (strcmp(ggml_graph_node(gf, i)->name, "result_embd_pooled") == 0) {
+                    embd = ggml_graph_node(gf, i);
                      break;
                  }
              }
@@ -16436,15 +16436,15 @@ static int llama_encode_internal(
      // there are two cases here
      if (llama_model_has_decoder(&lctx.model)) {
          // first case is an encoder-decoder T5 model where embeddings are passed to decoder
-        embd = gf->nodes[gf->n_nodes - 1];
+        embd = ggml_graph_node(gf, -1);
          GGML_ASSERT(strcmp(embd->name, "result_norm") == 0 && "missing result_output tensor");
      } else {
          // second case is an encoder-only T5 model
          if (cparams.embeddings) {
              // only output embeddings if required
-            embd = gf->nodes[gf->n_nodes - 1];
+            embd = ggml_graph_node(gf, -1);
              if (strcmp(embd->name, "result_embd_pooled") != 0) {
-                embd = gf->nodes[gf->n_nodes - 2];
+                embd = ggml_graph_node(gf, -2);
              }
              GGML_ASSERT(strcmp(embd->name, "result_embd_pooled") == 0 && "missing embeddings tensor");
          }
@@ -18492,7 +18492,7 @@ struct llama_context * llama_new_context_with_model(
  
              // note: the number of splits during measure is higher than during inference due to the kv shift
              int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
-            LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, gf->n_nodes);
+            LLAMA_LOG_INFO("%s: graph nodes  = %d\n", __func__, ggml_graph_n_nodes(gf));
              LLAMA_LOG_INFO("%s: graph splits = %d\n", __func__, n_splits);
          }
      }
diff --git a/tests/test-backend-ops.cpp b/tests/test-backend-ops.cpp

index 635de01d70439c60d1c7a91b7f0ef89f2a236740..aa7896defdad0721752e09f45d6cccb761ac5d54 100644 (file)
--- a/tests/test-backend-ops.cpp
+++ b/tests/test-backend-ops.cpp
@@ -519,7 +519,7 @@ struct test_case {
  
          // add sentinels as graph nodes so that they are checked in the callback
          for (ggml_tensor * sentinel : sentinels) {
-            gf->nodes[gf->n_nodes++] = sentinel;
+            ggml_graph_add_node(gf, sentinel);
          }
  
          // randomize tensors
@@ -679,9 +679,9 @@ struct test_case {
  
          // duplicate the op
          size_t target_size = ggml_backend_is_cpu(backend) ? 1ULL << 33 : 1ULL << 35; // 8 GB CPU, 32 GB GPU
-        int n_runs = std::min((size_t)gf->size - gf->n_nodes, target_size / op_size(out)) + 1;
+        int n_runs = std::min((size_t) ggml_graph_size(gf) - ggml_graph_n_nodes(gf), target_size / op_size(out)) + 1;
          for (int i = 1; i < n_runs; i++) {
-            gf->nodes[gf->n_nodes++] = out;
+            ggml_graph_add_node(gf, out);
          }
  
          // calculate memory
@@ -696,11 +696,11 @@ struct test_case {
              }
              return size;
          };
-        for (int i = 0; i < gf->n_nodes; i++) {
-            if (ggml_is_view_op(gf->nodes[i]->op) || gf->nodes[i] == out) {
+        for (int i = 0; i < ggml_graph_n_nodes(gf); ++i) {
+            if (ggml_is_view_op(ggml_graph_node(gf, i)->op) || ggml_graph_node(gf, i) == out) {
                  continue;
              }
-            mem += tensor_op_size(gf->nodes[i]);
+            mem += tensor_op_size(ggml_graph_node(gf, i));
          }
  
          // run
@@ -804,7 +804,7 @@ struct test_case {
          ggml_graph_cpy(gf, gb);
          ggml_build_backward_expand(ctx, gf, gb, false);
          if (expect.size() != 1 || expect[0] != 0.0f) {
-            GGML_ASSERT(gb->n_nodes > gf->n_nodes);
+            GGML_ASSERT(ggml_graph_n_nodes(gb) > ggml_graph_n_nodes(gf));
              for (ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
                  GGML_ASSERT(!(t->flags & GGML_TENSOR_FLAG_PARAM) || t->grad->op != GGML_OP_NONE);
              }
author	Georgi Gerganov <redacted>
	Thu, 12 Sep 2024 11:23:49 +0000 (14:23 +0300)
committer	GitHub <redacted>
	Thu, 12 Sep 2024 11:23:49 +0000 (14:23 +0300)
examples/benchmark/benchmark-matmult.cpp		patch \| blob \| history
examples/cvector-generator/pca.hpp		patch \| blob \| history
examples/export-lora/export-lora.cpp		patch \| blob \| history
examples/llava/clip.cpp		patch \| blob \| history
examples/llava/llava.cpp		patch \| blob \| history
ggml/include/ggml.h		patch \| blob \| history
ggml/src/ggml-blas.cpp		patch \| blob \| history
ggml/src/ggml-cann.cpp		patch \| blob \| history
ggml/src/ggml-cuda.cu		patch \| blob \| history
ggml/src/ggml-impl.h		patch \| blob \| history
ggml/src/ggml-kompute.cpp		patch \| blob \| history
ggml/src/ggml-metal.m		patch \| blob \| history
ggml/src/ggml-rpc.cpp		patch \| blob \| history
ggml/src/ggml-sycl.cpp		patch \| blob \| history
ggml/src/ggml-vulkan.cpp		patch \| blob \| history
ggml/src/ggml.c		patch \| blob \| history
src/llama.cpp		patch \| blob \| history
tests/test-backend-ops.cpp		patch \| blob \| history