////////////////////////////////////////////////////////////////////////////////
void ggml_print_object(const struct ggml_object * obj) {
- GGML_PRINT(" - ggml_object: offset = %zu, size = %zu, next = %p\n",
- obj->offs, obj->size, (const void *) obj->next);
+ GGML_PRINT(" - ggml_object: type = %d, offset = %zu, size = %zu, next = %p\n",
+ obj->type, obj->offs, obj->size, (const void *) obj->next);
}
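
// For reference, the hunks below rely on the matching ggml.h change that tags every
// pool entry with an object type. A sketch of those declarations (field order and the
// trailing padding are assumptions made for illustration):
enum ggml_object_type {
    GGML_OBJECT_TENSOR,
    GGML_OBJECT_GRAPH,
    GGML_OBJECT_WORK_BUFFER,
};

struct ggml_object {
    size_t offs;                 // payload offset within the context's mem_buffer
    size_t size;                 // payload size, already padded to GGML_MEM_ALIGN
    struct ggml_object * next;   // next object in the pool
    enum ggml_object_type type;  // tensor, graph, or work buffer
    char padding[4];
};
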
void ggml_print_objects(const struct ggml_context * ctx) {
}
size_t ggml_tensor_overhead(void) {
- return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE + 16;
+ return GGML_OBJECT_SIZE + GGML_TENSOR_SIZE;
}
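
// The hard-coded "+ 16" fudge is gone because ggml_new_object() now does the alignment
// padding itself, so ggml_tensor_overhead() is exactly the per-tensor bookkeeping cost.
// A minimal sketch of the usual sizing pattern for a metadata-only context; the helper
// name and the tensor count are illustrative assumptions:
static struct ggml_context * example_meta_ctx(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 8*ggml_tensor_overhead() + ggml_graph_overhead(), // 8 tensors + 1 graph
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true, // metadata only, tensor data is allocated elsewhere
    };
    return ggml_init(params);
}
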
bool ggml_is_transposed(const struct ggml_tensor * tensor) {
return NULL;
}
- const size_t mem_size = (params.mem_size + GGML_MEM_ALIGN - 1) & ~(GGML_MEM_ALIGN - 1);
+ const size_t mem_size = params.mem_buffer ? params.mem_size : GGML_PAD(params.mem_size, GGML_MEM_ALIGN);
*ctx = (struct ggml_context) {
/*.mem_size =*/ mem_size,
struct ggml_object * obj = ctx->objects_begin;
while (obj != NULL) {
- struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
+ if (obj->type == GGML_OBJECT_TENSOR) {
+ struct ggml_tensor * tensor = (struct ggml_tensor *) ((char *) ctx->mem_buffer + obj->offs);
- const size_t size = ggml_nbytes(tensor);
+ const size_t size = ggml_nbytes(tensor);
- if (max_size < size) {
- max_size = size;
+ if (max_size < size) {
+ max_size = size;
+ }
}
obj = obj->next;
////////////////////////////////////////////////////////////////////////////////
-static struct ggml_tensor * ggml_new_tensor_impl(
- struct ggml_context * ctx,
- enum ggml_type type,
- int n_dims,
- const int64_t* ne,
- void* data) {
+static struct ggml_object * ggml_new_object(struct ggml_context * ctx, enum ggml_object_type type, size_t size) {
// always insert objects at the end of the context's memory pool
struct ggml_object * obj_cur = ctx->objects_end;
const size_t cur_offs = obj_cur == NULL ? 0 : obj_cur->offs;
const size_t cur_size = obj_cur == NULL ? 0 : obj_cur->size;
const size_t cur_end = cur_offs + cur_size;
- size_t size_needed = 0;
-
- if (data == NULL && !ctx->no_alloc) {
- size_needed += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
- for (int i = 1; i < n_dims; i++) {
- size_needed *= ne[i];
- }
- // align to GGML_MEM_ALIGN
- size_needed = ((size_needed + GGML_MEM_ALIGN - 1)/GGML_MEM_ALIGN)*GGML_MEM_ALIGN;
- }
+ // align to GGML_MEM_ALIGN
+ size_t size_needed = GGML_PAD(size, GGML_MEM_ALIGN);
char * const mem_buffer = ctx->mem_buffer;
struct ggml_object * const obj_new = (struct ggml_object *)(mem_buffer + cur_end);
- if (ctx->scratch.data == NULL || data != NULL) {
- size_needed += GGML_TENSOR_SIZE;
+ if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
+ GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
+ __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
+ assert(false);
+ return NULL;
+ }
- if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
- GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
- __func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
- assert(false);
- return NULL;
- }
+ *obj_new = (struct ggml_object) {
+ .offs = cur_end + GGML_OBJECT_SIZE,
+ .size = size_needed,
+ .next = NULL,
+ .type = type,
+ };
- *obj_new = (struct ggml_object) {
- .offs = cur_end + GGML_OBJECT_SIZE,
- .size = size_needed,
- .next = NULL,
- };
+ ggml_assert_aligned(mem_buffer + obj_new->offs);
+
+ if (obj_cur != NULL) {
+ obj_cur->next = obj_new;
} else {
- if (ctx->scratch.offs + size_needed > ctx->scratch.size) {
- GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
- __func__, ctx->scratch.offs + size_needed, ctx->scratch.size);
- assert(false);
- return NULL;
+ // this is the first object in this context
+ ctx->objects_begin = obj_new;
+ }
+
+ ctx->objects_end = obj_new;
+
+ //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
+
+ return obj_new;
+}
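
// Every allocation now funnels through ggml_new_object(), so the pool cost of any object
// is uniform. A hypothetical helper (not part of the change) stating the invariant that
// the overhead functions rely on:
static size_t example_pool_cost(size_t payload_size) {
    // one object header plus the payload rounded up to the memory alignment
    return GGML_OBJECT_SIZE + GGML_PAD(payload_size, GGML_MEM_ALIGN);
}
// e.g. ggml_graph_overhead()  == example_pool_cost(GGML_GRAPH_SIZE), and
//      ggml_tensor_overhead() == example_pool_cost(GGML_TENSOR_SIZE) whenever
//      GGML_TENSOR_SIZE is already a multiple of GGML_MEM_ALIGN
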
+
+static struct ggml_tensor * ggml_new_tensor_impl(
+ struct ggml_context * ctx,
+ enum ggml_type type,
+ int n_dims,
+ const int64_t* ne,
+ void* data) {
+
+ size_t data_size = 0;
+
+ if (data == NULL && !ctx->no_alloc) {
+ data_size += GGML_TYPE_SIZE[type]*(ne[0]/GGML_BLCK_SIZE[type]);
+ for (int i = 1; i < n_dims; i++) {
+ data_size *= ne[i];
}
+ }
- if (cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE > ctx->mem_size) {
- GGML_PRINT("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
- __func__, cur_end + GGML_TENSOR_SIZE + GGML_OBJECT_SIZE, ctx->mem_size);
+ if (ctx->scratch.data != NULL && data == NULL) {
+ // allocate tensor data in the scratch buffer
+ if (ctx->scratch.offs + data_size > ctx->scratch.size) {
+ GGML_PRINT("%s: not enough space in the scratch memory pool (needed %zu, available %zu)\n",
+ __func__, ctx->scratch.offs + data_size, ctx->scratch.size);
assert(false);
return NULL;
}
data = (char * const) ctx->scratch.data + ctx->scratch.offs;
- *obj_new = (struct ggml_object) {
- .offs = cur_end + GGML_OBJECT_SIZE,
- .size = GGML_TENSOR_SIZE,
- .next = NULL,
- };
-
- //printf("scratch offs = %zu, size_needed = %zu\n", ctx->scratch.offs, size_needed);
+ ctx->scratch.offs += data_size;
- ctx->scratch.offs += size_needed;
+ data_size = 0;
}
- if (obj_cur != NULL) {
- obj_cur->next = obj_new;
- } else {
- // this is the first object in this context
- ctx->objects_begin = obj_new;
- }
-
- ctx->objects_end = obj_new;
-
- //printf("%s: inserted new object at %zu, size = %zu\n", __func__, cur_end, obj_new->size);
+ struct ggml_object * const obj_new = ggml_new_object(ctx, GGML_OBJECT_TENSOR, GGML_TENSOR_SIZE + data_size);
- struct ggml_tensor * const result = (struct ggml_tensor *)(mem_buffer + obj_new->offs);
+ // TODO: for recoverable errors, we would need to free the data allocated from the scratch buffer here
- ggml_assert_aligned(result);
+ struct ggml_tensor * const result = (struct ggml_tensor *)((char *)ctx->mem_buffer + obj_new->offs);
*result = (struct ggml_tensor) {
/*.type =*/ type,
char * const mem_buffer = ctx->mem_buffer;
while (obj != NULL) {
- struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
- if (strcmp(cur->name, name) == 0) {
- return cur;
+ if (obj->type == GGML_OBJECT_TENSOR) {
+ struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
+ if (strcmp(cur->name, name) == 0) {
+ return cur;
+ }
}
obj = obj->next;
return result;
}
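
// Name lookup is unchanged for callers; it simply skips the graph and work-buffer objects
// that may now sit in the same pool. A minimal usage sketch (the tensor shape and name are
// illustrative assumptions):
static void example_lookup(struct ggml_context * ctx) {
    struct ggml_tensor * a = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 16);
    ggml_set_name(a, "embd");

    // a non-tensor object between the tensor and the lookup is ignored
    struct ggml_cgraph * gf = ggml_new_graph(ctx);
    (void) gf;

    GGML_ASSERT(ggml_get_tensor(ctx, "embd") == a);
}
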
+struct ggml_cgraph * ggml_new_graph(struct ggml_context * ctx) {
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_GRAPH, GGML_GRAPH_SIZE);
+ struct ggml_cgraph * cgraph = (struct ggml_cgraph *) ((char *) ctx->mem_buffer + obj->offs);
+
+ *cgraph = (struct ggml_cgraph) {
+ /*.n_nodes =*/ 0,
+ /*.n_leafs =*/ 0,
+ /*.nodes =*/ { NULL },
+ /*.grads =*/ { NULL },
+ /*.leafs =*/ { NULL },
+ /*.hash_table =*/ { NULL },
+ /*.perf_runs =*/ 0,
+ /*.perf_cycles =*/ 0,
+ /*.perf_time_us =*/ 0,
+ };
+
+ return cgraph;
+}
+
+struct ggml_cgraph * ggml_build_forward_ctx(struct ggml_context * ctx, struct ggml_tensor * tensor) {
+ struct ggml_cgraph * cgraph = ggml_new_graph(ctx);
+ ggml_build_forward_impl(cgraph, tensor, false);
+ return cgraph;
+}
+
+size_t ggml_graph_overhead(void) {
+ return GGML_OBJECT_SIZE + GGML_PAD(GGML_GRAPH_SIZE, GGML_MEM_ALIGN);
+}
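
// Because the graph is now a pool object, callers that pre-size a context must budget for
// it, and ggml_graph_overhead() is that budget. A sizing sketch; the tensor count and data
// size are illustrative estimates:
static size_t example_ctx_size(void) {
    const size_t n_tensors        = 64;            // illustrative estimate
    const size_t tensor_data_size = 16u*1024*1024; // illustrative estimate

    return ggml_graph_overhead()              // the ggml_cgraph created by ggml_new_graph()
         + n_tensors*ggml_tensor_overhead()   // tensor metadata
         + tensor_data_size;                  // tensor data
}
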
+
//
// thread data
//
void ggml_graph_compute_with_ctx(struct ggml_context * ctx, struct ggml_cgraph * cgraph, int n_threads) {
struct ggml_cplan cplan = ggml_graph_plan(cgraph, n_threads);
- struct ggml_tensor * buf = ggml_new_tensor_1d(ctx, GGML_TYPE_I8, cplan.work_size);
- GGML_ASSERT(buf);
+ struct ggml_object * obj = ggml_new_object(ctx, GGML_OBJECT_WORK_BUFFER, cplan.work_size);
- cplan.work_data = buf->data;
+ cplan.work_data = (uint8_t *)ctx->mem_buffer + obj->offs;
ggml_graph_compute(cgraph, &cplan);
}
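
// With the work buffer also carved out of the context pool, one context can back the
// tensors, the graph, and the compute scratch. A minimal end-to-end sketch (sizes and
// thread count are illustrative, error handling omitted):
static void example_compute(void) {
    struct ggml_init_params params = {
        /*.mem_size   =*/ 16*1024*1024 + ggml_graph_overhead(), // illustrative
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ false,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * a = ggml_set_f32(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4), 1.0f);
    struct ggml_tensor * b = ggml_set_f32(ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 4), 2.0f);
    struct ggml_tensor * c = ggml_add(ctx, a, b);

    struct ggml_cgraph * gf = ggml_build_forward_ctx(ctx, c); // graph allocated inside ctx
    ggml_graph_compute_with_ctx(ctx, gf, /*n_threads=*/ 4);   // work buffer allocated inside ctx

    ggml_free(ctx);
}
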
struct ggml_context * ctx0 = ggml_init(params);
- ggml_cgraph gf = {};
+ ggml_cgraph * gf = ggml_new_graph(ctx0);
// for big prompts, if BLAS is enabled, it is better to use only one thread
// otherwise, the threads are spin-lock waiting for the BLAS calls and are degrading the performance
ggml_set_name(v, "v");
// important: storing RoPE-ed version of K in the KV cache!
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Kcur, k));
- ggml_build_forward_expand(&gf, ggml_cpy(ctx0, Vcur, v));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Kcur, k));
+ ggml_build_forward_expand(gf, ggml_cpy(ctx0, Vcur, v));
}
struct ggml_tensor * Q =
//cur = ggml_soft_max_inplace(ctx0, cur);
// run the computation
- ggml_build_forward_expand(&gf, cur);
+ ggml_build_forward_expand(gf, cur);
// fprintf(stderr, "graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf.n_nodes, gf.n_leafs);
#if GGML_USE_MPI
- ggml_mpi_graph_compute_pre(lctx.ctx_mpi, &gf, n_layer);
+ ggml_mpi_graph_compute_pre(lctx.ctx_mpi, gf, n_layer);
#endif
#ifdef GGML_USE_METAL
if (lctx.ctx_metal && N == 1) {
if (!ggml_metal_if_optimized(lctx.ctx_metal)) {
- ggml_metal_graph_find_concurrency(lctx.ctx_metal,&gf);
+ ggml_metal_graph_find_concurrency(lctx.ctx_metal, gf);
}
ggml_metal_set_n_cb (lctx.ctx_metal, n_threads);
- ggml_metal_graph_compute(lctx.ctx_metal, &gf);
+ ggml_metal_graph_compute(lctx.ctx_metal, gf);
ggml_metal_get_tensor (lctx.ctx_metal, cur);
} else {
// IMPORTANT:
ggml_metal_get_tensor(lctx.ctx_metal, kv_self.v);
}
- ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
+ ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
}
#else
- ggml_graph_compute_helper(lctx.work_buffer, &gf, n_threads);
+ ggml_graph_compute_helper(lctx.work_buffer, gf, n_threads);
#endif
#if GGML_USE_MPI
- ggml_mpi_graph_compute_post(lctx.ctx_mpi, &gf, n_layer);
+ ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
#endif
// update kv token count
lctx.kv_self.n = n_past + N;
- struct ggml_tensor * res = gf.nodes[gf.n_nodes - 1];
+ struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
if (cgraph_fname) {
- ggml_graph_export(&gf, cgraph_fname);
+ ggml_graph_export(gf, cgraph_fname);
}
#ifdef GGML_PERF
// print timing information per ggml operation (for debugging purposes)
// requires GGML_PERF to be defined
- ggml_graph_print(&gf);
+ ggml_graph_print(gf);
#endif
// plot the computation graph in dot format (for debugging purposes)
//if (n_past%100 == 0) {
- // ggml_graph_dump_dot(&gf, NULL, "llama.dot");
+ // ggml_graph_dump_dot(gf, NULL, "llama.dot");
//}
// extract logits
ctx->embedding.resize(hparams.n_embd);
}
- ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type));
+ ctx->buf_compute.resize(MEM_REQ_EVAL().at(ctx->model.type) + ggml_graph_overhead());
ctx->buf_scratch[0].resize(MEM_REQ_SCRATCH0(hparams.n_ctx).at(ctx->model.type));
ctx->buf_scratch[1].resize(MEM_REQ_SCRATCH1().at(ctx->model.type));