// backend buffer type
+const char * ggml_backend_buft_name(ggml_backend_buffer_type_t buft) {
+ return buft->iface.get_name(buft);
+}
+
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
return buft->iface.alloc_buffer(buft, size);
}
/* .buft = */ buft,
/* .context = */ context,
/* .size = */ size,
+ /* .usage = */ GGML_BACKEND_BUFFER_USAGE_ANY
};
return buffer;
}
+const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
+ return buffer->iface.get_name(buffer);
+}
+
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
if (buffer == NULL) {
return;
}
size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer) {
- return ggml_backend_buft_get_alignment(ggml_backend_buffer_type(buffer));
+ return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
}
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
- return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type(buffer), tensor);
+ return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
}
void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
}
bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
- return ggml_backend_buft_is_host(ggml_backend_buffer_type(buffer));
+ return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
}
-ggml_backend_buffer_type_t ggml_backend_buffer_type(ggml_backend_buffer_t buffer) {
+void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
+ buffer->usage = usage;
+}
+
+ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
return buffer->buft;
}
+void ggml_backend_buffer_reset(ggml_backend_buffer_t buffer) {
+ if (buffer->iface.reset) {
+ buffer->iface.reset(buffer);
+ }
+}
+
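+// attempt a direct copy using the destination buffer's cpy_tensor implementation; returns false if the buffer does not provide one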
+bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst) {
+ ggml_backend_buffer_t dst_buf = dst->view_src ? dst->view_src->buffer : dst->buffer;
+ if (dst_buf->iface.cpy_tensor) {
+ return dst_buf->iface.cpy_tensor(dst_buf, src, dst);
+ }
+ return false;
+}
+
// backend
const char * ggml_backend_name(ggml_backend_t backend) {
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
- backend->iface.set_tensor_async(backend, tensor, data, offset, size);
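+ // fall back to a synchronous set if the backend does not implement set_tensor_async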
+ if (backend->iface.set_tensor_async == NULL) {
+ ggml_backend_tensor_set(tensor, data, offset, size);
+ } else {
+ backend->iface.set_tensor_async(backend, tensor, data, offset, size);
+ }
}
void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
- backend->iface.get_tensor_async(backend, tensor, data, offset, size);
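+ // fall back to a synchronous get if the backend does not implement get_tensor_async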
+ if (backend->iface.get_tensor_async == NULL) {
+ ggml_backend_tensor_get(tensor, data, offset, size);
+ } else {
+ backend->iface.get_tensor_async(backend, tensor, data, offset, size);
+ }
}
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
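+ // views are stored in the buffer of their view_src, so resolve the owning buffer first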
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
- GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
+ GGML_ASSERT(buf != NULL && "tensor buffer not set");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
- tensor->buffer->iface.set_tensor(tensor->buffer, tensor, data, offset, size);
+ buf->iface.set_tensor(buf, tensor, data, offset, size);
}
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
- GGML_ASSERT(tensor->buffer != NULL && "tensor buffer not set");
+ GGML_ASSERT(buf != NULL && "tensor buffer not set");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
- tensor->buffer->iface.get_tensor(tensor->buffer, tensor, data, offset, size);
+ buf->iface.get_tensor(buf, tensor, data, offset, size);
}
void ggml_backend_synchronize(ggml_backend_t backend) {
void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
backend->iface.graph_plan_compute(backend, plan);
-
- // TODO: optional sync
- ggml_backend_synchronize(backend);
}
bool ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
- if (!backend->iface.graph_compute(backend, cgraph)) {
- return false;
- }
-
- // TODO: optional sync
- ggml_backend_synchronize(backend);
- return true;
+ return backend->iface.graph_compute(backend, cgraph);
}
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
}
void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
- //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
- //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
- // fprintf(stderr, "cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
-
if (src == dst) {
return;
}
- // TODO: allow backends to support copy to/from same backend
-
- if (dst->buffer->iface.cpy_tensor_from != NULL) {
- dst->buffer->iface.cpy_tensor_from(dst->buffer, src, dst);
- } else if (src->buffer->iface.cpy_tensor_to != NULL) {
- src->buffer->iface.cpy_tensor_to(src->buffer, src, dst);
- } else {
- // shouldn't be hit when copying from/to CPU
- #ifndef NDEBUG
- fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to "
- "are implemented for %s and %s, falling back to get/set\n", src->name, dst->name);
- #endif
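+ // if either buffer is in host memory, use a direct get/set; otherwise try the backend copy and fall back to staging through host memory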
+ if (ggml_backend_buffer_is_host(src->buffer)) {
+ ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
+ } else if (ggml_backend_buffer_is_host(dst->buffer)) {
+ ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
+ } else if (!ggml_backend_buffer_copy_tensor(src, dst)) {
+#ifndef NDEBUG
+ fprintf(stderr, "%s: warning: slow copy from %s to %s\n", __func__, ggml_backend_buffer_name(src->buffer), ggml_backend_buffer_name(dst->buffer));
+#endif
size_t nbytes = ggml_nbytes(src);
void * data = malloc(nbytes);
ggml_backend_tensor_get(src, data, 0, nbytes);
ggml_backend_tensor_set(dst, data, 0, nbytes);
free(data);
}
}
+void ggml_backend_tensor_copy_async(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+ GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
+
+ if (src == dst) {
+ return;
+ }
+
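+ // only use the backend's async copy when it can access both the source and destination buffers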
+ if (ggml_backend_buft_supports_backend(src->buffer->buft, backend) && ggml_backend_buft_supports_backend(dst->buffer->buft, backend)) {
+ if (backend->iface.cpy_tensor_async != NULL) {
+ if (backend->iface.cpy_tensor_async(backend, src, dst)) {
+ return;
+ }
+ }
+ }
+
+ size_t nbytes = ggml_nbytes(src);
+ if (ggml_backend_buffer_is_host(src->buffer)) {
+ ggml_backend_tensor_set_async(backend, dst, src->data, 0, nbytes);
+ } else {
+ ggml_backend_tensor_copy(src, dst);
+ }
+}
+
+
// backend registry
#define GGML_MAX_BACKENDS_REG 16
// backend CPU
+static const char * ggml_backend_cpu_buffer_name(ggml_backend_buffer_t buffer) {
+ return "CPU";
+
+ GGML_UNUSED(buffer);
+}
+
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
return (void *)buffer->context;
}
GGML_UNUSED(buffer);
}
-static void ggml_backend_cpu_buffer_cpy_tensor_from(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
- ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
-
- GGML_UNUSED(buffer);
-}
-
-static void ggml_backend_cpu_buffer_cpy_tensor_to(ggml_backend_buffer_t buffer, struct ggml_tensor * src, struct ggml_tensor * dst) {
- ggml_backend_tensor_set(dst, src->data, 0, ggml_nbytes(src));
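+// the CPU buffer can only copy directly when the source is also in host memory; other copies are handled by the caller's fallback path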
+static bool ggml_backend_cpu_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const struct ggml_tensor * src, struct ggml_tensor * dst) {
+ if (ggml_backend_buffer_is_host(src->buffer)) {
+ memcpy(dst->data, src->data, ggml_nbytes(src));
+ return true;
+ }
+ return false;
GGML_UNUSED(buffer);
}
}
static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
+ /* .get_name = */ ggml_backend_cpu_buffer_name,
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
- /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
- /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+ /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
/* .clear = */ ggml_backend_cpu_buffer_clear,
+ /* .reset = */ NULL,
};
// for buffers from ptr, free is not called
static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
+ /* .get_name = */ ggml_backend_cpu_buffer_name,
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
/* .set_tensor = */ ggml_backend_cpu_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cpu_buffer_get_tensor,
- /* .cpy_tensor_from = */ ggml_backend_cpu_buffer_cpy_tensor_from,
- /* .cpy_tensor_to = */ ggml_backend_cpu_buffer_cpy_tensor_to,
+ /* .cpy_tensor = */ ggml_backend_cpu_buffer_cpy_tensor,
/* .clear = */ ggml_backend_cpu_buffer_clear,
+ /* .reset = */ NULL,
};
static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
+static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+ return "CPU";
+
+ GGML_UNUSED(buft);
+}
+
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
ggml_backend_buffer_type_t ggml_backend_cpu_buffer_type(void) {
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
/* .iface = */ {
+ /* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
#include <hbwmalloc.h>
+static const char * ggml_backend_cpu_hbm_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
+ return "CPU_HBM";
+
+ GGML_UNUSED(buft);
+}
+
+static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
+ return "CPU_HBM";
+
+ GGML_UNUSED(buf);
+}
+
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
hbw_free(buffer->context);
}
return NULL;
}
- // FIXME: this is a hack to avoid having to implement a new buffer type
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft;
+ buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
return buffer;
}
-ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type() {
+ggml_backend_buffer_type_t ggml_backend_cpu_hbm_buffer_type(void) {
static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type_hbm = {
/* .iface = */ {
+ /* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
struct ggml_cgraph cgraph;
};
-static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, const struct ggml_cgraph * cgraph) {
struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
/* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
- /* .cpy_tensor_from_async = */ NULL,
- /* .cpy_tensor_to_async = */ NULL,
+ /* .cpy_tensor_async = */ NULL,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
}
bool ggml_backend_is_cpu(ggml_backend_t backend) {
- return backend->iface.get_name == ggml_backend_cpu_name;
+ return backend && backend->iface.get_name == ggml_backend_cpu_name;
}
void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
// scheduler
-#define GGML_MAX_BACKENDS 4
+#define GGML_MAX_BACKENDS 16
#define GGML_MAX_SPLITS 256
#define GGML_MAX_SPLIT_INPUTS 16
int i_end;
struct ggml_tensor * inputs[GGML_MAX_SPLIT_INPUTS];
int n_inputs;
+ // graph view of this split
struct ggml_cgraph graph;
};
struct ggml_backend_sched {
+ bool is_reset; // true if the scheduler has been reset since the last graph split
+
int n_backends;
ggml_backend_t backends[GGML_MAX_BACKENDS];
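+ // buffer type used for the compute buffer of each backend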
+ ggml_backend_buffer_type_t bufts[GGML_MAX_BACKENDS];
ggml_tallocr_t tallocs[GGML_MAX_BACKENDS];
ggml_gallocr_t galloc;
+ // hash keys of the nodes in the graph
struct ggml_hash_set hash_set;
- ggml_tallocr_t * node_talloc; // [hash_set.size]
- struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // [hash_set.size][GGML_MAX_BACKENDS]
+ // hash values (arrays of [hash_set.size])
+ ggml_tallocr_t * node_talloc; // tallocr assigned to each node (indirectly this is the backend)
+ struct ggml_tensor * (* node_copies)[GGML_MAX_BACKENDS]; // copies of each node for each destination backend
+ // copy of the graph with modified inputs
struct ggml_cgraph * graph;
+
struct ggml_backend_sched_split splits[GGML_MAX_SPLITS];
int n_splits;
return INT_MAX;
}
-static ggml_backend_t get_buffer_backend(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
+static ggml_tallocr_t sched_allocr_from_buffer(ggml_backend_sched_t sched, ggml_backend_buffer_t buffer) {
if (buffer == NULL) {
return NULL;
}
+
+ // check if this is already allocated in an allocr buffer (from user manual allocations)
+ for (int i = 0; i < sched->n_backends; i++) {
+ if (ggml_tallocr_get_buffer(sched->tallocs[i]) == buffer) {
+ return sched->tallocs[i];
+ }
+ }
+
// find highest prio backend that supports the buffer type
for (int i = 0; i < sched->n_backends; i++) {
if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
- return sched->backends[i];
+ return sched->tallocs[i];
}
}
GGML_ASSERT(false && "tensor buffer type not supported by any backend");
if (allocr == NULL) {
return NULL;
}
- // find highest prio backend that supports the buffer type
for (int i = 0; i < sched->n_backends; i++) {
if (sched->tallocs[i] == allocr) {
return sched->backends[i];
}
#if 0
-static char causes[GGML_DEFAULT_GRAPH_SIZE*8 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug, remove
+static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS][128]; // debug only
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
#define GET_CAUSE(node) causes[hash_id(node)]
#else
#endif
// returns the backend that should be used for the node based on the current locations
-static ggml_backend_t sched_backend_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
- // if the dst tensor is already allocated in a buffer, we must assume that it is critical to keep it there
- // ie. kv cache updates
- // note that this doesn't allow fallback to CPU. need to add output tensors to the splits to copy the data back to the original backend.
+static ggml_tallocr_t sched_allocr_from_cur(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+ // assign pre-allocated nodes to their backend
// dst
- ggml_backend_t cur_backend = get_buffer_backend(sched, node->buffer);
- if (cur_backend != NULL) {
+ ggml_tallocr_t cur_allocr = sched_allocr_from_buffer(sched, node->buffer);
+ if (cur_allocr != NULL) {
SET_CAUSE(node, "1.dst");
- return cur_backend;
+ return cur_allocr;
}
-
// view_src
- if (node->view_src != NULL && get_buffer_backend(sched, node->view_src->buffer) != NULL) {
- SET_CAUSE(node, "1.vsrc");
- return get_buffer_backend(sched, node->view_src->buffer);
+ if (node->view_src != NULL) {
+ cur_allocr = sched_allocr_from_buffer(sched, node->view_src->buffer);
+ if (cur_allocr != NULL) {
+ SET_CAUSE(node, "1.vsrc");
+ return cur_allocr;
+ }
}
-
- // src
- int cur_prio = INT_MAX;
- size_t cur_size = 0;
-
+ // assign nodes that use weights to the backend of the weights
for (int i = 0; i < GGML_MAX_SRC; i++) {
const struct ggml_tensor * src = node->src[i];
if (src == NULL) {
break;
}
- ggml_backend_t src_backend = get_buffer_backend(sched, src->buffer);
- if (src_backend != NULL) {
- int src_prio = sched_backend_prio(sched, src_backend);
- size_t src_size = ggml_nbytes(src);
- if (src_prio < cur_prio && src_size >= cur_size) {
- cur_prio = src_prio;
- cur_size = src_size;
- cur_backend = src_backend;
- SET_CAUSE(node, "1.src%d", i);
- }
+ if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+ ggml_tallocr_t src_allocr = sched_allocr_from_buffer(sched, src->buffer);
+ // operations with weights are always run on the same backend as the weights
+ SET_CAUSE(node, "1.wgt%d", i);
+ return src_allocr;
}
}
- return cur_backend;
+
+ return NULL;
}
static char * fmt_size(size_t size) {
}
ggml_tallocr_t node_allocr = node_allocr(node);
ggml_backend_t node_backend = node_allocr ? get_allocr_backend(sched, node_allocr) : NULL; // FIXME:
- fprintf(stderr, "node #%3d (%10.10s): %20.20s (%4.4s) [%4.4s %8.8s]:", i, ggml_op_name(node->op), node->name,
+ fprintf(stderr, "node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
fmt_size(ggml_nbytes(node)), node_allocr ? ggml_backend_name(node_backend) : "NULL", GET_CAUSE(node));
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
}
ggml_tallocr_t src_allocr = node_allocr(src);
ggml_backend_t src_backend = src_allocr ? get_allocr_backend(sched, src_allocr) : NULL;
- fprintf(stderr, " %20.20s (%4.4s) [%4.4s %8.8s]", src->name,
+ fprintf(stderr, " %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
}
fprintf(stderr, "\n");
return dup;
}
+
+//#define DEBUG_PASS1
+//#define DEBUG_PASS2
+//#define DEBUG_PASS3
+//#define DEBUG_PASS4
+
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
-// TODO: merge passes
static void sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
- // reset state
- size_t hash_size = sched->hash_set.size;
- memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
- memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
- memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
+ // reset splits
sched->n_splits = 0;
+ sched->is_reset = false;
struct ggml_init_params params = {
/* .mem_size = */ sizeof(sched->context_buffer),
/* .no_alloc = */ true
};
- if (sched->ctx != NULL) {
- ggml_free(sched->ctx);
- }
+ ggml_free(sched->ctx);
sched->ctx = ggml_init(params);
+ if (sched->ctx == NULL) {
+ fprintf(stderr, "%s: failed to initialize context\n", __func__);
+ GGML_ASSERT(false);
+ }
- // pass 1: assign backends to ops with allocated inputs
+ // pass 1: assign backends to ops with pre-allocated inputs
for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i];
if (node_allocr(leaf) != NULL) {
// do not overwrite user assignments
continue;
}
- ggml_backend_t leaf_backend = get_buffer_backend(sched, leaf->buffer);
- if (leaf_backend == NULL && leaf->view_src != NULL) {
- leaf_backend = get_buffer_backend(sched, leaf->view_src->buffer);
- }
- if (leaf_backend != NULL) {
- node_allocr(leaf) = ggml_backend_sched_get_tallocr(sched, leaf_backend);
- }
+ node_allocr(leaf) = sched_allocr_from_cur(sched, leaf);
}
for (int i = 0; i < graph->n_nodes; i++) {
// do not overwrite user assignments
continue;
}
- ggml_backend_t node_backend = sched_backend_from_cur(sched, node);
- if (node_backend != NULL) {
- node_allocr(node) = ggml_backend_sched_get_tallocr(sched, node_backend);
+ node_allocr(node) = sched_allocr_from_cur(sched, node);
+ // src
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * src = node->src[j];
+ if (src == NULL) {
+ break;
+ }
+ if (node_allocr(src) == NULL) {
+ node_allocr(src) = sched_allocr_from_cur(sched, src);
+ }
}
}
- //printf("PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#ifdef DEBUG_PASS1
+ fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
- // pass 2: assign backends to ops from current assignments
- // TODO:
- // - reuse sched_backend_from_cur
- for (int i = 0; i < graph->n_nodes; i++) {
- struct ggml_tensor * node = graph->nodes[i];
- ggml_tallocr_t node_allocr = node_allocr(node);
- if (node_allocr == NULL) {
- int cur_prio = INT_MAX;
- size_t cur_size = 0;
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * src = node->src[j];
- if (src == NULL) {
- break;
+ // pass 2: expand current backend assignments
+ // assign the same backend to adjacent nodes
+ // expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
+ // thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
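+ // ("up" here means propagating assignments towards the beginning of the graph by iterating the nodes in reverse, "down" towards the end)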
+
+ // pass 2.1 expand gpu up
+ {
+ ggml_tallocr_t cur_allocr = NULL;
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
+ struct ggml_tensor * node = graph->nodes[i];
+ if (ggml_is_view_op(node->op)) {
+ continue;
+ }
+ ggml_tallocr_t node_allocr = node_allocr(node);
+ if (node_allocr != NULL) {
+ if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+ // skip cpu (lowest prio backend)
+ cur_allocr = NULL;
+ } else {
+ cur_allocr = node_allocr;
}
- ggml_tallocr_t src_allocr = node_allocr(src);
- if (src_allocr != NULL) {
- int src_prio = sched_allocr_prio(sched, src_allocr);
- size_t src_size = ggml_nbytes(src);
- if (src_prio < cur_prio && src_size >= cur_size) {
- cur_prio = src_prio;
- cur_size = src_size;
- node_allocr = src_allocr;
- SET_CAUSE(node, "2.src%d", j);
- }
+ } else {
+ node_allocr(node) = cur_allocr;
+ SET_CAUSE(node, "2.1");
+ }
+ }
+ }
+
+ // pass 2.2 expand gpu down
+ {
+ ggml_tallocr_t cur_allocr = NULL;
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct ggml_tensor * node = graph->nodes[i];
+ if (ggml_is_view_op(node->op)) {
+ continue;
+ }
+ ggml_tallocr_t node_allocr = node_allocr(node);
+ if (node_allocr != NULL) {
+ if (sched_allocr_prio(sched, node_allocr) == sched->n_backends - 1) {
+ // skip cpu (lowest prio backend)
+ cur_allocr = NULL;
+ } else {
+ cur_allocr = node_allocr;
}
+ } else {
+ node_allocr(node) = cur_allocr;
+ SET_CAUSE(node, "2.2");
}
+ }
+ }
+
+ // pass 2.3 expand rest up
+ {
+ ggml_tallocr_t cur_allocr = NULL;
+ for (int i = graph->n_nodes - 1; i >= 0; i--) {
+ struct ggml_tensor * node = graph->nodes[i];
+ if (ggml_is_view_op(node->op)) {
+ continue;
+ }
+ ggml_tallocr_t node_allocr = node_allocr(node);
if (node_allocr != NULL) {
- node_allocr(node) = node_allocr;
+ cur_allocr = node_allocr;
+ } else {
+ node_allocr(node) = cur_allocr;
+ SET_CAUSE(node, "2.3");
}
}
}
- //printf("PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#ifdef DEBUG_PASS2
+ fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
- // pass 3: assign backends to remaining src from dst (should only be leafs)
+ // pass 3: assign backends to remaining src from dst and view_src
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
- ggml_tallocr_t node_allocr = node_allocr(node);
+ ggml_tallocr_t cur_allocr = node_allocr(node);
+ if (node->view_src != NULL && cur_allocr == NULL) {
+ cur_allocr = node_allocr(node) = node_allocr(node->view_src);
+ SET_CAUSE(node, "3.vsrc");
+ }
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
}
ggml_tallocr_t src_allocr = node_allocr(src);
if (src_allocr == NULL) {
- node_allocr(src) = node_allocr;
+ if (src->view_src != NULL) {
+ // views are always on the same backend as the source
+ node_allocr(src) = node_allocr(src->view_src);
+ SET_CAUSE(src, "3.vsrc");
+ } else {
+ node_allocr(src) = cur_allocr;
+ SET_CAUSE(src, "3.cur");
+ }
}
}
}
- //printf("PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#ifdef DEBUG_PASS3
+ fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
// pass 4: split graph, find tensors that need to be copied
- // TODO:
- // - when switching from a less preferred backend to a more preferred backend, check if it is possible to move the switch to an earlier point for the same cost
- // find first backend
- int cur_split = 0;
- for (int i = 0; i < graph->n_nodes; i++) {
- struct ggml_tensor * node = graph->nodes[i];
- if (node->view_src == NULL) {
- sched->splits[0].tallocr = node_allocr(node);
- break;
+ {
+ int cur_split = 0;
+ // find the backend of the first split, skipping view ops
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct ggml_tensor * node = graph->nodes[i];
+ if (!ggml_is_view_op(node->op)) {
+ sched->splits[0].tallocr = node_allocr(node);
+ break;
+ }
}
- }
- sched->splits[0].i_start = 0;
- sched->splits[0].n_inputs = 0;
- memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
- ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
- size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
- for (int i = 0; i < graph->n_nodes; i++) {
- struct ggml_tensor * node = graph->nodes[i];
+ sched->splits[0].i_start = 0;
+ sched->splits[0].n_inputs = 0;
+ memset(sched->splits[0].inputs, 0, sizeof(sched->splits[0].inputs)); //HACK
+ ggml_tallocr_t cur_allocr = sched->splits[0].tallocr;
+ size_t cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct ggml_tensor * node = graph->nodes[i];
+
+ if (ggml_is_view_op(node->op)) {
+ continue;
+ }
- if (ggml_is_view_op(node->op)) {
- continue;
- }
+ ggml_tallocr_t node_allocr = node_allocr(node);
+
+ if (node_allocr != cur_allocr) {
+ sched->splits[cur_split].i_end = i;
+ cur_split++;
+ GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
+ sched->splits[cur_split].tallocr = node_allocr;
+ sched->splits[cur_split].i_start = i;
+ sched->splits[cur_split].n_inputs = 0;
+ cur_allocr = node_allocr;
+ cur_backend_id = sched_allocr_prio(sched, cur_allocr);
+ }
- ggml_tallocr_t node_allocr = node_allocr(node);
+ // find inputs that are not on the same backend
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * src = node->src[j];
+ if (src == NULL) {
+ break;
+ }
+ ggml_tallocr_t src_allocr = node_allocr(src);
+ GGML_ASSERT(src_allocr != NULL); // all inputs should be assigned by now
+ if (src_allocr != node_allocr) {
+ // check if the input is already in the split
+ bool found = false;
+ for (int k = 0; k < sched->splits[cur_split].n_inputs; k++) {
+ if (sched->splits[cur_split].inputs[k] == src) {
+ found = true;
+ break;
+ }
+ }
- if (node_allocr != cur_allocr) {
- sched->splits[cur_split].i_end = i;
- cur_split++;
- GGML_ASSERT(cur_split < GGML_MAX_SPLITS);
- sched->splits[cur_split].tallocr = node_allocr;
- sched->splits[cur_split].i_start = i;
- sched->splits[cur_split].n_inputs = 0;
- memset(sched->splits[cur_split].inputs, 0, sizeof(sched->splits[cur_split].inputs)); //HACK
- cur_allocr = node_allocr;
- cur_backend_id = sched_allocr_prio(sched, cur_allocr);
- }
+ if (!found) {
+ int n_inputs = sched->splits[cur_split].n_inputs++;
+ //printf("split %d input %d: %s (%s)\n", cur_split, n_inputs, src->name, ggml_backend_name(get_allocr_backend(sched, src_allocr)));
+ GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
+ sched->splits[cur_split].inputs[n_inputs] = src;
+ }
- // find inputs that are not on the same backend
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * src = node->src[j];
- if (src == NULL) {
- break;
- }
- ggml_tallocr_t src_allocr = node_allocr(src);
- if (src_allocr != node_allocr) {
- int n_inputs = sched->splits[cur_split].n_inputs++;
- GGML_ASSERT(n_inputs < GGML_MAX_SPLIT_INPUTS);
- sched->splits[cur_split].inputs[n_inputs] = (struct ggml_tensor *)src;
-
- // create copies
- size_t id = hash_id(src);
- if (sched->node_copies[id][cur_backend_id] == NULL) {
- struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
- sched->node_copies[id][cur_backend_id] = tensor_copy;
- node_allocr(tensor_copy) = cur_allocr;
- ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
- ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
+ // create a copy of the input in the split's backend
+ size_t id = hash_id(src);
+ if (sched->node_copies[id][cur_backend_id] == NULL) {
+ ggml_backend_t backend = get_allocr_backend(sched, cur_allocr);
+ struct ggml_tensor * tensor_copy = ggml_dup_tensor_layout(sched->ctx, src);
+ ggml_format_name(tensor_copy, "%s#%s", ggml_backend_name(backend), src->name);
+
+ sched->node_copies[id][cur_backend_id] = tensor_copy;
+ node_allocr(tensor_copy) = cur_allocr;
+ SET_CAUSE(tensor_copy, "4.cpy");
+ }
+ node->src[j] = sched->node_copies[id][cur_backend_id];
}
- node->src[j] = sched->node_copies[id][cur_backend_id];
}
}
+ sched->splits[cur_split].i_end = graph->n_nodes;
+ sched->n_splits = cur_split + 1;
}
- sched->splits[cur_split].i_end = graph->n_nodes;
- sched->n_splits = cur_split + 1;
-
- //fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph); fflush(stdout);
+#ifdef DEBUG_PASS4
+ fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); sched_print_assignments(sched, graph);
+#endif
-#if 1
+#ifndef NDEBUG
// sanity check: all sources should have the same backend as the node
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
if (node_allocr == NULL) {
fprintf(stderr, "!!!!!!! %s has no backend\n", node->name);
}
+ if (node->view_src != NULL && node_allocr != node_allocr(node->view_src)) {
+ fprintf(stderr, "!!!!!!! %s has backend %s, view_src %s has backend %s\n",
+ node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
+ node->view_src->name, node_allocr(node->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(node->view_src))) : "NULL");
+ }
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src == NULL) {
node->name, node_allocr ? ggml_backend_name(get_allocr_backend(sched, node_allocr)) : "NULL",
j, src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL");
}
+ if (src->view_src != NULL && src_allocr != node_allocr(src->view_src)) {
+ fprintf(stderr, "!!!!!!! [src] %s has backend %s, view_src %s has backend %s\n",
+ src->name, src_allocr ? ggml_backend_name(get_allocr_backend(sched, src_allocr)) : "NULL",
+ src->view_src->name, node_allocr(src->view_src) ? ggml_backend_name(get_allocr_backend(sched, node_allocr(src->view_src))) : "NULL");
+ }
}
}
+ fflush(stderr);
#endif
// create copies of the graph for each split
for (int j = 0; j < split->n_inputs; j++) {
struct ggml_tensor * input = split->inputs[j];
struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_allocr_prio(sched, split->tallocr)];
+ // add a dependency to the input source so that it is not freed before the copy is done
+ GGML_ASSERT(input_cpy->src[0] == NULL || input_cpy->src[0] == input);
input_cpy->src[0] = input;
graph_copy->nodes[graph_copy->n_nodes++] = input_cpy;
}
uint64_t copy_start_us = ggml_time_us();
for (int j = 0; j < split->n_inputs; j++) {
struct ggml_tensor * input = split->inputs[j];
- struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][sched_backend_prio(sched, split_backend)];
- if (input->buffer == NULL) {
- if (input->view_src == NULL) {
- fprintf(stderr, "input %s has no buffer and no view_src\n", input->name);
- exit(1);
- }
- // FIXME: may need to use the sched buffer instead
- ggml_backend_view_init(input->view_src->buffer, input);
- }
- if (input_cpy->buffer == NULL) {
- fprintf(stderr, "input_cpy %s has no buffer\n", input_cpy->name);
- exit(1);
- }
- //GGML_ASSERT(input->buffer->backend != input_cpy->buffer->backend);
- //GGML_ASSERT(input_cpy->buffer->backend == split_backend);
- ggml_backend_tensor_copy(input, input_cpy);
+ struct ggml_tensor * input_cpy = sched->node_copies[hash_id(input)][split_backend_id];
+
+ GGML_ASSERT(input->buffer != NULL);
+ GGML_ASSERT(input_cpy->buffer != NULL);
+
+ // TODO: avoid this copy if it was already copied in a previous split, and the input didn't change
+ // this is important to avoid copying constants such as KQ_mask and inp_pos multiple times
+ ggml_backend_tensor_copy_async(split_backend, input, input_cpy);
}
- // ggml_backend_synchronize(split_backend);
+ //ggml_backend_synchronize(split_backend); // necessary to measure copy time
int64_t copy_end_us = ggml_time_us();
copy_us[split_backend_id] += copy_end_us - copy_start_us;
uint64_t compute_start_us = ggml_time_us();
ggml_backend_graph_compute(split_backend, &split->graph);
- // ggml_backend_synchronize(split_backend);
+ //ggml_backend_synchronize(split_backend); // necessary to measure compute time
uint64_t compute_end_us = ggml_time_us();
compute_us[split_backend_id] += compute_end_us - compute_start_us;
}
for (int i = 0; i < sched->n_backends; i++) {
ggml_tallocr_reset(sched->tallocs[i]);
}
+ // reset state for the next run
+ size_t hash_size = sched->hash_set.size;
+ memset(sched->hash_set.keys, 0, sizeof(sched->hash_set.keys[0]) * hash_size);
+ memset(sched->node_talloc, 0, sizeof(sched->node_talloc[0]) * hash_size);
+ memset(sched->node_copies, 0, sizeof(sched->node_copies[0]) * hash_size);
+
+ sched->is_reset = true;
}
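+// note: when bufts is NULL, each backend uses its default buffer type; graph_size reserves space in the hash tables
+// illustrative usage (the backend names below are examples only, ordered from highest to lowest priority):
+//   ggml_backend_t backends[2] = { gpu_backend, cpu_backend };
+//   ggml_backend_sched_t sched = ggml_backend_sched_new(backends, NULL, 2, GGML_DEFAULT_GRAPH_SIZE);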
-ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, int n_backends) {
+ggml_backend_sched_t ggml_backend_sched_new(ggml_backend_t * backends, ggml_backend_buffer_type_t * bufts, int n_backends, size_t graph_size) {
+ GGML_ASSERT(n_backends > 0);
GGML_ASSERT(n_backends <= GGML_MAX_BACKENDS);
- struct ggml_backend_sched * sched = malloc(sizeof(struct ggml_backend_sched));
- memset(sched, 0, sizeof(struct ggml_backend_sched));
+ struct ggml_backend_sched * sched = calloc(sizeof(struct ggml_backend_sched), 1);
+
+ // initialize hash table
+ sched->hash_set = ggml_hash_set_new(graph_size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+ sched->node_talloc = calloc(sizeof(sched->node_talloc[0]) * sched->hash_set.size, 1);
+ sched->node_copies = calloc(sizeof(sched->node_copies[0]) * sched->hash_set.size, 1);
sched->n_backends = n_backends;
for (int i = 0; i < n_backends; i++) {
sched->backends[i] = backends[i];
+ sched->bufts[i] = bufts ? bufts[i] : ggml_backend_get_default_buffer_type(backends[i]);
}
sched->galloc = ggml_gallocr_new();
// init measure allocs for each backend
for (int i = 0; i < n_backends; i++) {
- sched->tallocs[i] = ggml_tallocr_new_measure_from_backend(backends[i]);
+ sched->tallocs[i] = ggml_tallocr_new_measure_from_buft(sched->bufts[i]);
}
+ sched_reset(sched);
+
return sched;
}
ggml_tallocr_free(sched->tallocs[i]);
}
ggml_gallocr_free(sched->galloc);
+ ggml_free(sched->ctx);
free(sched->hash_set.keys);
free(sched->node_talloc);
free(sched->node_copies);
}
void ggml_backend_sched_init_measure(ggml_backend_sched_t sched, struct ggml_cgraph * measure_graph) {
- // initialize hash tables
- size_t hash_size = measure_graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS;
- sched->hash_set.size = hash_size;
- sched->hash_set.keys = malloc(sizeof(sched->hash_set.keys[0]) * hash_size);
- sched->node_talloc = malloc(sizeof(sched->node_talloc[0]) * hash_size);
- sched->node_copies = malloc(sizeof(sched->node_copies[0]) * hash_size);
+ GGML_ASSERT(ggml_tallocr_is_measure(sched->tallocs[0])); // can only be initialized once
sched_split_graph(sched, measure_graph);
sched_alloc_splits(sched);
for (int i = 0; i < sched->n_backends; i++) {
size_t size = ggml_tallocr_max_size(sched->tallocs[i]);
ggml_tallocr_free(sched->tallocs[i]);
- sched->tallocs[i] = ggml_tallocr_new_from_backend(sched->backends[i], size);
+ sched->tallocs[i] = ggml_tallocr_new_from_buft(sched->bufts[i], size);
}
sched_reset(sched);
}
void ggml_backend_sched_graph_compute(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
- GGML_ASSERT(sched->hash_set.size >= graph->visited_hash_table.size + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+ GGML_ASSERT((int)sched->hash_set.size >= graph->n_nodes + GGML_MAX_SPLITS*GGML_MAX_SPLIT_INPUTS);
+
+ if (!sched->is_reset) {
+ sched_reset(sched);
+ }
sched_split_graph(sched, graph);
sched_alloc_splits(sched);
sched_compute_splits(sched);
+}
+
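+// clears all node assignments and tensor copies so the scheduler can be reused with a different graph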
+void ggml_backend_sched_reset(ggml_backend_sched_t sched) {
sched_reset(sched);
}
+int ggml_backend_sched_get_n_splits(ggml_backend_sched_t sched) {
+ return sched->n_splits;
+}
+
ggml_tallocr_t ggml_backend_sched_get_tallocr(ggml_backend_sched_t sched, ggml_backend_t backend) {
int backend_index = sched_backend_prio(sched, backend);
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
return sched->tallocs[backend_index];
}
ggml_backend_buffer_t ggml_backend_sched_get_buffer(ggml_backend_sched_t sched, ggml_backend_t backend) {
int backend_index = sched_backend_prio(sched, backend);
+ GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
return ggml_tallocr_get_buffer(sched->tallocs[backend_index]);
}
node_allocr(node) = sched->tallocs[backend_index];
}
+ggml_backend_t ggml_backend_sched_get_node_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
+ ggml_tallocr_t allocr = node_allocr(node);
+ if (allocr == NULL) {
+ return NULL;
+ }
+ return get_allocr_backend(sched, allocr);
+}
+
// utils
+
void ggml_backend_view_init(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
GGML_ASSERT(tensor->buffer == NULL);
- //GGML_ASSERT(tensor->data == NULL); // views of pre-allocted tensors may have the data set, but still need to be initialized
+ //GGML_ASSERT(tensor->data == NULL); // views of pre-allocated tensors may have the data set in ggml_new_tensor, but still need to be initialized by the backend
GGML_ASSERT(tensor->view_src != NULL);
GGML_ASSERT(tensor->view_src->buffer != NULL);
GGML_ASSERT(tensor->view_src->data != NULL);
struct ggml_tensor * dst = node_copies[id];
if (dst->view_src != NULL) {
+ graph_init_tensor(hash_set, node_copies, node_init, src->view_src);
ggml_backend_view_init(dst->view_src->buffer, dst);
}
else {
struct ggml_context * ctx_allocated = ggml_init(params);
struct ggml_context * ctx_unallocated = ggml_init(params);
+ if (ctx_allocated == NULL || ctx_unallocated == NULL) {
+ fprintf(stderr, "failed to allocate context for graph copy\n");
+ free(hash_set.keys);
+ free(node_copies);
+ free(node_init);
+ ggml_free(ctx_allocated);
+ ggml_free(ctx_unallocated);
+ return (struct ggml_backend_graph_copy) {
+ /* .buffer = */ NULL,
+ /* .ctx_allocated = */ NULL,
+ /* .ctx_unallocated = */ NULL,
+ /* .graph = */ NULL,
+ };
+ }
+
// dup nodes
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
// allocate nodes
ggml_backend_buffer_t buffer = ggml_backend_alloc_ctx_tensors(ctx_allocated, backend);
+ if (buffer == NULL) {
+ fprintf(stderr, "failed to allocate buffer for graph copy\n");
+ free(hash_set.keys);
+ free(node_copies);
+ free(node_init);
+ ggml_free(ctx_allocated);
+ ggml_free(ctx_unallocated);
+ return (struct ggml_backend_graph_copy) {
+ /* .buffer = */ NULL,
+ /* .ctx_allocated = */ NULL,
+ /* .ctx_unallocated = */ NULL,
+ /* .graph = */ NULL,
+ };
+ }
//printf("copy buffer size: %zu MB\n", ggml_backend_buffer_get_size(buffer) / 1024 / 1024);
ggml_free(copy.ctx_unallocated);
}
-void ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
+bool ggml_backend_compare_graph_backend(ggml_backend_t backend1, ggml_backend_t backend2, struct ggml_cgraph * graph, ggml_backend_eval_callback callback, void * user_data) {
struct ggml_backend_graph_copy copy = ggml_backend_graph_copy(backend2, graph);
+ if (copy.buffer == NULL) {
+ return false;
+ }
+
struct ggml_cgraph * g1 = graph;
struct ggml_cgraph * g2 = copy.graph;
}
ggml_backend_graph_copy_free(copy);
+
+ return true;
}
#include <limits>
#include <stdint.h>
#include <stdio.h>
+#include <string>
#include <vector>
-
+#include <map>
+#include <array>
+#include "ggml-cuda.h"
+#include "ggml.h"
+#include "ggml-backend-impl.h"
#if defined(GGML_USE_HIPBLAS)
#include <hip/hip_runtime.h>
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemset hipMemset
#define cudaMemsetAsync hipMemsetAsync
+#define cudaMemGetInfo hipMemGetInfo
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#endif // defined(GGML_USE_HIPBLAS)
-#include "ggml-cuda.h"
-#include "ggml.h"
-#include "ggml-backend-impl.h"
-
#define CUDART_HMAX 11070 // CUDA 11.7, min. ver. for which __hmax and __hmax2 are known to work (may be higher than needed)
#define CC_PASCAL 600
static int g_device_count = -1;
static int g_main_device = 0;
-static float g_tensor_split[GGML_CUDA_MAX_DEVICES] = {0};
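+// default split computed from the total VRAM of each device: entry i is the normalized cumulative fraction of VRAM before device i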
+static std::array<float, GGML_CUDA_MAX_DEVICES> g_default_tensor_split = {};
struct cuda_device_capabilities {
int cc; // compute capability
static cuda_device_capabilities g_device_caps[GGML_CUDA_MAX_DEVICES] = { {0, 0, false, 0} };
-static void * g_scratch_buffer = nullptr;
-static size_t g_scratch_size = 0; // disabled by default
-static size_t g_scratch_offset = 0;
-
static cublasHandle_t g_cublas_handles[GGML_CUDA_MAX_DEVICES] = {nullptr};
[[noreturn]]
CUDA_CHECK(cudaGetDeviceProperties(&prop, id));
fprintf(stderr, " Device %d: %s, compute capability %d.%d, VMM: %s\n", id, prop.name, prop.major, prop.minor, device_vmm ? "yes" : "no");
- g_tensor_split[id] = total_vram;
+ g_default_tensor_split[id] = total_vram;
total_vram += prop.totalGlobalMem;
+
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
g_device_caps[id].cc = 100*prop.major + 10*prop.minor + CC_OFFSET_AMD;
#else
g_device_caps[id].smpb = prop.sharedMemPerBlock;
}
for (int id = 0; id < g_device_count; ++id) {
- g_tensor_split[id] /= total_vram;
+ g_default_tensor_split[id] /= total_vram;
}
for (int id = 0; id < g_device_count; ++id) {
}
}
-void ggml_cuda_set_tensor_split(const float * tensor_split) {
- if (tensor_split == nullptr) {
- return;
- }
- bool all_zero = true;
- for (int i = 0; i < g_device_count; ++i) {
- if (tensor_split[i] != 0.0f) {
- all_zero = false;
- break;
- }
- }
- if (all_zero) {
- return;
- }
- float split_sum = 0.0f;
- for (int i = 0; i < g_device_count; ++i) {
- g_tensor_split[i] = split_sum;
- split_sum += tensor_split[i];
- }
- for (int i = 0; i < g_device_count; ++i) {
- g_tensor_split[i] /= split_sum;
- }
-}
-
void * ggml_cuda_host_malloc(size_t size) {
if (getenv("GGML_CUDA_NO_PINNED") != nullptr) {
return nullptr;
(void) src1_ddf_i;
}
-static int64_t get_row_rounding(ggml_type type) {
+static int64_t get_row_rounding(ggml_type type, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split) {
int64_t min_compute_capability = INT_MAX;
int64_t max_compute_capability = INT_MIN;
for (int id = 0; id < g_device_count; ++id) {
- if (g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
+ if (tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
if (min_compute_capability > g_device_caps[id].cc) {
min_compute_capability = g_device_caps[id].cc;
}
#endif // defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
}
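+// compute the half-open row range [row_low, row_high) assigned to device id according to tensor_split,
+// rounded down to a multiple of the row rounding for the tensor type (the last device always ends at nrows)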
+static void get_row_split(int64_t * row_low, int64_t * row_high, const ggml_tensor * tensor, const std::array<float, GGML_CUDA_MAX_DEVICES> & tensor_split, int id) {
+ const int64_t nrows = ggml_nrows(tensor);
+ const int64_t rounding = get_row_rounding(tensor->type, tensor_split);
+
+ *row_low = id == 0 ? 0 : nrows*tensor_split[id];
+ *row_low -= *row_low % rounding;
+
+ if (id == g_device_count - 1) {
+ *row_high = nrows;
+ } else {
+ *row_high = nrows*tensor_split[id + 1];
+ *row_high -= *row_high % rounding;
+ }
+}
+
static void ggml_cuda_op_mul_mat_vec_q(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, const char * src0_dd_i, const float * src1_ddf_i,
const char * src1_ddq_i, float * dst_dd_i, const int64_t row_low, const int64_t row_high, const int64_t src1_ncols,
peer_access_enabled = enable_peer_access;
}
+// FIXME: move this somewhere else
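+// buffer type context for split tensors: tensor_split[i] is the normalized cumulative fraction at which the rows of device i start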
+struct ggml_backend_cuda_split_buffer_type_context {
+ std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
+};
+
static void ggml_cuda_op_mul_mat(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst, ggml_cuda_op_mul_mat_t op,
const bool convert_src1_to_q8_1) {
GGML_ASSERT(!(split && ne03 > 1));
GGML_ASSERT(!(split && ne02 < ne12));
+ std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
+ if (split) {
+ // TODO: check that src0->buffer->buft is a split buffer type, replace GGML_BACKEND_GPU_SPLIT check
+ // GGML_ASSERT(src0->buffer != nullptr && src0->buffer->buft == ...);
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
+ tensor_split = buft_ctx->tensor_split;
+ }
+
struct dev_data {
cuda_pool_alloc<char> src0_dd_alloc;
cuda_pool_alloc<float> src1_ddf_alloc;
// for multi GPU, get the row boundaries from tensor split
// and round to mul_mat_q tile sizes
if (split) {
- const int64_t rounding = get_row_rounding(src0->type);
+ const int64_t rounding = get_row_rounding(src0->type, tensor_split);
if (id != 0) {
- dev[id].row_low = ne01*g_tensor_split[id];
+ dev[id].row_low = ne01*tensor_split[id];
if (dev[id].row_low < ne01) {
dev[id].row_low -= dev[id].row_low % rounding;
}
}
if (id != g_device_count - 1) {
- dev[id].row_high = ne01*g_tensor_split[id + 1];
+ dev[id].row_high = ne01*tensor_split[id + 1];
if (dev[id].row_high < ne01) {
dev[id].row_high -= dev[id].row_high % rounding;
}
const bool split = src0->backend == GGML_BACKEND_GPU_SPLIT;
int64_t min_compute_capability = INT_MAX;
- for (int id = 0; id < g_device_count; ++id) {
- if (min_compute_capability > g_device_caps[id].cc && g_tensor_split[id] < (id + 1 < g_device_count ? g_tensor_split[id + 1] : 1.0f)) {
- min_compute_capability = g_device_caps[id].cc;
+
+ if (split) {
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *) src0->buffer->buft->context;
+ auto & tensor_split = buft_ctx->tensor_split;
+ for (int id = 0; id < g_device_count; ++id) {
+ if (min_compute_capability > g_device_caps[id].cc && tensor_split[id] < (id + 1 < g_device_count ? tensor_split[id + 1] : 1.0f)) {
+ min_compute_capability = g_device_caps[id].cc;
+ }
}
+ } else {
+ min_compute_capability = g_device_caps[g_main_device].cc;
}
#if defined(GGML_USE_HIPBLAS) && defined(__HIP_PLATFORM_AMD__)
} else if (!split && all_on_device && !fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_contiguous(src0) && !ggml_is_transposed(src1) && src1->ne[1] == 1) {
// KQV single-batch
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
- } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1)) {
+ } else if (!split && all_on_device && fp16_performance_good && src0->type == GGML_TYPE_F16 && !ggml_is_transposed(src0) && !ggml_is_transposed(src1) && src1->ne[2]*src1->ne[3] > 1) {
// KQ + KQV multi-batch
ggml_cuda_mul_mat_mat_batched_cublas(src0, src1, dst);
} else if (src0->type == GGML_TYPE_F32) {
return nrows_split*ggml_row_size(tensor->type, tensor->ne[0]);
}
-void ggml_cuda_transform_tensor(void * data, struct ggml_tensor * tensor) {
- const int64_t nrows = ggml_nrows(tensor);
-
- const int64_t ne0 = tensor->ne[0];
-
- const size_t nb1 = tensor->nb[1];
-
- ggml_backend_type backend = tensor->backend;
- ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
- memset(extra, 0, sizeof(*extra));
-
- for (int id = 0; id < g_device_count; ++id) {
- if (backend == GGML_BACKEND_GPU && id != g_main_device) {
- continue;
- }
-
- ggml_cuda_set_device(id);
-
- int64_t row_low, row_high;
- if (backend == GGML_BACKEND_GPU) {
- row_low = 0;
- row_high = nrows;
- } else if (backend == GGML_BACKEND_GPU_SPLIT) {
- const int64_t rounding = get_row_rounding(tensor->type);
-
- row_low = id == 0 ? 0 : nrows*g_tensor_split[id];
- row_low -= row_low % rounding;
-
- if (id == g_device_count - 1) {
- row_high = nrows;
- } else {
- row_high = nrows*g_tensor_split[id + 1];
- row_high -= row_high % rounding;
- }
- } else {
- GGML_ASSERT(false);
- }
- if (row_low == row_high) {
- continue;
- }
-
- int64_t nrows_split = row_high - row_low;
-
- const size_t offset_split = row_low*nb1;
- size_t size = ggml_nbytes_split(tensor, nrows_split);
- const size_t original_size = size;
-
- // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
- if (ne0 % MATRIX_ROW_PADDING != 0) {
- size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
- }
-
- char * buf;
- CUDA_CHECK(cudaMalloc(&buf, size));
- char * buf_host = (char *)data + offset_split;
-
- // set padding to 0 to avoid possible NaN values
- if (size > original_size) {
- CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
- }
-
- CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
-
- extra->data_device[id] = buf;
-
- if (backend == GGML_BACKEND_GPU_SPLIT) {
- for (int64_t is = 0; is < MAX_STREAMS; ++is) {
- CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
- }
- }
- }
-
- tensor->extra = extra;
-}
-
-void ggml_cuda_free_data(struct ggml_tensor * tensor) {
- if (!tensor || !tensor->extra || (tensor->backend != GGML_BACKEND_GPU && tensor->backend != GGML_BACKEND_GPU_SPLIT) ) {
- return;
- }
-
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
-
- for (int id = 0; id < g_device_count; ++id) {
- ggml_cuda_set_device(id);
- if (extra->data_device[id] != nullptr) {
- CUDA_CHECK(cudaFree(extra->data_device[id]));
- }
-
- for (int64_t is = 0; is < MAX_STREAMS; ++is) {
- if (extra->events[id][is] != nullptr) {
- CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
- }
- }
- }
-
- delete extra;
-}
-
-static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
-static size_t g_temp_tensor_extra_index = 0;
-
-static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
- if (g_temp_tensor_extras == nullptr) {
- g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
- }
-
- size_t alloc_index = g_temp_tensor_extra_index;
- g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_CUDA_MAX_NODES;
- ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
- memset(extra, 0, sizeof(*extra));
-
- return extra;
-}
-
-static void ggml_cuda_assign_buffers_impl(struct ggml_tensor * tensor, bool scratch, bool force_inplace, bool no_alloc) {
- if (scratch && g_scratch_size == 0) {
- return;
- }
-
- tensor->backend = GGML_BACKEND_GPU;
-
- // recursively assign CUDA buffers until a compute tensor is found
- if (tensor->src[0] != nullptr && tensor->src[0]->backend == GGML_BACKEND_CPU) {
- const ggml_op src0_op = tensor->src[0]->op;
- if (src0_op == GGML_OP_RESHAPE || src0_op == GGML_OP_TRANSPOSE || src0_op == GGML_OP_VIEW || src0_op == GGML_OP_PERMUTE) {
- ggml_cuda_assign_buffers_impl(tensor->src[0], scratch, force_inplace, no_alloc);
- }
- }
- if (tensor->op == GGML_OP_CPY && tensor->src[1]->backend == GGML_BACKEND_CPU) {
- ggml_cuda_assign_buffers_impl(tensor->src[1], scratch, force_inplace, no_alloc);
- }
-
- if (scratch && no_alloc) {
- return;
- }
-
- ggml_tensor_extra_gpu * extra;
-
- const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
- tensor->op == GGML_OP_VIEW ||
- force_inplace;
- const size_t size = ggml_nbytes(tensor);
-
- ggml_cuda_set_device(g_main_device);
- if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
- char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
- size_t offset = 0;
- if (tensor->op == GGML_OP_VIEW) {
- memcpy(&offset, tensor->op_params, sizeof(size_t));
- }
- extra = ggml_cuda_alloc_temp_tensor_extra();
- extra->data_device[g_main_device] = src0_ddc + offset;
- } else if (tensor->op == GGML_OP_CPY) {
- ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
- void * src1_ddv = src1_extra->data_device[g_main_device];
- extra = ggml_cuda_alloc_temp_tensor_extra();
- extra->data_device[g_main_device] = src1_ddv;
- } else if (scratch) {
- GGML_ASSERT(size <= g_scratch_size);
- if (g_scratch_offset + size > g_scratch_size) {
- g_scratch_offset = 0;
- }
-
- char * data = (char *) g_scratch_buffer;
- if (data == nullptr) {
- CUDA_CHECK(cudaMalloc(&data, g_scratch_size));
- g_scratch_buffer = data;
- }
- extra = ggml_cuda_alloc_temp_tensor_extra();
- extra->data_device[g_main_device] = data + g_scratch_offset;
-
- g_scratch_offset += size;
-
- GGML_ASSERT(g_scratch_offset <= g_scratch_size);
- } else { // allocate new buffers outside of scratch
- void * data;
- CUDA_CHECK(cudaMalloc(&data, size));
- CUDA_CHECK(cudaMemset(data, 0, size));
- extra = new ggml_tensor_extra_gpu;
- memset(extra, 0, sizeof(*extra));
- extra->data_device[g_main_device] = data;
- }
-
- tensor->extra = extra;
-}
-
-void ggml_cuda_assign_scratch_offset(struct ggml_tensor * tensor, size_t offset) {
- if (g_scratch_size == 0) {
- return;
- }
- if (g_scratch_buffer == nullptr) {
- ggml_cuda_set_device(g_main_device);
- CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
- }
-
- ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
-
- const bool inplace = tensor->view_src != nullptr;
-
- if (inplace && (tensor->view_src->backend == GGML_BACKEND_GPU || tensor->view_src->backend == GGML_BACKEND_GPU_SPLIT)) {
- ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->view_src->extra;
- char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
- size_t view_offset = 0;
- if (tensor->op == GGML_OP_VIEW) {
- memcpy(&view_offset, tensor->op_params, sizeof(size_t));
- }
- extra->data_device[g_main_device] = src0_ddc + view_offset;
- } else {
- extra->data_device[g_main_device] = (char *) g_scratch_buffer + offset;
- }
-
- tensor->extra = extra;
-}
-
-void ggml_cuda_copy_to_device(struct ggml_tensor * tensor) {
- GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
- GGML_ASSERT(ggml_is_contiguous(tensor));
-
- ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
- ggml_cuda_set_device(g_main_device);
- CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
-}
-
-void ggml_cuda_assign_buffers(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, true, false, false);
-}
-
-void ggml_cuda_assign_buffers_no_alloc(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, true, false, true);
-}
-
-void ggml_cuda_assign_buffers_no_scratch(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, false, false, false);
-}
-
-void ggml_cuda_assign_buffers_force_inplace(struct ggml_tensor * tensor) {
- ggml_cuda_assign_buffers_impl(tensor, false, true, false);
-}
-
-void ggml_cuda_set_main_device(const int main_device) {
+static void ggml_cuda_set_main_device(const int main_device) {
if (main_device >= g_device_count) {
fprintf(stderr, "warning: cannot set main_device=%d because there are only %d devices. Using device %d instead.\n",
main_device, g_device_count, g_main_device);
if (g_main_device != main_device && g_device_count > 1) {
g_main_device = main_device;
- cudaDeviceProp prop;
- CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
- fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
- }
-}
-
-void ggml_cuda_set_scratch_size(const size_t scratch_size) {
- // this is a hack to not completely break llama.cpp when using multiple models or contexts simultaneously
- // it still won't always work as expected, but it's better than nothing
- if (scratch_size > g_scratch_size) {
- ggml_cuda_free_scratch();
- }
- g_scratch_size = std::max(g_scratch_size, scratch_size);
-}
-
-void ggml_cuda_free_scratch() {
- if (g_scratch_buffer == nullptr) {
- return;
+ //cudaDeviceProp prop;
+ //CUDA_CHECK(cudaGetDeviceProperties(&prop, g_main_device));
+ //fprintf(stderr, "%s: using device %d (%s) as main device\n", __func__, g_main_device, prop.name);
}
-
- CUDA_CHECK(cudaFree(g_scratch_buffer));
- g_scratch_buffer = nullptr;
}
bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
#define UNUSED GGML_UNUSED
+struct ggml_backend_cuda_context {
+ int device;
+ std::string name;
+};
+
// cuda buffer
-struct ggml_backend_buffer_context_cuda {
+struct ggml_backend_cuda_buffer_context {
int device;
void * dev_ptr = nullptr;
ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
size_t temp_tensor_extra_index = 0;
+ std::string name;
- ggml_backend_buffer_context_cuda(int device, void * dev_ptr) : device(device), dev_ptr(dev_ptr) {}
+ ggml_backend_cuda_buffer_context(int device, void * dev_ptr) :
+ device(device), dev_ptr(dev_ptr),
+ name(GGML_CUDA_NAME + std::to_string(device)) {
+ }
- ~ggml_backend_buffer_context_cuda() {
+ ~ggml_backend_cuda_buffer_context() {
delete[] temp_tensor_extras;
}
ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+ // TODO: remove GGML_CUDA_MAX_NODES, allocate dynamically and reuse in backend_buffer_reset
if (temp_tensor_extras == nullptr) {
temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_CUDA_MAX_NODES];
}
}
};
+static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
+ return ctx->name.c_str();
+}
+
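+// a CUDA buffer is recognized by comparing the get_name function pointer of its interface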
+static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
+ return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
+}
+
static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
CUDA_CHECK(cudaFree(ctx->dev_ptr));
delete ctx;
}
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
return ctx->dev_ptr;
}
static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
if (tensor->view_src != NULL && tensor->view_offs == 0) {
assert(tensor->view_src->buffer->buft == buffer->buft);
CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[ctx->device][0]));
}
}
-
- UNUSED(buffer);
}
static void ggml_backend_cuda_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaDeviceSynchronize());
static void ggml_backend_cuda_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaDeviceSynchronize());
-
CUDA_CHECK(cudaMemcpy(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost));
+ CUDA_CHECK(cudaDeviceSynchronize());
+}
+
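+// direct copy between two CUDA buffers; returning false for non-CUDA sources lets the caller fall back to a generic copy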
+static bool ggml_backend_cuda_buffer_cpy_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * src, ggml_tensor * dst) {
+ if (ggml_backend_buffer_is_cuda(src->buffer)) {
+ ggml_backend_cuda_buffer_context * src_ctx = (ggml_backend_cuda_buffer_context *)src->buffer->context;
+ ggml_backend_cuda_buffer_context * dst_ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
+
+ ggml_cuda_set_device(src_ctx->device);
+ CUDA_CHECK(cudaDeviceSynchronize());
+ ggml_cuda_set_device(dst_ctx->device);
+ CUDA_CHECK(cudaDeviceSynchronize());
+ CUDA_CHECK(cudaMemcpy((char *)dst->data, (const char *)src->data, ggml_nbytes(src), cudaMemcpyDeviceToDevice));
+ CUDA_CHECK(cudaDeviceSynchronize());
+
+ return true;
+ }
+ return false;
}
static void ggml_backend_cuda_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
- ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+ ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
ggml_cuda_set_device(ctx->device);
CUDA_CHECK(cudaDeviceSynchronize());
-
CUDA_CHECK(cudaMemset(ctx->dev_ptr, value, buffer->size));
+ CUDA_CHECK(cudaDeviceSynchronize());
}
-static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
+static ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
+ /* .get_name = */ ggml_backend_cuda_buffer_get_name,
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
/* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
/* .set_tensor = */ ggml_backend_cuda_buffer_set_tensor,
/* .get_tensor = */ ggml_backend_cuda_buffer_get_tensor,
- /* .cpy_tensor_from = */ NULL,
- /* .cpy_tensor_to = */ NULL,
+ /* .cpy_tensor = */ ggml_backend_cuda_buffer_cpy_tensor,
/* .clear = */ ggml_backend_cuda_buffer_clear,
+ /* .reset = */ NULL,
};
// cuda buffer type
+struct ggml_backend_cuda_buffer_type_context {
+ int device;
+ std::string name;
+};
+
+static const char * ggml_backend_cuda_buffer_type_name(ggml_backend_buffer_type_t buft) {
+ ggml_backend_cuda_buffer_type_context * ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+
+ return ctx->name.c_str();
+}
+
static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
- int device = (int) (intptr_t) buft->context;
+ ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
- ggml_cuda_set_device(device);
+ ggml_cuda_set_device(buft_ctx->device);
size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
void * dev_ptr;
- CUDA_CHECK(cudaMalloc(&dev_ptr, size));
+ cudaError_t err = cudaMalloc(&dev_ptr, size);
+ if (err != cudaSuccess) {
+ fprintf(stderr, "%s: allocating %.2f MiB on device %d: cudaMalloc failed: %s\n", __func__, size/1024.0/1024.0, buft_ctx->device, cudaGetErrorString(err));
+ return nullptr;
+ }
- ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda(device, dev_ptr);
+ ggml_backend_cuda_buffer_context * ctx = new ggml_backend_cuda_buffer_context(buft_ctx->device, dev_ptr);
- return ggml_backend_buffer_init(buft, cuda_backend_buffer_interface, ctx, size);
+ return ggml_backend_buffer_init(buft, ggml_backend_cuda_buffer_interface, ctx, size);
}
static size_t ggml_backend_cuda_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
UNUSED(buft);
}
-static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, ggml_tensor * tensor) {
+static size_t ggml_backend_cuda_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
int64_t row_low = 0;
int64_t row_high = ggml_nrows(tensor);
int64_t nrows_split = row_high - row_low;
}
static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- return ggml_backend_is_cuda(backend);
+ if (!ggml_backend_is_cuda(backend)) {
+ return false;
+ }
- UNUSED(buft);
+ ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+
+ return buft_ctx->device == cuda_ctx->device;
}
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
+ /* .get_name = */ ggml_backend_cuda_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
- /* .is_host = */ nullptr,
+ /* .is_host = */ NULL,
};
ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device) {
- static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
+ // FIXME: this is not thread safe
+ if (device >= ggml_backend_cuda_get_device_count()) {
+ return nullptr;
+ }
+
+ static ggml_backend_buffer_type ggml_backend_cuda_buffer_types[GGML_CUDA_MAX_DEVICES];
static bool ggml_backend_cuda_buffer_type_initialized = false;
for (int i = 0; i < GGML_CUDA_MAX_DEVICES; i++) {
ggml_backend_cuda_buffer_types[i] = {
/* .iface = */ ggml_backend_cuda_buffer_type_interface,
- /* .context = */ (ggml_backend_buffer_type_context_t) (intptr_t) i,
+ /* .context = */ new ggml_backend_cuda_buffer_type_context{i, GGML_CUDA_NAME + std::to_string(i)},
};
}
ggml_backend_cuda_buffer_type_initialized = true;
return &ggml_backend_cuda_buffer_types[device];
}
+// cuda split buffer
+
+struct ggml_backend_cuda_split_buffer_context {
+ ~ggml_backend_cuda_split_buffer_context() {
+ for (ggml_tensor_extra_gpu * extra : tensor_extras) {
+ for (int id = 0; id < g_device_count; ++id) {
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+ if (extra->events[id][is] != nullptr) {
+ CUDA_CHECK(cudaEventDestroy(extra->events[id][is]));
+ }
+ }
+ if (extra->data_device[id] != nullptr) {
+ CUDA_CHECK(cudaFree(extra->data_device[id]));
+ }
+ }
+ delete extra;
+ }
+ }
+
+ std::vector<ggml_tensor_extra_gpu *> tensor_extras;
+};
+
+static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
+ return GGML_CUDA_NAME "_Split";
+
+ UNUSED(buffer);
+}
+
+// unused at the moment
+//static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
+// return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
+//}
+
+static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
+ delete ctx;
+}
+
+static void * ggml_backend_cuda_split_buffer_get_base(ggml_backend_buffer_t buffer) {
+    // the pointers are stored in the tensor extras; this is just a dummy address that is never dereferenced
+ return (void *)0x1000;
+
+ UNUSED(buffer);
+}
+
+static void ggml_backend_cuda_split_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+ GGML_ASSERT(tensor->view_src == nullptr); // views of split tensors are not supported
+
+ ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
+
+ const int64_t ne0 = tensor->ne[0];
+
+ ggml_tensor_extra_gpu * extra = new ggml_tensor_extra_gpu{};
+
+ ctx->tensor_extras.push_back(extra);
+
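+    // allocate one slice of rows per device, as determined by the tensor_split points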
+ for (int id = 0; id < g_device_count; ++id) {
+ int64_t row_low, row_high;
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+ int64_t nrows_split = row_high - row_low;
+ if (nrows_split == 0) {
+ continue;
+ }
+
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
+ const size_t original_size = size;
+
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+ }
+
+ // FIXME: do not crash if cudaMalloc fails
+ // currently, init_tensor cannot fail, it needs to be fixed in ggml-backend first
+ ggml_cuda_set_device(id);
+ char * buf;
+ CUDA_CHECK(cudaMalloc(&buf, size));
+
+ // set padding to 0 to avoid possible NaN values
+ if (size > original_size) {
+ CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
+ }
+
+ extra->data_device[id] = buf;
+
+ for (int64_t is = 0; is < MAX_STREAMS; ++is) {
+ CUDA_CHECK(cudaEventCreateWithFlags(&extra->events[id][is], cudaEventDisableTiming));
+ }
+ }
+ tensor->backend = GGML_BACKEND_GPU_SPLIT;
+ tensor->extra = extra;
+}
+
+static void ggml_backend_cuda_split_buffer_set_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ // split tensors must always be set in their entirety at once
+ GGML_ASSERT(offset == 0);
+ GGML_ASSERT(size == ggml_nbytes(tensor));
+
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
+
+ const int64_t ne0 = tensor->ne[0];
+ const size_t nb1 = tensor->nb[1];
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
+
+ for (int id = 0; id < g_device_count; ++id) {
+ int64_t row_low, row_high;
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+ int64_t nrows_split = row_high - row_low;
+ if (nrows_split == 0) {
+ continue;
+ }
+
+ const size_t offset_split = row_low*nb1;
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
+ const size_t original_size = size;
+
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+ }
+
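+        // only the unpadded region needs to be copied; the padding was zero-initialized in init_tensor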
+ const char * buf_host = (const char *)data + offset_split;
+ CUDA_CHECK(cudaMemcpy(extra->data_device[id], buf_host, original_size, cudaMemcpyHostToDevice));
+ }
+}
+
+static void ggml_backend_cuda_split_buffer_get_tensor(ggml_backend_buffer_t buffer, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+    // split tensors must always be read in their entirety at once
+ GGML_ASSERT(offset == 0);
+ GGML_ASSERT(size == ggml_nbytes(tensor));
+
+ ggml_backend_cuda_split_buffer_type_context * buft_ctx = (ggml_backend_cuda_split_buffer_type_context *)buffer->buft->context;
+
+ const int64_t ne0 = tensor->ne[0];
+ const size_t nb1 = tensor->nb[1];
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *)tensor->extra;
+
+ for (int id = 0; id < g_device_count; ++id) {
+ int64_t row_low, row_high;
+ get_row_split(&row_low, &row_high, tensor, buft_ctx->tensor_split, id);
+
+ int64_t nrows_split = row_high - row_low;
+ if (nrows_split == 0) {
+ continue;
+ }
+
+ const size_t offset_split = row_low*nb1;
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
+ const size_t original_size = size;
+
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
+ size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+ }
+
+ char * buf_host = (char *)data + offset_split;
+ CUDA_CHECK(cudaMemcpy(buf_host, extra->data_device[id], original_size, cudaMemcpyDeviceToHost));
+ }
+}
+
+static void ggml_backend_cuda_split_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ UNUSED(buffer);
+ UNUSED(value);
+}
+
+static struct ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
+ /* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
+ /* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
+ /* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
+ /* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
+ /* .set_tensor = */ ggml_backend_cuda_split_buffer_set_tensor,
+ /* .get_tensor = */ ggml_backend_cuda_split_buffer_get_tensor,
+ /* .cpy_tensor = */ NULL,
+ /* .clear = */ ggml_backend_cuda_split_buffer_clear,
+ /* .reset = */ NULL,
+};
+
+// cuda split buffer type
+
+static const char * ggml_backend_cuda_split_buffer_type_name(ggml_backend_buffer_type_t buft) {
+ return GGML_CUDA_NAME "_Split";
+
+ UNUSED(buft);
+}
+
+static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ // since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
+ // instead, we allocate them for each tensor separately in init_tensor
+ // however, the size still represents the maximum cumulative size of all the device buffers after the tensors are allocated,
+ // as returned by get_alloc_size. this limit is enforced during tensor allocation by ggml-alloc, so it must be correct.
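+    // (get_alloc_size below sums the per-device slice sizes, including the row padding)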
+ ggml_backend_cuda_split_buffer_context * ctx = new ggml_backend_cuda_split_buffer_context();
+
+ return ggml_backend_buffer_init(buft, ggml_backend_cuda_split_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_cuda_split_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
+ return 128;
+
+ UNUSED(buft);
+}
+
+static size_t ggml_backend_cuda_split_buffer_type_get_alloc_size(ggml_backend_buffer_type_t buft, const ggml_tensor * tensor) {
+ ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
+
+ size_t total_size = 0;
+
+ const int64_t ne0 = tensor->ne[0];
+
+ for (int id = 0; id < g_device_count; ++id) {
+ int64_t row_low, row_high;
+ get_row_split(&row_low, &row_high, tensor, ctx->tensor_split, id);
+
+ int64_t nrows_split = row_high - row_low;
+ if (nrows_split == 0) {
+ continue;
+ }
+
+ total_size += ggml_nbytes_split(tensor, nrows_split);
+
+ // pad last row to a multiple of 512 elements to avoid out-of-bounds memory accesses
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
+ total_size += ggml_row_size(tensor->type, MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING);
+ }
+ }
+
+ return total_size;
+}
+
+static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
+ return ggml_backend_is_cuda(backend);
+
+ UNUSED(buft);
+}
+
+static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
+ return false;
+
+ UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_i ggml_backend_cuda_split_buffer_type_interface = {
+ /* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
+ /* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
+ /* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
+ /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
+ /* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
+};
+
+ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
+ // FIXME: this is not thread safe
+ static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
+
+ std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
+
+ bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + GGML_CUDA_MAX_DEVICES, [](float x) { return x == 0.0f; });
+ if (all_zero) {
+ tensor_split_arr = g_default_tensor_split;
+ } else {
+ float split_sum = 0.0f;
+ for (int i = 0; i < g_device_count; ++i) {
+ tensor_split_arr[i] = split_sum;
+ split_sum += tensor_split[i];
+ }
+ for (int i = 0; i < g_device_count; ++i) {
+ tensor_split_arr[i] /= split_sum;
+ }
+ }
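+    // tensor_split_arr now holds normalized, cumulative starting points,
+    // e.g. an input of {3, 1} becomes {0.00, 0.75}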
+
+ auto it = buft_map.find(tensor_split_arr);
+ if (it != buft_map.end()) {
+ return &it->second;
+ }
+
+ struct ggml_backend_buffer_type buft {
+ /* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
+ /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
+ };
+
+ auto result = buft_map.emplace(tensor_split_arr, buft);
+ return &result.first->second;
+}
+
// host buffer type
+static const char * ggml_backend_cuda_host_buffer_type_name(ggml_backend_buffer_type_t buft) {
+ return GGML_CUDA_NAME "_Host";
+
+ UNUSED(buft);
+}
+
+static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
+ return GGML_CUDA_NAME "_Host";
+
+ UNUSED(buffer);
+}
+
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_cuda_host_free(buffer->context);
}
return ggml_backend_buft_alloc_buffer(ggml_backend_cpu_buffer_type(), size);
}
- // FIXME: this is a hack to avoid having to implement a new buffer type
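+    // reuse the CPU buffer implementation for the host allocation, overriding only the name and the free function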
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft;
+ buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
return buffer;
ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_cuda_buffer_type_host = {
/* .iface = */ {
+ /* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
// backend
-struct ggml_backend_context_cuda {
- int device;
-};
-
static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
- return GGML_CUDA_NAME;
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
- UNUSED(backend);
+ return cuda_ctx->name.c_str();
}
static void ggml_backend_cuda_free(ggml_backend_t backend) {
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
delete cuda_ctx;
delete backend;
}
static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
return ggml_backend_cuda_buffer_type(cuda_ctx->device);
}
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
}
static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
GGML_ASSERT(tensor->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && "unsupported buffer type");
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[cuda_ctx->device][0]));
}
-static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
-
- CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
-
- UNUSED(backend);
-}
-
-static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
- GGML_ASSERT(!"not implemented");
+static bool ggml_backend_cuda_cpy_tensor_async(ggml_backend_t backend, const ggml_tensor * src, ggml_tensor * dst) {
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
- return nullptr;
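+    // only handle copies into this device's dedicated buffer from another CUDA buffer;
+    // returning false lets the caller fall back to a generic copy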
+ if (dst->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) && ggml_backend_buffer_is_cuda(src->buffer)) {
+ CUDA_CHECK(cudaMemcpyAsync(dst->data, src->data, ggml_nbytes(dst), cudaMemcpyDeviceToDevice, g_cudaStreams[cuda_ctx->device][0]));
+ return true;
+ }
- UNUSED(backend);
- UNUSED(cgraph);
+ return false;
}
-static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
- GGML_ASSERT(!"not implemented");
-
- UNUSED(backend);
- UNUSED(plan);
-}
+static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
- GGML_ASSERT(!"not implemented");
+ CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[cuda_ctx->device][0]));
UNUSED(backend);
- UNUSED(plan);
}
static bool ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
- ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_cuda_set_main_device(cuda_ctx->device);
for (int i = 0; i < cgraph->n_nodes; i++) {
ggml_tensor * node = cgraph->nodes[i];
- if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE)
+ if (node->op == GGML_OP_RESHAPE || node->op == GGML_OP_TRANSPOSE || node->op == GGML_OP_VIEW || node->op == GGML_OP_PERMUTE || node->op == GGML_OP_NONE) {
continue;
+ }
- assert(node->backend == GGML_BACKEND_GPU);
+#ifndef NDEBUG
+ assert(node->backend == GGML_BACKEND_GPU || node->backend == GGML_BACKEND_GPU_SPLIT);
assert(node->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
assert(node->extra != nullptr);
for (int j = 0; j < GGML_MAX_SRC; j++) {
if (node->src[j] != nullptr) {
- assert(node->src[j]->backend == GGML_BACKEND_GPU);
+ assert(node->src[j]->backend == GGML_BACKEND_GPU || node->src[j]->backend == GGML_BACKEND_GPU_SPLIT);
assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device));
assert(node->src[j]->extra != nullptr);
}
}
+#endif
        bool ok = ggml_cuda_compute_forward(&params, node);
if (!ok) {
fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
}
GGML_ASSERT(ok);
-
-#if 0
- if (node->type == GGML_TYPE_F32) {
- cudaDeviceSynchronize();
- std::vector<float> tmp(ggml_nelements(node), 0.0f);
- cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
- printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
- ggml_type_name(node->src[0]->type),
- node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
- node->src[0]->name,
- node->src[1] ? node->src[1]->name : "none");
- double sum = 0.0;
- double sq_sum = 0.0;
- for (int i = 0; i < ggml_nelements(node); i++) {
- printf("%f ", tmp[i]);
- sum += tmp[i];
- sq_sum += tmp[i]*tmp[i];
- }
- printf("\n");
- printf("sum: %f, ", sum);
- printf("sq_sum: %f\n", sq_sum);
- }
-#endif
}
- UNUSED(backend);
-
return true;
}
UNUSED(backend);
}
-static ggml_backend_i cuda_backend_i = {
+static ggml_backend_i ggml_backend_cuda_interface = {
/* .get_name = */ ggml_backend_cuda_name,
/* .free = */ ggml_backend_cuda_free,
/* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
- /* .cpy_tensor_from_async = */ NULL,
- /* .cpy_tensor_to_async = */ NULL,
+ /* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
/* .synchronize = */ ggml_backend_cuda_synchronize,
- /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
- /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
- /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
+ /* .graph_plan_create = */ NULL,
+ /* .graph_plan_free = */ NULL,
+ /* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
/* .supports_op = */ ggml_backend_cuda_supports_op,
};
// not strictly necessary, but it may reduce the overhead of the first graph_compute
ggml_cuda_set_main_device(device);
- ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda {
- /* .device = */ device
+ ggml_backend_cuda_context * ctx = new ggml_backend_cuda_context {
+ /* .device = */ device,
+ /* .name = */ GGML_CUDA_NAME + std::to_string(device),
};
ggml_backend_t cuda_backend = new ggml_backend {
- /* .interface = */ cuda_backend_i,
+ /* .interface = */ ggml_backend_cuda_interface,
/* .context = */ ctx
};
}
bool ggml_backend_is_cuda(ggml_backend_t backend) {
- return backend->iface.get_name == ggml_backend_cuda_name;
+ return backend && backend->iface.get_name == ggml_backend_cuda_name;
+}
+
+int ggml_backend_cuda_get_device_count() {
+ return ggml_cuda_get_device_count();
+}
+
+void ggml_backend_cuda_get_device_description(int device, char * description, size_t description_size) {
+ ggml_cuda_get_device_description(device, description, description_size);
+}
+
+void ggml_backend_cuda_get_device_memory(int device, size_t * free, size_t * total) {
+ ggml_cuda_set_device(device);
+
+ CUDA_CHECK(cudaMemGetInfo(free, total));
}
+// backend registry
static ggml_backend_t ggml_backend_reg_cuda_init(const char * params, void * user_data) {
ggml_backend_t cuda_backend = ggml_backend_cuda_init((int) (intptr_t) user_data);
return cuda_backend;
#define LLAMA_API_INTERNAL
-//#define LLAMA_GGML_BACKEND_CUDA_TEST // for testing only - enables ggml-cuda through ggml-backend, disables partial offloading
#include "llama.h"
#include "unicode.h"
return std::fabs(b - a) <= abs_tol;
}
-#ifdef GGML_USE_CPU_HBM
-#include <hbwmalloc.h>
-#endif
-
static void zeros(std::ofstream & file, size_t n) {
char zero = 0;
for (size_t i = 0; i < n; ++i) {
#endif
};
-typedef void (*offload_func_t)(struct ggml_tensor * tensor);
-
-static void ggml_offload_nop(struct ggml_tensor * tensor) {
- (void) tensor;
-}
-
static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
std::vector<char> result(8, 0);
const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
return std::string(result.data(), result.size());
}
-static ggml_backend_buffer_type_t llama_default_buffer_type(int n_gpu_layers) {
+static ggml_backend_buffer_type_t llama_default_buffer_type_cpu(bool host_buffer) {
ggml_backend_buffer_type_t buft = nullptr;
-#ifdef GGML_USE_METAL
- if (n_gpu_layers > 0) {
- buft = ggml_backend_metal_buffer_type();
- }
-#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- if (n_gpu_layers > 0) {
- buft = ggml_backend_cuda_buffer_type(0);
+#if defined(GGML_USE_CUBLAS)
+ // host buffers should only be used when data is expected to be copied to/from the GPU
+ if (host_buffer) {
+ buft = ggml_backend_cuda_host_buffer_type();
}
-#elif defined(GGML_USE_CUBLAS)
- buft = ggml_backend_cuda_host_buffer_type();
#elif defined(GGML_USE_CPU_HBM)
buft = ggml_backend_cpu_hbm_buffer_type();
#endif
if (buft == nullptr) {
buft = ggml_backend_cpu_buffer_type();
}
+ return buft;
+
+ GGML_UNUSED(host_buffer);
+}
+
+static ggml_backend_buffer_type_t llama_default_buffer_type_offload(int gpu) {
+ ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_METAL
+ buft = ggml_backend_metal_buffer_type();
+#elif defined(GGML_USE_CUBLAS)
+ buft = ggml_backend_cuda_buffer_type(gpu);
+#elif defined(GGML_USE_CLBLAST)
+ buft = ggml_backend_opencl_buffer_type();
+#endif
+
+ if (buft == nullptr) {
+ buft = llama_default_buffer_type_cpu(true);
+ }
+ return buft;
+
+ GGML_UNUSED(gpu);
+}
+
+static ggml_backend_buffer_type_t llama_default_buffer_type_split(int fallback_gpu, const float * tensor_split) {
+ ggml_backend_buffer_type_t buft = nullptr;
+
+#ifdef GGML_USE_CUBLAS
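+    // a row-split buffer type is only useful with more than one device; otherwise fall back to a regular offload buffer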
+ if (ggml_backend_cuda_get_device_count() > 1) {
+ buft = ggml_backend_cuda_split_buffer_type(tensor_split);
+ }
+#endif
+ if (buft == nullptr) {
+ buft = llama_default_buffer_type_offload(fallback_gpu);
+ }
return buft;
- GGML_UNUSED(n_gpu_layers);
+ GGML_UNUSED(tensor_split);
}
//
std::vector<struct ggml_tensor *> k_l; // per layer
std::vector<struct ggml_tensor *> v_l;
- struct ggml_context * ctx = NULL;
+ std::vector<struct ggml_context *> ctxs;
+ std::vector<ggml_backend_buffer_t> bufs;
- ggml_backend_buffer_t buf = NULL;
+ size_t total_size() const {
+ size_t size = 0;
+ for (ggml_backend_buffer_t buf : bufs) {
+ size += ggml_backend_buffer_get_size(buf);
+ }
+ return size;
+ }
~llama_kv_cache() {
-#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- if (ggml_cublas_loaded()) {
- for (size_t i = 0; i < k_l.size(); ++i) {
- ggml_cuda_free_data(k_l[i]);
- ggml_cuda_free_data(v_l[i]);
- }
- }
-#endif
- if (ctx) {
+ for (struct ggml_context * ctx : ctxs) {
ggml_free(ctx);
}
-
- ggml_backend_buffer_free(buf);
+ for (ggml_backend_buffer_t buf : bufs) {
+ ggml_backend_buffer_free(buf);
+ }
}
};
std::vector<llama_layer> layers;
+ llama_split_mode split_mode;
+ int main_gpu;
int n_gpu_layers;
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
- // context
- struct ggml_context * ctx = NULL;
+ // layer -> buffer type mapping
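+    // e.g. with row split, buft_matrix is the split buffer type and buft the offload buffer type of the main GPU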
+ struct layer_buft {
+ layer_buft() : buft_matrix(nullptr), buft(nullptr) {}
+ layer_buft(ggml_backend_buffer_type_t matrix) : buft_matrix(matrix), buft(matrix) {}
+ layer_buft(ggml_backend_buffer_type_t matrix, ggml_backend_buffer_type_t other) : buft_matrix(matrix), buft(other) {}
+
+ ggml_backend_buffer_type_t buft_matrix; // matrices only - used by split buffers and backends that support only matrix multiplication
+ ggml_backend_buffer_type_t buft; // everything else
+ };
+
+ layer_buft buft_input;
+ layer_buft buft_output;
+ std::vector<layer_buft> buft_layer;
+
+    // contexts where the model tensor metadata is stored
+ std::vector<struct ggml_context *> ctxs;
- // the model memory buffer
- ggml_backend_buffer_t buf = NULL;
+ // the model memory buffers for the tensor data
+ std::vector<ggml_backend_buffer_t> bufs;
// model memory mapped file
std::unique_ptr<llama_mmap> mapping;
int64_t t_start_us = 0;
~llama_model() {
-#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- if (ggml_cublas_loaded()) {
- for (size_t i = 0; i < tensors_by_name.size(); ++i) {
- ggml_cuda_free_data(tensors_by_name[i].second);
- }
- ggml_cuda_free_scratch();
- }
-#endif
-
-#if defined(GGML_USE_CLBLAST)
- for (size_t i = 0; i < tensors_by_name.size(); ++i) {
- ggml_cl_free_data(tensors_by_name[i].second);
- }
-#endif
- if (ctx) {
+ for (struct ggml_context * ctx : ctxs) {
ggml_free(ctx);
}
-
- ggml_backend_buffer_free(buf);
+ for (ggml_backend_buffer_t buf : bufs) {
+ ggml_backend_buffer_free(buf);
+ }
}
};
struct llama_context {
llama_context(const llama_model & model) : model(model), t_start_us(model.t_start_us), t_load_us(model.t_load_us) {}
~llama_context() {
- ggml_allocr_free(alloc);
- ggml_backend_buffer_free(buf_alloc);
- ggml_backend_free(backend);
+ ggml_backend_sched_free(sched);
+
+ for (ggml_backend_t backend : backends) {
+ ggml_backend_free(backend);
+ }
}
llama_cparams cparams;
- ggml_backend_t backend = nullptr;
+ std::vector<ggml_backend_t> backends;
+#ifdef GGML_USE_METAL
+ ggml_backend_t backend_metal = nullptr;
+#endif
+ ggml_backend_t backend_cpu = nullptr;
const llama_model & model;
// memory buffers used to evaluate the model
std::vector<uint8_t> buf_compute_meta;
- ggml_backend_buffer_t buf_alloc = NULL;
- ggml_allocr * alloc = NULL;
+ ggml_backend_sched_t sched = nullptr;
+ // allocator for the input tensors
+ ggml_tallocr * alloc = nullptr;
// temporary buffer for copying data to/from the backend
std::vector<no_init<uint8_t>> buf_copy;
//
static bool llama_kv_cache_init(
- const struct llama_hparams & hparams,
struct llama_kv_cache & cache,
+ const llama_model & model,
ggml_type ktype,
ggml_type vtype,
uint32_t n_ctx,
- int n_gpu_layers,
bool offload) {
+ const struct llama_hparams & hparams = model.hparams;
+
const uint32_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const uint32_t n_embd_v_gqa = hparams.n_embd_v_gqa();
- const uint32_t n_layer = hparams.n_layer;
+ const int64_t n_layer = hparams.n_layer;
cache.has_shift = false;
cache.cells.clear();
cache.cells.resize(n_ctx);
- struct ggml_init_params params;
- params.mem_size = 2u*n_layer*ggml_tensor_overhead();
- params.mem_buffer = NULL;
- params.no_alloc = true;
-
- cache.ctx = ggml_init(params);
+#ifdef GGML_USE_CLBLAST
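+    // the CLBlast path keeps the KV cache in CPU buffers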
+ offload = false;
+#endif
- size_t vram_kv_cache = 0;
+ // count used buffer types
+ std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
+ if (offload) {
+ for (int64_t i = 0; i < n_layer; ++i) {
+ buft_layer_count[model.buft_layer[i].buft]++;
+ }
+ } else {
+ buft_layer_count[llama_default_buffer_type_cpu(true)] = n_layer;
+ }
- if (!cache.ctx) {
- LLAMA_LOG_ERROR("%s: failed to allocate memory for kv cache\n", __func__);
- return false;
+ // create a context for each buffer type
+ std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+ for (auto & it : buft_layer_count) {
+ int n_layers = it.second;
+ struct ggml_init_params params = {
+ /*.mem_size =*/ 2u*n_layers*ggml_tensor_overhead(),
+ /*.mem_buffer =*/ NULL,
+ /*.no_alloc =*/ true,
+ };
+ ggml_context * ctx = ggml_init(params);
+ if (!ctx) {
+ LLAMA_LOG_ERROR("%s: failed to allocate context for kv cache\n", __func__);
+ return false;
+ }
+ ctx_map[it.first] = ctx;
+ cache.ctxs.push_back(ctx);
}
cache.k_l.reserve(n_layer);
cache.v_l.reserve(n_layer);
- const int i_gpu_start = (int) n_layer - n_gpu_layers;
-
for (int i = 0; i < (int) n_layer; i++) {
- ggml_tensor * k = ggml_new_tensor_1d(cache.ctx, ktype, n_embd_k_gqa*n_ctx);
- ggml_tensor * v = ggml_new_tensor_1d(cache.ctx, vtype, n_embd_v_gqa*n_ctx);
+ struct ggml_context * ctx = offload ? ctx_map.at(model.buft_layer[i].buft) : cache.ctxs.front();
+ ggml_tensor * k = ggml_new_tensor_1d(ctx, ktype, n_embd_k_gqa*n_ctx);
+ ggml_tensor * v = ggml_new_tensor_1d(ctx, vtype, n_embd_v_gqa*n_ctx);
ggml_format_name(k, "cache_k_l%d", i);
ggml_format_name(v, "cache_v_l%d", i);
cache.k_l.push_back(k);
cache.v_l.push_back(v);
-#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- if (i >= i_gpu_start) {
- if (offload) {
- ggml_cuda_assign_buffers_no_scratch(k);
- ggml_cuda_assign_buffers_no_scratch(v);
- vram_kv_cache += ggml_nbytes(k);
- vram_kv_cache += ggml_nbytes(v);
- // HACK: mark tensor as allocated
- k->data = v->data = (void *)(uintptr_t)1;
- }
- }
-#endif // GGML_USE_CUBLAS
}
- // allocate tensors
- cache.buf = ggml_backend_alloc_ctx_tensors_from_buft(cache.ctx, llama_default_buffer_type(n_gpu_layers));
-
- // buf may be NULL with full offload
- if (cache.buf) {
- // initialize the buffer to avoid NaNs in the padding
- ggml_backend_buffer_clear(cache.buf, 0);
- }
-
- if (vram_kv_cache > 0) {
- LLAMA_LOG_INFO("%s: VRAM kv self = %.2f MB\n", __func__, vram_kv_cache / 1024.0 / 1024.0);
+ // allocate tensors and initialize the buffers to avoid NaNs in the padding
+ for (auto it : ctx_map) {
+ ggml_backend_buffer_type_t buft = it.first;
+ ggml_context * ctx = it.second;
+ ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+ if (!buf) {
+ LLAMA_LOG_ERROR("%s: failed to allocate buffer for kv cache\n", __func__);
+ return false;
+ }
+ ggml_backend_buffer_clear(buf, 0);
+ LLAMA_LOG_INFO("%s: %10s KV buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf)/1024.0/1024.0);
+ cache.bufs.push_back(buf);
}
- GGML_UNUSED(i_gpu_start);
- GGML_UNUSED(offload);
-
return true;
}
return get_tensor_meta(get_tensor_name(i));
}
- struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta, ggml_backend_type backend) {
+ struct ggml_tensor * create_tensor_for(struct ggml_context * ctx, struct ggml_tensor * meta) {
struct ggml_tensor * tensor = ggml_dup_tensor(ctx, meta);
- tensor->backend = backend; // TODO: ggml_set_backend
ggml_set_name(tensor, ggml_get_name(meta));
n_created++;
return tensor;
}
- struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, ggml_backend_type backend, bool required = true) {
+ struct ggml_tensor * create_tensor(struct ggml_context * ctx, const std::string & name, const std::vector<int64_t> & ne, bool required = true) {
struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, name.c_str());
if (cur == NULL) {
throw std::runtime_error(format("%s: tensor '%s' not found", __func__, name.c_str()));
}
- if (backend == GGML_BACKEND_GPU_SPLIT) {
- if (ne.size() == 1) {
- throw std::runtime_error(format("%s: 1-dimensional tensor '%s' cannot be split on the GPU", __func__, name.c_str()));
- }
- }
-
{
bool is_ok = true;
for (size_t i = 0; i < ne.size(); ++i) {
}
}
- return create_tensor_for(ctx, cur, backend);
+ return create_tensor_for(ctx, cur);
}
void done_getting_tensors() const {
return gguf_get_data_offset(ctx_gguf) + gguf_get_tensor_offset(ctx_gguf, idx);
}
- void init_mapping(bool prefetch = true) {
- /*
- // prefetch only CPU tensors
+ void init_mapping(bool prefetch = true, llama_mlock * lmlock = nullptr) {
+ // prefetch the whole file - all the data is needed anyway
if (use_mmap) {
- size_t size_pref = 0; // prefetch
+ mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
+ }
- for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
- struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
- if (cur->backend == GGML_BACKEND_CPU) {
- size_t tensor_end = gguf_get_tensor_offset(ctx_gguf, i) + ggml_nbytes(cur);
- size_pref = std::max(size_pref, tensor_end);
- }
+ // compute the total size of all tensors for progress reporting
+ for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
+ struct ggml_tensor * cur = ggml_get_tensor(ctx_meta, gguf_get_tensor_name(ctx_gguf, i));
+ size_data += ggml_nbytes(cur);
+ }
+
+ if (use_mmap && mapping) {
+ if (lmlock) {
+ lmlock->init(mapping->addr);
}
- mapping.reset(new llama_mmap(&file, gguf_get_data_offset(ctx_gguf) + size_pref, ggml_is_numa()));
+ mmap_used_first = mapping->size;
}
- */
- // prefetch the whole file - all the data is needed anyway
- if (use_mmap) {
- mapping.reset(new llama_mmap(&file, prefetch ? -1 : 0, ggml_is_numa()));
+ }
+
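+    // compute the byte range [first, last) of the mapped file that is covered by the tensors in ctx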
+ void get_mapping_range(size_t * first, size_t * last, ggml_context * ctx) const {
+ GGML_ASSERT(mapping);
+
+ *first = mapping->size;
+ *last = 0;
+ for (ggml_tensor * tensor = ggml_get_first_tensor(ctx); tensor; tensor = ggml_get_next_tensor(ctx, tensor)) {
+ const size_t offs = file_offset(ggml_get_name(tensor));
+ *first = std::min(*first, offs);
+ *last = std::max(*last, offs + ggml_nbytes(tensor));
}
}
const size_t offs = file_offset(ggml_get_name(cur));
if (use_mmap && mapping) {
- GGML_ASSERT(cur->data == nullptr);
- cur->data = (uint8_t *)mapping->addr + offs;
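+            // tensors allocated from a mmap-backed buffer alias the mapping directly;
+            // tensors allocated elsewhere get their data copied out of the mapping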
+ if (cur->data == nullptr) {
+ cur->data = (uint8_t *)mapping->addr + offs;
+ } else {
+ memcpy(cur->data, (uint8_t *)mapping->addr + offs, ggml_nbytes(cur));
+ }
} else {
GGML_ASSERT(cur->data != nullptr);
file.seek(offs, SEEK_SET);
}
}
- // Returns false if cancelled by progress_callback
- bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) const {
- size_t size_data = 0;
-
- for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
- struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
- size_data += ggml_nbytes(cur);
- }
-
- if (use_mmap && buf_mmap) {
- if (lmlock) {
- lmlock->init(mapping->addr);
- }
- }
+ size_t size_done = 0;
+ size_t size_data = 0;
+ size_t mmap_used_first = -1;
+ size_t mmap_used_last = 0;
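+    // mmap_used_* track the byte range of the mapping that actually backs tensors,
+    // so the unused head and tail can be unmapped once loading is done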
-#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
- const bool legacy_offload = true;
-#else
- const bool legacy_offload = false;
-#endif
+ // Returns false if cancelled by progress_callback
+ bool load_all_data(struct ggml_context * ctx, llama_progress_callback progress_callback, void * progress_callback_user_data, ggml_backend_buffer_t buf_mmap, llama_mlock * lmlock) {
+ GGML_ASSERT(size_data != 0 && "call init_mapping() first");
std::vector<no_init<uint8_t>> read_buf;
- size_t size_done = 0;
-
- size_t mmap_first = -1;
- size_t mmap_last = 0;
-
for (int i = 0; i < gguf_get_n_tensors(ctx_gguf); i++) {
struct ggml_tensor * cur = ggml_get_tensor(ctx, gguf_get_tensor_name(ctx_gguf, i));
- GGML_ASSERT(cur); // unused tensors should have been caught by load_data already
+ if (!cur) {
+ // some tensors may be allocated in a different context
+ continue;
+ }
if (progress_callback) {
if (!progress_callback((float) size_done / size_data, progress_callback_user_data)) {
const size_t offs = file_offset(ggml_get_name(cur));
- if (!legacy_offload || cur->backend == GGML_BACKEND_CPU) {
- if (use_mmap && mapping) {
- if (buf_mmap) {
- ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
- if (lmlock) {
- lmlock->grow_to(offs + ggml_nbytes(cur));
- }
- mmap_first = std::min(mmap_first, offs);
- mmap_last = std::max(mmap_last, offs + ggml_nbytes(cur));
- } else {
- ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
+ if (use_mmap && mapping) {
+ if (buf_mmap && cur->data == nullptr) {
+ ggml_backend_tensor_alloc(buf_mmap, cur, (uint8_t *) mapping->addr + offs);
+ if (lmlock) {
+ lmlock->grow_to(offs + ggml_nbytes(cur));
}
+ mmap_used_first = std::min(mmap_used_first, offs);
+ mmap_used_last = std::max(mmap_used_last, offs + ggml_nbytes(cur));
} else {
- if (ggml_backend_buffer_is_host(cur->buffer)) {
- file.seek(offs, SEEK_SET);
- file.read_raw(cur->data, ggml_nbytes(cur));
- } else {
- read_buf.resize(ggml_nbytes(cur));
- file.seek(offs, SEEK_SET);
- file.read_raw(read_buf.data(), ggml_nbytes(cur));
- ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
- }
+ ggml_backend_tensor_set(cur, (uint8_t *) mapping->addr + offs, 0, ggml_nbytes(cur));
}
} else {
- // HACK: mark tensor as allocated
- cur->data = (void *)(uintptr_t)1;
- void * data;
- if (use_mmap && mapping) {
- data = (uint8_t *) mapping->addr + offs;
+ if (ggml_backend_buffer_is_host(cur->buffer)) {
+ file.seek(offs, SEEK_SET);
+ file.read_raw(cur->data, ggml_nbytes(cur));
} else {
read_buf.resize(ggml_nbytes(cur));
file.seek(offs, SEEK_SET);
file.read_raw(read_buf.data(), ggml_nbytes(cur));
- data = read_buf.data();
+ ggml_backend_tensor_set(cur, read_buf.data(), 0, ggml_nbytes(cur));
}
-
-#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- ggml_cuda_transform_tensor(data, cur);
-#elif defined(GGML_USE_CLBLAST)
- GGML_ASSERT(cur->backend == GGML_BACKEND_GPU);
- ggml_cl_transform_tensor(data, cur);
-#else
- GGML_ASSERT(!"GPU tensor without a GPU backend");
- GGML_UNUSED(data);
-#endif
}
size_done += ggml_nbytes(cur);
}
- // unmap offloaded tensors and metadata
- if (use_mmap && mapping) {
- mapping->unmap_fragment(0, mmap_first);
- mapping->unmap_fragment(mmap_last, mapping->size);
+ // check if this is the last call and do final cleanup
+ if (size_done >= size_data) {
+ // unmap offloaded tensors and metadata
+ if (use_mmap && mapping) {
+ mapping->unmap_fragment(0, mmap_used_first);
+ if (mmap_used_last != 0) {
+ mapping->unmap_fragment(mmap_used_last, mapping->size);
+ }
+ }
+ if (progress_callback) {
+ // Even though the model is done loading, we still honor
+ // cancellation since we need to free allocations.
+ return progress_callback(1.0f, progress_callback_user_data);
+ }
}
- if (progress_callback) {
- // Even though the model is done loading, we still honor
- // cancellation since we need to free allocations.
- return progress_callback(1.0f, progress_callback_user_data);
- }
return true;
}
};
llama_model_loader & ml,
llama_model & model,
int n_gpu_layers,
+ enum llama_split_mode split_mode,
int main_gpu,
const float * tensor_split,
bool use_mlock,
void * progress_callback_user_data) {
model.t_start_us = ggml_time_us();
- auto & ctx = model.ctx;
auto & hparams = model.hparams;
+ model.split_mode = split_mode;
+ model.main_gpu = main_gpu;
model.n_gpu_layers = n_gpu_layers;
- size_t ctx_size = ggml_tensor_overhead() * ml.n_tensors;
+ const int64_t n_layer = hparams.n_layer;
+ const int64_t i_gpu_start = std::max((int64_t) hparams.n_layer - n_gpu_layers, (int64_t) 0);
+
+ // there is very little benefit to offloading the input layer, so always keep it on the CPU
+ model.buft_input = llama_default_buffer_type_cpu(true);
- LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, ctx_size/1024.0/1024.0);
+ model.buft_layer.resize(n_layer);
- // create the ggml context
+ // assign cpu layers
+ for (int64_t i = 0; i < i_gpu_start; ++i) {
+ model.buft_layer[i] = llama_default_buffer_type_cpu(true);
+ }
+
+#ifdef GGML_USE_CUBLAS
+ if (split_mode == LLAMA_SPLIT_LAYER) {
+ // calculate the split points
+ int device_count = ggml_backend_cuda_get_device_count();
+ bool all_zero = tensor_split == nullptr || std::all_of(tensor_split, tensor_split + device_count, [](float x) { return x == 0.0f; });
+ float splits[GGML_CUDA_MAX_DEVICES];
+ if (all_zero) {
+ // default split, by free memory
+ for (int i = 0; i < device_count; ++i) {
+ size_t total;
+ size_t free;
+                ggml_backend_cuda_get_device_memory(i, &free, &total);
+ splits[i] = free;
+ }
+ } else {
+ std::copy(tensor_split, tensor_split + device_count, splits);
+ }
+
+ // sum and normalize the splits to get the split points
+ float split_sum = 0.0f;
+ for (int i = 0; i < device_count; ++i) {
+ split_sum += splits[i];
+ splits[i] = split_sum;
+ }
+ for (int i = 0; i < device_count; ++i) {
+ splits[i] /= split_sum;
+ }
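+        // splits[] now holds each device's cumulative, normalized share,
+        // e.g. free memory of {3 GiB, 1 GiB} becomes split points {0.75, 1.00}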
+
+ // assign the repeating layers to the devices according to the splits
+ int act_gpu_layers = std::min(n_gpu_layers, (int)n_layer + 1);
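+        // a layer at normalized position p goes to the first device whose split point exceeds p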
+ for (int64_t i = i_gpu_start; i < n_layer; ++i) {
+ int layer_gpu = std::upper_bound(splits, splits + device_count, float(i - i_gpu_start)/act_gpu_layers) - splits;
+ model.buft_layer[i] = llama_default_buffer_type_offload(layer_gpu);
+ }
+ // assign the output layer
+ if (n_gpu_layers > n_layer) {
+ int layer_gpu = std::upper_bound(splits, splits + device_count, float(act_gpu_layers - 1)/act_gpu_layers) - splits;
+ model.buft_output = llama_default_buffer_type_offload(layer_gpu);
+ } else {
+ model.buft_output = llama_default_buffer_type_cpu(true);
+ }
+ } else
+#endif
{
+ ggml_backend_buffer_type_t split_buft;
+ if (split_mode == LLAMA_SPLIT_ROW) {
+ split_buft = llama_default_buffer_type_split(main_gpu, tensor_split);
+ } else {
+ // LLAMA_SPLIT_NONE or LLAMA_SPLIT_LAYER in backends where it is not supported
+ split_buft = llama_default_buffer_type_offload(main_gpu);
+ }
+ // assign the repeating layers
+ for (int64_t i = i_gpu_start; i < n_layer; ++i) {
+ model.buft_layer[i] = {
+ split_buft,
+ llama_default_buffer_type_offload(main_gpu)
+ };
+ }
+ // assign the output layer
+ if (n_gpu_layers > n_layer) {
+ model.buft_output = {
+ split_buft,
+ llama_default_buffer_type_offload(main_gpu)
+ };
+ } else {
+ model.buft_output = llama_default_buffer_type_cpu(true);
+ }
+ }
+
+ // count used buffer types
+ std::map<ggml_backend_buffer_type_t, int> buft_layer_count;
+ buft_layer_count[model.buft_input.buft]++;
+ buft_layer_count[model.buft_input.buft_matrix]++;
+ buft_layer_count[model.buft_output.buft]++;
+ buft_layer_count[model.buft_output.buft_matrix]++;
+ for (int64_t i = 0; i < n_layer; ++i) {
+ buft_layer_count[model.buft_layer[i].buft]++;
+ buft_layer_count[model.buft_layer[i].buft_matrix]++;
+ }
+
+ // create one context per buffer type
+ size_t ctx_size = ggml_tensor_overhead()*ml.n_tensors;
+ std::map<ggml_backend_buffer_type_t, ggml_context *> ctx_map;
+ for (auto & it : buft_layer_count) {
struct ggml_init_params params = {
/*.mem_size =*/ ctx_size,
/*.mem_buffer =*/ NULL,
/*.no_alloc =*/ true,
};
-
- model.ctx = ggml_init(params);
- if (!model.ctx) {
- throw std::runtime_error(format("ggml_init() failed"));
+ ggml_context * ctx = ggml_init(params);
+ if (!ctx) {
+ throw std::runtime_error(format("failed to create context"));
}
+ ctx_map[it.first] = ctx;
+ model.ctxs.push_back(ctx);
}
- (void) main_gpu;
-
- enum ggml_backend_type llama_backend_offload = GGML_BACKEND_CPU;
- enum ggml_backend_type llama_backend_offload_split = GGML_BACKEND_CPU;
-
-#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- if (ggml_cublas_loaded()) {
- LLAMA_LOG_INFO("%s: using " GGML_CUDA_NAME " for GPU acceleration\n", __func__);
- ggml_cuda_set_main_device(main_gpu);
-
- llama_backend_offload = GGML_BACKEND_GPU;
- llama_backend_offload_split = GGML_BACKEND_GPU_SPLIT;
- }
-#elif defined(GGML_USE_CLBLAST)
- LLAMA_LOG_INFO("%s: using OpenCL for GPU acceleration\n", __func__);
- llama_backend_offload = GGML_BACKEND_GPU;
- llama_backend_offload_split = GGML_BACKEND_GPU;
-#endif
+ LLAMA_LOG_INFO("%s: ggml ctx size = %7.2f MiB\n", __func__, model.ctxs.size()*ctx_size/1024.0/1024.0);
// create tensors for the weights
{
const int64_t n_embd = hparams.n_embd;
const int64_t n_embd_k_gqa = hparams.n_embd_k_gqa();
const int64_t n_embd_v_gqa = hparams.n_embd_v_gqa();
- const int64_t n_layer = hparams.n_layer;
+ const int64_t n_embd_gqa = n_embd_v_gqa;
const int64_t n_vocab = hparams.n_vocab;
+ const int64_t n_ff = hparams.n_ff;
+
+ GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
+
+ ggml_context * ctx_input = ctx_map.at(model.buft_input.buft);
+ ggml_context * ctx_output = ctx_map.at(model.buft_output.buft);
+ ggml_context * ctx_output_split = ctx_map.at(model.buft_output.buft_matrix);
+ auto ctx_for_layer = [&](int i) { return ctx_map.at(model.buft_layer[i].buft); };
+ auto ctx_for_layer_split = [&](int i) { return ctx_map.at(model.buft_layer[i].buft_matrix); };
+
+ model.layers.resize(n_layer);
const auto tn = LLM_TN(model.arch);
switch (model.arch) {
case LLM_ARCH_LLAMA:
case LLM_ARCH_REFACT:
{
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
// output
{
- ggml_backend_type backend_norm;
- ggml_backend_type backend_output;
-
- if (n_gpu_layers > int(n_layer)) {
- backend_norm = llama_backend_offload;
- backend_output = llama_backend_offload_split;
- } else {
- backend_norm = GGML_BACKEND_CPU;
- backend_output = GGML_BACKEND_CPU;
- }
-
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}
- const uint32_t n_ff = hparams.n_ff;
- const int64_t n_embd_gqa = n_embd_v_gqa;
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
- const int i_gpu_start = n_layer - n_gpu_layers;
-
- model.layers.resize(n_layer);
-
- for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
- layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
- layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
// optional bias tensors
- layer.bq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, backend, false);
- layer.bk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, backend, false);
- layer.bv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, backend, false);
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend, false);
+ layer.bq = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q, "bias", i), {n_embd}, false);
+ layer.bk = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K, "bias", i), {n_embd_gqa}, false);
+ layer.bv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_V, "bias", i), {n_embd_gqa}, false);
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, false);
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_gate_inp = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, backend, false);
+ layer.ffn_gate_inp = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_GATE_INP, "weight", i), {n_embd}, false);
if (layer.ffn_gate_inp == nullptr) {
GGML_ASSERT(hparams.n_expert == 0);
GGML_ASSERT(hparams.n_expert_used == 0);
- layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
} else {
GGML_ASSERT(hparams.n_expert > 0);
GGML_ASSERT(hparams.n_expert_used > 0);
// MoE branch
for (uint32_t x = 0; x < hparams.n_expert; ++x) {
- layer.ffn_gate_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
- layer.ffn_down_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd}, backend_split);
- layer.ffn_up_exp[x] = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff}, backend_split);
+ layer.ffn_gate_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE_EXP, "weight", i, x), {n_embd, n_ff});
+ layer.ffn_down_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN_EXP, "weight", i, x), { n_ff, n_embd});
+ layer.ffn_up_exp[x] = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP_EXP, "weight", i, x), {n_embd, n_ff});
}
}
}
} break;
case LLM_ARCH_BAICHUAN:
{
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
{
- ggml_backend_type backend_norm;
- ggml_backend_type backend_output;
-
- if (n_gpu_layers > int(n_layer)) {
- backend_norm = llama_backend_offload;
- backend_output = llama_backend_offload_split;
- } else {
- backend_norm = GGML_BACKEND_CPU;
- backend_output = GGML_BACKEND_CPU;
- }
-
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}
- const uint32_t n_ff = hparams.n_ff;
- const int64_t n_embd_gqa = n_embd_v_gqa;
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
- const int i_gpu_start = n_layer - n_gpu_layers;
-
- model.layers.resize(n_layer);
-
- for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
- layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
- layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
}
} break;
case LLM_ARCH_FALCON:
{
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
// output
{
- ggml_backend_type backend_norm;
- ggml_backend_type backend_output;
-
- if (n_gpu_layers > int(n_layer)) {
- backend_norm = llama_backend_offload;
- backend_output = llama_backend_offload_split;
- } else {
- backend_norm = GGML_BACKEND_CPU;
- backend_output = GGML_BACKEND_CPU;
- }
-
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}
- const uint32_t n_ff = hparams.n_ff;
- const int64_t n_embd_gqa = n_embd_v_gqa;
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
- const int i_gpu_start = n_layer - n_gpu_layers;
-
- model.layers.resize(n_layer);
-
- for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
if (gguf_find_tensor(ml.ctx_gguf, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i).c_str()) >= 0) {
- layer.attn_norm_2 = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd}, backend);
- layer.attn_norm_2_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd}, backend);
+ layer.attn_norm_2 = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "weight", i), {n_embd});
+ layer.attn_norm_2_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM_2, "bias", i), {n_embd});
}
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
}
} break;
case LLM_ARCH_STARCODER:
{
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
- model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
// output
{
- ggml_backend_type backend_norm;
- ggml_backend_type backend_output;
-
- if (n_gpu_layers > int(n_layer)) {
- backend_norm = llama_backend_offload;
- backend_output = llama_backend_offload_split;
- } else {
- backend_norm = GGML_BACKEND_CPU;
- backend_output = GGML_BACKEND_CPU;
- }
-
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}
- const uint32_t n_ff = hparams.n_ff;
- const int64_t n_embd_gqa = n_embd_v_gqa;
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
- const int i_gpu_start = n_layer - n_gpu_layers;
-
- model.layers.resize(n_layer);
-
- for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
- layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
- layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
- layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
}
} break;
case LLM_ARCH_PERSIMMON:
{
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
{
- ggml_backend_type backend_norm;
- ggml_backend_type backend_output;
-
- if (n_gpu_layers > int(n_layer)) {
- backend_norm = llama_backend_offload;
- backend_output = llama_backend_offload_split;
- } else {
- backend_norm = GGML_BACKEND_CPU;
- backend_output = GGML_BACKEND_CPU;
- }
-
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}
- const uint32_t n_ff = hparams.n_ff;
- const int64_t n_embd_gqa = n_embd_v_gqa;
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
- const int i_gpu_start = n_layer - n_gpu_layers;
- model.layers.resize(n_layer);
- for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload;
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split;
auto & layer = model.layers[i];
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
- layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
- layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
- layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
- layer.attn_q_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64}, backend);
- layer.attn_q_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64}, backend);
- layer.attn_k_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64}, backend);
- layer.attn_k_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64}, backend);
+
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
+
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
+
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
+
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
+
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
+
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
+
+ layer.attn_q_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "weight", i), {64});
+ layer.attn_q_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_Q_NORM, "bias", i), {64});
+
+ layer.attn_k_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "weight", i), {64});
+ layer.attn_k_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_K_NORM, "bias", i), {64});
}
} break;
case LLM_ARCH_BLOOM:
{
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
- model.tok_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd}, GGML_BACKEND_CPU);
- model.tok_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd}, GGML_BACKEND_CPU);
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ model.tok_norm = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "weight"), {n_embd});
+ model.tok_norm_b = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD_NORM, "bias"), {n_embd});
// output
{
- ggml_backend_type backend_norm;
- ggml_backend_type backend_output;
-
- if (n_gpu_layers > int(n_layer)) {
- backend_norm = llama_backend_offload;
- backend_output = llama_backend_offload_split;
- } else {
- backend_norm = GGML_BACKEND_CPU;
- backend_output = GGML_BACKEND_CPU;
- }
-
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}
- const uint32_t n_ff = hparams.n_ff;
- const int64_t n_embd_gqa = n_embd_v_gqa;
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
- const int i_gpu_start = n_layer - n_gpu_layers;
-
- model.layers.resize(n_layer);
-
- for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
- layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
- layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
- layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
}
} break;
case LLM_ARCH_MPT:
{
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+
// output
{
- ggml_backend_type backend_norm;
- ggml_backend_type backend_output;
-
- if (n_gpu_layers > int(n_layer)) {
- backend_norm = llama_backend_offload;
- backend_output = llama_backend_offload_split;
- } else {
- backend_norm = GGML_BACKEND_CPU;
- backend_output = GGML_BACKEND_CPU;
- }
-
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}
- const uint32_t n_ff = hparams.n_ff;
- const int64_t n_embd_gqa = n_embd_v_gqa;
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
- const int i_gpu_start = n_layer - n_gpu_layers;
-
- model.layers.resize(n_layer);
-
- for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
// AWQ ScaleActivation layer
- layer.ffn_act = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, backend, false);
+ layer.ffn_act = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_ACT, "scales", i), {n_ff}, false);
}
} break;
case LLM_ARCH_STABLELM:
{
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
// output
{
- ggml_backend_type backend_norm;
- ggml_backend_type backend_output;
-
- if (n_gpu_layers > int(n_layer)) {
- backend_norm = llama_backend_offload;
- backend_output = llama_backend_offload_split;
- } else {
- backend_norm = GGML_BACKEND_CPU;
- backend_output = GGML_BACKEND_CPU;
- }
-
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}
- const uint32_t n_ff = hparams.n_ff;
- const int64_t n_embd_gqa = n_embd_v_gqa;
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
- const int i_gpu_start = n_layer - n_gpu_layers;
-
- model.layers.resize(n_layer);
-
- for (uint32_t i = 0; i < n_layer; ++i) {
- /*
- llama_model_loader: - tensor 4: blk.0.attn_output.weight f16 [ 2560, 2560, 1, 1 ]
- */
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
- layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
- layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
- layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
- layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
- layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
}
} break;
case LLM_ARCH_QWEN:
{
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
- {
- ggml_backend_type backend_norm;
- ggml_backend_type backend_output;
-
- if (n_gpu_layers > int(n_layer)) {
- backend_norm = llama_backend_offload;
- backend_output = llama_backend_offload_split;
- } else {
- backend_norm = GGML_BACKEND_CPU;
- backend_output = GGML_BACKEND_CPU;
- }
-
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
- }
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
- const uint32_t n_ff = hparams.n_ff / 2;
-
- const int i_gpu_start = n_layer - n_gpu_layers;
-
- model.layers.resize(n_layer);
+ // output
+ {
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ }
- for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd * 3}, backend_split);
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd * 3}, backend);
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd*3});
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd*3});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
- layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff/2});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff/2, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff/2});
}
} break;
case LLM_ARCH_PHI2:
{
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
// output
{
- ggml_backend_type backend_norm;
- ggml_backend_type backend_output;
-
- if (n_gpu_layers > int(n_layer)) {
- backend_norm = llama_backend_offload;
- backend_output = llama_backend_offload;
- } else {
- backend_norm = GGML_BACKEND_CPU;
- backend_output = GGML_BACKEND_CPU;
- }
-
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
- model.output_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab}, backend_output);
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
+ model.output_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT, "bias"), {n_vocab});
}
- const uint32_t n_ff = hparams.n_ff;
- const int64_t n_embd_gqa = n_embd_v_gqa;
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
- const int i_gpu_start = n_layer - n_gpu_layers;
-
- model.layers.resize(n_layer);
-
- for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
- layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
- layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
}
} break;
case LLM_ARCH_PLAMO:
{
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
// output
{
- ggml_backend_type backend_norm;
- ggml_backend_type backend_output;
-
- if (n_gpu_layers > int(n_layer)) {
- backend_norm = llama_backend_offload;
- backend_output = llama_backend_offload_split;
- } else {
- backend_norm = GGML_BACKEND_CPU;
- backend_output = GGML_BACKEND_CPU;
- }
-
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}
- const uint32_t n_ff = hparams.n_ff;
- const int64_t n_embd_gqa = n_embd_v_gqa;
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
- const int i_gpu_start = n_layer - n_gpu_layers;
-
- model.layers.resize(n_layer);
-
- for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
- layer.wq = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd}, backend_split);
- layer.wk = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa}, backend_split);
- layer.wv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa}, backend_split);
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
+ layer.wq = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_Q, "weight", i), {n_embd, n_embd});
+ layer.wk = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_K, "weight", i), {n_embd, n_embd_gqa});
+ layer.wv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_V, "weight", i), {n_embd, n_embd_gqa});
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
- layer.ffn_gate = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff}, backend_split);
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd}, backend_split);
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
+ layer.ffn_gate = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_GATE, "weight", i), {n_embd, n_ff});
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), { n_ff, n_embd});
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
}
} break;
case LLM_ARCH_GPT2:
{
- model.tok_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab}, GGML_BACKEND_CPU);
- model.pos_embd = ml.create_tensor(ctx, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train}, GGML_BACKEND_CPU);
+ model.tok_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_TOKEN_EMBD, "weight"), {n_embd, n_vocab});
+ model.pos_embd = ml.create_tensor(ctx_input, tn(LLM_TENSOR_POS_EMBD, "weight"), {n_embd, hparams.n_ctx_train});
// output
{
- ggml_backend_type backend_norm;
- ggml_backend_type backend_output;
-
- if (n_gpu_layers > int(n_layer)) {
- backend_norm = llama_backend_offload;
- backend_output = llama_backend_offload_split;
- } else {
- backend_norm = GGML_BACKEND_CPU;
- backend_output = GGML_BACKEND_CPU;
- }
-
- model.output_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd}, backend_norm);
- model.output_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd}, backend_norm);
- model.output = ml.create_tensor(ctx, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab}, backend_output);
+ model.output_norm = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "weight"), {n_embd});
+ model.output_norm_b = ml.create_tensor(ctx_output, tn(LLM_TENSOR_OUTPUT_NORM, "bias"), {n_embd});
+ model.output = ml.create_tensor(ctx_output_split, tn(LLM_TENSOR_OUTPUT, "weight"), {n_embd, n_vocab});
}
- const uint32_t n_ff = hparams.n_ff;
- const int64_t n_embd_gqa = n_embd_v_gqa;
- GGML_ASSERT(n_embd_gqa == n_embd / hparams.n_gqa());
- GGML_ASSERT(n_embd_gqa == n_embd_k_gqa);
-
- const int i_gpu_start = n_layer - n_gpu_layers;
-
- model.layers.resize(n_layer);
-
- for (uint32_t i = 0; i < n_layer; ++i) {
- const ggml_backend_type backend = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload; // NOLINT
- const ggml_backend_type backend_split = int(i) < i_gpu_start ? GGML_BACKEND_CPU : llama_backend_offload_split; // NOLINT
+ for (int i = 0; i < n_layer; ++i) {
+ ggml_context * ctx_layer = ctx_for_layer(i);
+ ggml_context * ctx_split = ctx_for_layer_split(i);
auto & layer = model.layers[i];
- layer.attn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd}, backend);
- layer.attn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd}, backend);
+ layer.attn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "weight", i), {n_embd});
+ layer.attn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_NORM, "bias", i), {n_embd});
- layer.wqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa}, backend_split);
- layer.bqkv = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa}, backend);
+ layer.wqkv = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_QKV, "weight", i), {n_embd, n_embd + 2*n_embd_gqa});
+ layer.bqkv = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_QKV, "bias", i), {n_embd + 2*n_embd_gqa});
- layer.wo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, backend_split);
- layer.bo = ml.create_tensor(ctx, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd}, backend);
+ layer.wo = ml.create_tensor(ctx_split, tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd});
+ layer.bo = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_ATTN_OUT, "bias", i), {n_embd});
- layer.ffn_norm = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, backend);
- layer.ffn_norm_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd}, backend);
+ layer.ffn_norm = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd});
+ layer.ffn_norm_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_NORM, "bias", i), {n_embd});
- layer.ffn_down = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd}, backend_split);
- layer.ffn_down_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd}, backend);
+ layer.ffn_down = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_DOWN, "weight", i), {n_ff, n_embd});
+ layer.ffn_down_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_DOWN, "bias", i), {n_embd});
- layer.ffn_up = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff}, backend_split);
- layer.ffn_up_b = ml.create_tensor(ctx, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff}, backend);
+ layer.ffn_up = ml.create_tensor(ctx_split, tn(LLM_TENSOR_FFN_UP, "weight", i), {n_embd, n_ff});
+ layer.ffn_up_b = ml.create_tensor(ctx_layer, tn(LLM_TENSOR_FFN_UP, "bias", i), {n_ff});
}
} break;
default:
ml.done_getting_tensors();
- ml.init_mapping();
+ ml.init_mapping(true, use_mlock ? &model.mlock_mmap : nullptr);
- // allocate tensors
- size_t vram_weights = 0;
- size_t buf_size = 0;
+ // create the backend buffers
+ std::vector<std::pair<ggml_context *, ggml_backend_buffer_t>> ctx_bufs;
- ggml_backend_buffer_type_t buft = llama_default_buffer_type(n_gpu_layers);
+ for (auto & it : ctx_map) {
+ ggml_backend_buffer_type_t buft = it.first;
+ ggml_context * ctx = it.second;
+ ggml_backend_buffer_t buf = nullptr;
- for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
- // GGML_BACKEND_GPU tensors are for CUDA and OpenCL only, which are handled separately without ggml-backend
- if (t->backend == GGML_BACKEND_CPU) {
- buf_size += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), ggml_backend_buft_get_alignment(buft));
- } else {
- vram_weights += ggml_nbytes(t);
+ // only the mmap region containing the tensors in the model is mapped to the backend buffer
+ // this is important for metal with apple silicon: if the entire model could be mapped to a metal buffer, then we could just use metal for all layers
+ // this allows using partial offloading when the model size exceeds the metal buffer size, but not the RAM size
+ if (ml.use_mmap && buft == llama_default_buffer_type_cpu(true)) {
+ size_t first, last;
+ ml.get_mapping_range(&first, &last, ctx);
+ buf = ggml_backend_cpu_buffer_from_ptr((char *) ml.mapping->addr + first, last - first);
}
- }
-
- // create backend buffer
- ggml_backend_buffer_t buf_mmap = nullptr;
-
#ifdef GGML_USE_METAL
- if (n_gpu_layers > 0) {
- if (ml.use_mmap) {
+ else if (ml.use_mmap && buft == ggml_backend_metal_buffer_type()) {
const size_t max_size = ggml_get_max_tensor_size(ctx);
- model.buf = ggml_backend_metal_buffer_from_ptr(ml.mapping->addr, ml.mapping->size, max_size);
- buf_mmap = model.buf;
- } else {
- model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_metal_buffer_type());
+ size_t first, last;
+ ml.get_mapping_range(&first, &last, ctx);
+ buf = ggml_backend_metal_buffer_from_ptr((char *) ml.mapping->addr + first, last - first, max_size);
}
- }
-#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- // for testing only
- if (n_gpu_layers > 0) {
- model.buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, ggml_backend_cuda_buffer_type(0));
- }
#endif
-
- if (model.buf == nullptr) {
- // CPU backend, and indirectly CUDA and OpenCL
- if (ml.use_mmap) {
- model.buf = ggml_backend_cpu_buffer_from_ptr(ml.mapping->addr, ml.mapping->size);
- buf_mmap = model.buf;
- } else {
- // allocate only CPU tensors
- model.buf = ggml_backend_buft_alloc_buffer(buft, buf_size);
- ggml_tallocr_t alloc = ggml_tallocr_new_from_buffer(model.buf);
- for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != nullptr; t = ggml_get_next_tensor(ctx, t)) {
- if (t->backend == GGML_BACKEND_CPU) {
- ggml_tallocr_alloc(alloc, t);
- }
+ else {
+ buf = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);
+ if (buf != nullptr && use_mlock && ggml_backend_buffer_is_host(buf)) {
+ model.mlock_buf.init (ggml_backend_buffer_get_base(buf));
+ model.mlock_buf.grow_to(ggml_backend_buffer_get_size(buf));
}
- ggml_tallocr_free(alloc);
}
- }
-
- if (use_mlock && ggml_backend_buffer_is_host(model.buf)) {
- model.mlock_buf.init (ggml_backend_buffer_get_base(model.buf));
- model.mlock_buf.grow_to(ggml_backend_buffer_get_size(model.buf));
+ if (buf == nullptr) {
+ throw std::runtime_error("failed to allocate buffer");
+ }
+ // indicate that this buffer contains weights
+ // this is used by ggml_backend_sched to improve op scheduling -> ops that use a weight are preferably scheduled to the backend that contains the weight
+ ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);
+ model.bufs.push_back(buf);
+ ctx_bufs.emplace_back(ctx, buf);
}
// print memory requirements
{
- size_t sys_mem_required = ctx_size + buf_size;
-
- if (sys_mem_required > 0) {
- LLAMA_LOG_INFO("%s: system memory used = %7.2f MiB\n", __func__, sys_mem_required / 1024.0 / 1024.0);
- }
- if (vram_weights > 0) {
- LLAMA_LOG_INFO("%s: VRAM used = %7.2f MiB\n", __func__, vram_weights / 1024.0 / 1024.0);
- }
-
-#if (defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)) || defined(GGML_USE_CLBLAST)
const int n_gpu = std::min(n_gpu_layers, int(hparams.n_layer));
LLAMA_LOG_INFO("%s: offloading %d repeating layers to GPU\n", __func__, n_gpu);
const int max_offloadable_layers = hparams.n_layer + 1;
LLAMA_LOG_INFO("%s: offloaded %d/%d layers to GPU\n", __func__, std::min(n_gpu_layers, max_offloadable_layers), max_backend_supported_layers);
-#endif // defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST)
- }
-#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- ggml_cuda_set_tensor_split(tensor_split);
-#else
- GGML_UNUSED(tensor_split);
-#endif // GGML_USE_CUBLAS
+ for (ggml_backend_buffer_t buf : model.bufs) {
+ LLAMA_LOG_INFO("%s: %10s buffer size = %8.2f MiB\n", __func__, ggml_backend_buffer_name(buf), ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
+ }
+ }
// populate tensors_by_name
- for (int i = 0; i < ml.n_tensors; ++i) {
- struct ggml_tensor * cur = ggml_get_tensor(ctx, ml.get_tensor_name(i));
- model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
+ for (ggml_context * ctx : model.ctxs) {
+ for (auto * cur = ggml_get_first_tensor(ctx); cur != NULL; cur = ggml_get_next_tensor(ctx, cur)) {
+ model.tensors_by_name.emplace_back(ggml_get_name(cur), cur);
+ }
}
- if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf_mmap, use_mlock ? &model.mlock_mmap : NULL)) {
- return false;
+ // load tensor data
+ for (auto & it : ctx_bufs) {
+ ggml_context * ctx = it.first;
+ ggml_backend_buffer_t buf = it.second;
+ if (!ml.load_all_data(ctx, progress_callback, progress_callback_user_data, buf, use_mlock ? &model.mlock_mmap : NULL)) {
+ return false;
+ }
}
model.mapping = std::move(ml.mapping);
}
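
The allocation loop above is the core of the new loading path: every buffer type gets its own no_alloc ggml context, a single backend buffer is allocated for all tensors in that context, and the buffer is tagged as holding weights so the scheduler keeps matching ops on the backend that owns them. A minimal, self-contained sketch of the same pattern (the tensor shape and names are illustrative only, not part of this change):

#include <stdio.h>

#include "ggml.h"
#include "ggml-alloc.h"
#include "ggml-backend.h"

static void alloc_weights_sketch(void) {
    // metadata-only context: tensor data lives in the backend buffer allocated below
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead() * 8,
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * w = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, 64, 64);
    ggml_set_name(w, "w");

    // allocate one buffer for every tensor in the context from the chosen buffer type
    ggml_backend_buffer_type_t buft = ggml_backend_cpu_buffer_type();
    ggml_backend_buffer_t      buf  = ggml_backend_alloc_ctx_tensors_from_buft(ctx, buft);

    // tag the buffer as weights so that ggml_backend_sched prefers running ops
    // that read these tensors on the backend that owns this buffer
    ggml_backend_buffer_set_usage(buf, GGML_BACKEND_BUFFER_USAGE_WEIGHTS);

    printf("%s buffer size = %.2f KiB\n",
           ggml_backend_buffer_name(buf),
           ggml_backend_buffer_get_size(buf) / 1024.0);

    ggml_backend_buffer_free(buf);
    ggml_free(ctx);
}
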
if (!llm_load_tensors(
- ml, model, params.n_gpu_layers, params.main_gpu, params.tensor_split, params.use_mlock,
+ ml, model, params.n_gpu_layers, params.split_mode, params.main_gpu, params.tensor_split, params.use_mlock,
params.progress_callback, params.progress_callback_user_data
)) {
return -2;
}
} catch (const std::exception & err) {
- LLAMA_LOG_ERROR("error loading model: %s\n", err.what());
+ LLAMA_LOG_ERROR("%s: error loading model: %s\n", __func__, err.what());
return -1;
}
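
The call above now forwards params.split_mode alongside the existing GPU parameters. As a caller-side sketch only: the field names below come from the parameters threaded through llm_load_tensors in this change, while LLAMA_SPLIT_LAYER is an assumed enum value for the new split mode and is not shown in this hunk.

#include "llama.h"

int main(void) {
    llama_backend_init(false /*numa*/);

    struct llama_model_params mparams = llama_model_default_params();
    mparams.n_gpu_layers = 99;                // offload as many layers as the backend supports
    mparams.split_mode   = LLAMA_SPLIT_LAYER; // assumed name: distribute whole layers across devices
    mparams.main_gpu     = 0;                 // device that receives the non-split tensors

    struct llama_model * model = llama_load_model_from_file("model.gguf", mparams);
    if (model == NULL) {
        return 1; // llama_model_load() reports -1/-2 internally; the public API returns NULL
    }

    llama_free_model(model);
    llama_backend_free();
    return 0;
}
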
do_rope_shift (worst_case || kv_self.has_shift),
cb (cb),
buf_compute_meta (lctx.buf_compute_meta) {
- GGML_ASSERT(!!kv_self.ctx);
-
// all initializations should be done in init()
}
cb(Vcur, "Vcur", il);
}
+ // these nodes are added to the graph together so that they are not reordered
+ // by doing so, the number of splits in the graph is reduced
+ ggml_build_forward_expand(gf, Qcur);
+ ggml_build_forward_expand(gf, Kcur);
+ ggml_build_forward_expand(gf, Vcur);
+
Qcur = ggml_rope_custom(
ctx0, ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head, n_tokens), inp_pos,
hparams.n_rot, 0, 0, n_orig_ctx, freq_base, freq_scale,
}
};
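
The ggml_build_forward_expand calls added above pin Qcur, Kcur and Vcur into the graph before the rope ops, so their nodes stay adjacent and ggml_backend_sched produces fewer splits. A standalone sketch of that pattern, with trivial ops standing in for the Q/K/V projections:

#include "ggml.h"

static void expand_together_sketch(void) {
    struct ggml_init_params params = { 16*1024*1024, NULL, false };
    struct ggml_context * ctx = ggml_init(params);

    struct ggml_tensor * x = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 8);
    struct ggml_tensor * a = ggml_sqr(ctx, x); // stand-in for the Q projection
    struct ggml_tensor * b = ggml_neg(ctx, x); // stand-in for the K projection

    struct ggml_cgraph * gf = ggml_new_graph(ctx);

    // expanding a and b back to back keeps their nodes contiguous in the graph;
    // otherwise b would only be added once a later op that consumes it is expanded
    ggml_build_forward_expand(gf, a);
    ggml_build_forward_expand(gf, b);

    ggml_free(ctx);
}
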
-//
-// tensor offloading helpers
-//
-// TODO: will be removed with backend v2
-
-enum llm_offload_func_e {
- OFFLOAD_FUNC_NOP,
- OFFLOAD_FUNC,
- OFFLOAD_FUNC_FRC, // force offload
- OFFLOAD_FUNC_KQV,
- OFFLOAD_FUNC_NR,
- OFFLOAD_FUNC_EMB, // embeddings
- OFFLOAD_FUNC_OUT,
-};
-
-// TODO: will be removed with backend v2
-struct llm_offload_trie {
- struct node {
- ~node() {
- for (int i = 0; i < 256; ++i) {
- if (children[i]) {
- delete children[i];
- }
- }
- }
-
- node * children[256] = { nullptr };
- llm_offload_func_e func = OFFLOAD_FUNC_NOP;
- };
-
- llm_offload_trie() {
- root = new node;
- }
-
- llm_offload_trie(const std::unordered_map<const char *, llm_offload_func_e> & map) {
- root = new node;
-
- for (const auto & kv : map) {
- add(kv.first, kv.second);
- }
- }
-
- ~llm_offload_trie() {
- delete root;
- }
-
- void add(const char * name, llm_offload_func_e func) {
- node * cur = root;
-
- for (int i = 0; ; ++i) {
- const uint8_t c = name[i];
-
- if (!c) {
- break;
- }
-
- if (!cur->children[c]) {
- cur->children[c] = new node;
- }
-
- cur = cur->children[c];
- }
-
- cur->func = func;
- }
-
- llm_offload_func_e find(const char * name) const {
- const node * cur = root;
-
- for (int i = 0; ; ++i) {
- const uint8_t c = name[i];
-
- if (!c) {
- break;
- }
-
- if (!cur->children[c]) {
- return OFFLOAD_FUNC_NOP;
- }
-
- cur = cur->children[c];
- }
-
- return cur->func;
- }
-
- node * root = nullptr;
-};
-
-// TODO: will be removed with backend v2
-static const std::unordered_map<const char *, llm_offload_func_e> k_offload_map = {
- //{ "inp_tokens", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
- //{ "inp_embd", OFFLOAD_FUNC_NR }, // TODO: missing K-quants get_rows kernel
- { "pos_embd", OFFLOAD_FUNC_NR },
-
- { "inp_pos", OFFLOAD_FUNC_FRC }, // this is often used for KQ ops (e.g. rope)
- { "KQ_mask", OFFLOAD_FUNC_FRC },
- { "K_shift", OFFLOAD_FUNC_FRC },
-
- { "K_shifted", OFFLOAD_FUNC },
-
- { "inp_norm", OFFLOAD_FUNC_NR },
- { "inp_norm_w", OFFLOAD_FUNC_NR },
- { "inp_norm_wb", OFFLOAD_FUNC_NR },
-
- { "norm", OFFLOAD_FUNC },
- { "norm_w", OFFLOAD_FUNC },
- { "norm_wb", OFFLOAD_FUNC },
-
- { "attn_norm", OFFLOAD_FUNC },
- { "attn_norm_2", OFFLOAD_FUNC },
-
- { "wqkv", OFFLOAD_FUNC_KQV },
- { "bqkv", OFFLOAD_FUNC_KQV },
- { "wqkv_clamped", OFFLOAD_FUNC_KQV },
-
- { "tmpk", OFFLOAD_FUNC_KQV },
- { "tmpq", OFFLOAD_FUNC_KQV },
- { "tmpv", OFFLOAD_FUNC_KQV },
- { "Kcur", OFFLOAD_FUNC_KQV },
- { "Qcur", OFFLOAD_FUNC_KQV },
- { "Vcur", OFFLOAD_FUNC_KQV },
-
- { "krot", OFFLOAD_FUNC_KQV },
- { "qrot", OFFLOAD_FUNC_KQV },
- { "kpass", OFFLOAD_FUNC_KQV },
- { "qpass", OFFLOAD_FUNC_KQV },
- { "krotated", OFFLOAD_FUNC_KQV },
- { "qrotated", OFFLOAD_FUNC_KQV },
-
- { "q", OFFLOAD_FUNC_KQV },
- { "k", OFFLOAD_FUNC_KQV },
- { "kq", OFFLOAD_FUNC_KQV },
- { "kq_scaled", OFFLOAD_FUNC_KQV },
- { "kq_scaled_alibi", OFFLOAD_FUNC_KQV },
- { "kq_masked", OFFLOAD_FUNC_KQV },
- { "kq_soft_max", OFFLOAD_FUNC_KQV },
- { "kq_soft_max_ext", OFFLOAD_FUNC_KQV },
- { "v", OFFLOAD_FUNC_KQV },
- { "kqv", OFFLOAD_FUNC_KQV },
- { "kqv_merged", OFFLOAD_FUNC_KQV },
- { "kqv_merged_cont", OFFLOAD_FUNC_KQV },
- { "kqv_wo", OFFLOAD_FUNC_KQV },
- { "kqv_out", OFFLOAD_FUNC_KQV },
-
- { "ffn_inp", OFFLOAD_FUNC },
- { "ffn_norm", OFFLOAD_FUNC },
-
- { "ffn_up", OFFLOAD_FUNC },
- { "ffn_up_b", OFFLOAD_FUNC },
- { "ffn_gate", OFFLOAD_FUNC },
- { "ffn_gate_b", OFFLOAD_FUNC },
- { "ffn_gate_par", OFFLOAD_FUNC },
- { "ffn_act", OFFLOAD_FUNC },
- { "ffn_down", OFFLOAD_FUNC },
- { "ffn_down_b", OFFLOAD_FUNC },
- { "ffn_out", OFFLOAD_FUNC },
-
- { "ffn_silu", OFFLOAD_FUNC },
- { "ffn_gelu", OFFLOAD_FUNC },
- { "ffn_relu", OFFLOAD_FUNC },
- { "ffn_sqr(relu)", OFFLOAD_FUNC },
-
- { "ffn_moe_logits", OFFLOAD_FUNC },
- { "ffn_moe_probs", OFFLOAD_FUNC },
- { "ffn_moe_argsort", OFFLOAD_FUNC },
- { "ffn_moe_weights", OFFLOAD_FUNC },
- { "ffn_moe_weights_sum", OFFLOAD_FUNC },
- { "ffn_moe_weights_norm", OFFLOAD_FUNC },
- { "ffn_moe_weighted", OFFLOAD_FUNC },
- { "ffn_moe_up", OFFLOAD_FUNC },
- { "ffn_moe_gate", OFFLOAD_FUNC },
- { "ffn_moe_silu", OFFLOAD_FUNC },
- { "ffn_moe_gate_par", OFFLOAD_FUNC },
- { "ffn_moe_down", OFFLOAD_FUNC },
- { "ffn_moe_out", OFFLOAD_FUNC },
-
- { "l_out", OFFLOAD_FUNC },
-
- { "result_norm", OFFLOAD_FUNC_EMB },
- { "result_output_no_bias", OFFLOAD_FUNC_EMB },
- { "result_output", OFFLOAD_FUNC_OUT },
-};
-
-static llm_offload_trie k_offload_func_trie(k_offload_map);
-
static struct ggml_cgraph * llama_build_graph(
llama_context & lctx,
const llama_batch & batch) {
const auto & model = lctx.model;
// check if we should build the worst-case graph (for memory measurement)
- const bool worst_case = ggml_allocr_is_measure(lctx.alloc);
+ const bool worst_case = ggml_tallocr_is_measure(lctx.alloc);
// keep track of the input that has already been allocated
bool alloc_inp_tokens = false;
bool alloc_inp_KQ_mask = false;
bool alloc_inp_K_shift = false;
-#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- const bool do_offload = true;
-#else
- const bool do_offload = true; // TODO: set to false after finishing refactoring
-#endif
-
- int n_non_view = 0; // number of non-view tensors that have been processed by the callback
-
// this callback allows us to apply custom logic to each tensor (e.g. ggml-alloc, offloading, etc.)
- // TODO: will be removed with backend v2
+ // TODO: improve handling of input and output tensors, then replace this with ggml_set_name
llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
if (il >= 0) {
ggml_format_name(cur, "%s-%d", name, il);
//
// allocate input tensors and set input data
//
- // TODO: will be removed with backend v2
if (!alloc_inp_tokens && strcmp(name, "inp_tokens") == 0) {
- ggml_allocr_alloc(lctx.alloc, cur);
+ ggml_tallocr_alloc(lctx.alloc, cur);
- if (!ggml_allocr_is_measure(lctx.alloc) && batch.token) {
+ if (!ggml_tallocr_is_measure(lctx.alloc) && batch.token) {
const int64_t n_tokens = cur->ne[0];
ggml_backend_tensor_set(cur, batch.token, 0, n_tokens*ggml_element_size(cur));
alloc_inp_tokens = true;
}
- if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0) {
- ggml_allocr_alloc(lctx.alloc, cur);
+ if (!alloc_inp_embd && strcmp(name, "inp_embd") == 0 && batch.embd) {
+ ggml_tallocr_alloc(lctx.alloc, cur);
- if (!ggml_allocr_is_measure(lctx.alloc) && batch.embd) {
+ if (!ggml_tallocr_is_measure(lctx.alloc) && batch.embd) {
const int64_t n_embd = cur->ne[0];
const int64_t n_tokens = cur->ne[1];
}
if (!alloc_inp_pos && strcmp(name, "inp_pos") == 0) {
- ggml_allocr_alloc(lctx.alloc, cur);
+ ggml_tallocr_alloc(lctx.alloc, cur);
- if (!ggml_allocr_is_measure(lctx.alloc) && batch.pos) {
+ if (!ggml_tallocr_is_measure(lctx.alloc) && batch.pos) {
const int64_t n_tokens = cur->ne[0];
static_assert(std::is_same<llama_pos, int32_t>::value, "llama_pos must be int32_t");
}
if (!alloc_inp_KQ_mask && strcmp(name, "KQ_mask") == 0) {
- ggml_allocr_alloc(lctx.alloc, cur);
+ ggml_tallocr_alloc(lctx.alloc, cur);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
+ if (!ggml_tallocr_is_measure(lctx.alloc)) {
const int64_t n_kv = cur->ne[0];
const int64_t n_tokens = cur->ne[1];
}
if (!alloc_inp_K_shift && strcmp(name, "K_shift") == 0) {
- ggml_allocr_alloc(lctx.alloc, cur);
+ ggml_tallocr_alloc(lctx.alloc, cur);
- if (!ggml_allocr_is_measure(lctx.alloc)) {
+ if (!ggml_tallocr_is_measure(lctx.alloc)) {
const int64_t n_ctx = cur->ne[0];
int32_t * data;
alloc_inp_K_shift = true;
}
-
- // view tensors are not processed further
- if (cur->view_src != nullptr) {
- return;
- }
-
- if (cur->op != GGML_OP_NONE) {
- n_non_view++;
- }
-
- //
- // offload layers
- //
- // TODO: will be removed with backend v2
-
-//#define LLAMA_OFFLOAD_DEBUG
-
- if (!do_offload) {
- return;
- }
-
- const int n_layer = model.hparams.n_layer;
-
- const int n_gpu_layers = model.n_gpu_layers;
- const int i_gpu_start = n_layer - n_gpu_layers;
-
- // should we offload the final norm? yes if we are not computing embeddings
- const bool offload_emb = lctx.embedding.empty();
-
- static const std::unordered_map<llm_offload_func_e, std::string, std::hash<int>> k_offload_func_name = {
- { OFFLOAD_FUNC_NOP, "CPU" },
- { OFFLOAD_FUNC_OUT, "CPU" },
-#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- { OFFLOAD_FUNC, "GPU (CUDA)" },
- { OFFLOAD_FUNC_FRC, "GPU (CUDA) FRC" },
- { OFFLOAD_FUNC_KQV, "GPU (CUDA) KQV" },
- { OFFLOAD_FUNC_NR, "GPU (CUDA) NR" },
- { OFFLOAD_FUNC_EMB, "GPU (CUDA) EMB" },
-#else
- { OFFLOAD_FUNC, "CPU" },
- { OFFLOAD_FUNC_FRC, "CPU" },
- { OFFLOAD_FUNC_KQV, "CPU" },
- { OFFLOAD_FUNC_NR, "CPU" },
- { OFFLOAD_FUNC_EMB, "CPU" },
-#endif // GGML_USE_CUBLAS
- };
-
- // check the global map for what offload function to use for this tensor
- llm_offload_func_e func_e = k_offload_func_trie.find(name);
-
- if (func_e == OFFLOAD_FUNC_NOP) {
-#ifdef LLAMA_OFFLOAD_DEBUG
- // if a tensor hasn't been offloaded, we warn the user
- if (worst_case) {
- LLAMA_LOG_WARN("%s: %32s: not offloaded (ref: %s)\n", __func__,
- cur->name, "https://github.com/ggerganov/llama.cpp/pull/3837");
- }
-#endif
-
- return;
- }
-
- // count the number of layers and respect the provided n_gpu_layers
- switch (func_e) {
- case OFFLOAD_FUNC_NOP:
- case OFFLOAD_FUNC_OUT:
- break;
- case OFFLOAD_FUNC:
- if (n_gpu_layers < n_layer) {
- if (il < i_gpu_start) {
- func_e = OFFLOAD_FUNC_NOP;
- }
- }
- break;
- case OFFLOAD_FUNC_FRC:
- if (!lctx.cparams.offload_kqv) {
- func_e = OFFLOAD_FUNC_NOP;
- } break;
- case OFFLOAD_FUNC_KQV:
- if (!lctx.cparams.offload_kqv) {
- func_e = OFFLOAD_FUNC_NOP;
- } else {
- if (n_gpu_layers < n_layer) {
- if (il < i_gpu_start) {
- func_e = OFFLOAD_FUNC_NOP;
- }
- }
- }
- break;
- case OFFLOAD_FUNC_NR:
- if (n_gpu_layers <= n_layer + 0) {
- func_e = OFFLOAD_FUNC_NOP;
- }
- break;
- case OFFLOAD_FUNC_EMB:
- if (!offload_emb || n_gpu_layers < n_layer) {
- func_e = OFFLOAD_FUNC_NOP;
- }
- break;
- default: GGML_ASSERT(false);
- }
-
- offload_func_t func = ggml_offload_nop;
-
- // this is needed for compatibility with Metal for example
-#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- static offload_func_t ggml_offload_gpu = ggml_cuda_assign_buffers_no_alloc;
-#else
- static offload_func_t ggml_offload_gpu = ggml_offload_nop;
-#endif
-
- switch (func_e) {
- case OFFLOAD_FUNC_NOP:
- case OFFLOAD_FUNC_OUT: func = ggml_offload_nop; break;
- case OFFLOAD_FUNC:
- case OFFLOAD_FUNC_KQV:
- case OFFLOAD_FUNC_FRC:
- case OFFLOAD_FUNC_NR:
- case OFFLOAD_FUNC_EMB: func = ggml_offload_gpu; break;
- default: GGML_ASSERT(false);
- }
-
- // apply offload function to the tensor
- func(cur);
-
-#ifdef LLAMA_OFFLOAD_DEBUG
- if (worst_case) {
- LLAMA_LOG_INFO("%s: %32s: %s\n", __func__, cur->name, k_offload_func_name.at(func_e).c_str());
- }
-#endif
};
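// a minimal sketch of what this callback could reduce to once input/output handling is
// moved out of graph construction, per the TODO above (illustrative only):
#if 0
    llm_build_cb cb = [&](struct ggml_tensor * cur, const char * name, int il) {
        if (il >= 0) {
            ggml_format_name(cur, "%s-%d", name, il);
        } else {
            ggml_set_name(cur, name);
        }
    };
#endif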
struct ggml_cgraph * result = NULL;
llm.free();
- if (worst_case) {
- int n_non_view_total = 0;
-
- for (int i = 0; i < result->n_nodes; ++i) {
- if (result->nodes[i]->view_src == nullptr) {
- n_non_view_total++;
- }
- }
-
- LLAMA_LOG_INFO("%s: non-view tensors processed: %d/%d\n", __func__, n_non_view, n_non_view_total);
-
- if (n_non_view != n_non_view_total) {
- LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__);
- LLAMA_LOG_WARN("%s: not all non-view tensors have been processed with a callback\n", __func__);
- LLAMA_LOG_WARN("%s: this can indicate an inefficiency in the graph implementation\n", __func__);
- LLAMA_LOG_WARN("%s: build with LLAMA_OFFLOAD_DEBUG for more info\n", __func__);
- LLAMA_LOG_WARN("%s: ref: https://github.com/ggerganov/llama.cpp/pull/3837\n", __func__);
- LLAMA_LOG_WARN("%s: ****************************************************************\n", __func__);
- }
- }
-
return result;
}
auto & kv_self = lctx.kv_self;
- GGML_ASSERT(!!kv_self.ctx);
-
const int64_t n_embd = hparams.n_embd;
const int64_t n_vocab = hparams.n_vocab;
//printf("kv_self.n = %5d, kv_self.used = %5d, kv_self.head = %5d\n", kv_self.n, kv_self.used, kv_self.head);
- ggml_allocr_reset(lctx.alloc);
+ ggml_backend_sched_reset(lctx.sched);
ggml_cgraph * gf = llama_build_graph(lctx, batch);
- ggml_allocr_alloc_graph(lctx.alloc, gf);
-
// the output is always the last tensor in the graph
struct ggml_tensor * res = gf->nodes[gf->n_nodes - 1];
GGML_ASSERT(strcmp(res->name, "result_output") == 0);
GGML_ASSERT(strcmp(embeddings->name, "result_norm") == 0);
}
-#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- char * buf_alloc_base = (char *)ggml_backend_buffer_get_base(lctx.buf_alloc);
- for (int i = 0; i < gf->n_leafs; i++) {
- ggml_tensor * node = gf->leafs[i];
- if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
- ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
- ggml_cuda_copy_to_device(node);
- }
- }
-
- for (int i = 0; i < gf->n_nodes; i++) {
- ggml_tensor * node = gf->nodes[i];
- if (node->backend == GGML_BACKEND_GPU && node->extra == NULL) {
- ggml_cuda_assign_scratch_offset(node, (char *)node->data - buf_alloc_base);
- }
- }
-
- // HACK: ggml-alloc may change the tensor backend when reusing a parent, so force output to be on the CPU here if needed
- if (!lctx.embedding.empty()) {
- embeddings->backend = GGML_BACKEND_CPU;
- }
- res->backend = GGML_BACKEND_CPU;
-#endif
-
// LLAMA_LOG_INFO("graph build time: %.3f ms (%d nodes, %d leafs)\n", (ggml_time_us() - t_start_us)/1000.0, gf->n_nodes, gf->n_leafs);
// for big prompts, if BLAS is enabled, it is better to use only one thread
#endif
#ifdef GGML_USE_METAL
- if (ggml_backend_is_metal(lctx.backend)) {
- ggml_backend_metal_set_n_cb(lctx.backend, n_threads);
+ if (ggml_backend_is_metal(lctx.backend_metal)) {
+ ggml_backend_metal_set_n_cb(lctx.backend_metal, n_threads);
}
#endif
- if (ggml_backend_is_cpu(lctx.backend)) {
- ggml_backend_cpu_set_n_threads(lctx.backend, n_threads);
+ if (lctx.backend_cpu != nullptr) {
+ ggml_backend_cpu_set_n_threads(lctx.backend_cpu, n_threads);
}
- ggml_backend_graph_compute(lctx.backend, gf);
+ ggml_backend_sched_graph_compute(lctx.sched, gf);
+
+ // fprintf(stderr, "splits: %d\n", ggml_backend_sched_get_n_splits(lctx.sched));
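// a small debugging sketch (illustrative): dump the backend assigned to each node to see
// where the splits come from
#if 0
    for (int i = 0; i < gf->n_nodes; ++i) {
        ggml_backend_t node_backend = ggml_backend_sched_get_node_backend(lctx.sched, gf->nodes[i]);
        fprintf(stderr, "%-40s -> %s\n", gf->nodes[i]->name, node_backend ? ggml_backend_name(node_backend) : "?");
    }
#endif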
#ifdef GGML_USE_MPI
ggml_mpi_graph_compute_post(lctx.ctx_mpi, gf, n_layer);
logits_out.clear();
#endif
+ ggml_backend_t res_backend = ggml_backend_sched_get_node_backend(lctx.sched, res);
+ GGML_ASSERT(res_backend != nullptr);
if (batch.logits) {
logits_out.resize(n_vocab * n_tokens);
for (uint32_t i = 0; i < n_tokens; i++) {
if (batch.logits[i] == 0) {
continue;
}
- ggml_backend_tensor_get(res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
+ ggml_backend_tensor_get_async(res_backend, res, logits_out.data() + (n_vocab*i), (n_vocab*i)*sizeof(float), n_vocab*sizeof(float));
#ifndef NDEBUG
logits_valid[i] = true;
#endif
}
} else if (lctx.logits_all) {
logits_out.resize(n_vocab * n_tokens);
- ggml_backend_tensor_get(res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
+ ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), 0, n_vocab*n_tokens*sizeof(float));
#ifndef NDEBUG
std::fill(logits_valid.begin(), logits_valid.end(), true);
#endif
} else {
logits_out.resize(n_vocab);
- ggml_backend_tensor_get(res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
+ ggml_backend_tensor_get_async(res_backend, res, logits_out.data(), (n_vocab*(n_tokens - 1))*sizeof(float), n_vocab*sizeof(float));
#ifndef NDEBUG
logits_valid[0] = true;
#endif
}
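// the reads above may be asynchronous on backends that support it, so wait for them to
// complete before the logits buffer is consumed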
+ ggml_backend_synchronize(res_backend);
}
// extract embeddings
auto & embedding_out = lctx.embedding;
embedding_out.resize(n_embd);
- ggml_backend_tensor_get(embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
+ ggml_backend_t embeddings_backend = ggml_backend_sched_get_node_backend(lctx.sched, embeddings);
+ ggml_backend_tensor_get_async(embeddings_backend, embeddings, embedding_out.data(), (n_embd*(n_tokens - 1))*sizeof(float), n_embd*sizeof(float));
+ ggml_backend_synchronize(embeddings_backend);
}
// measure the performance only for the single-token evals
LLAMA_LOG_INFO("%s: r = %d, alpha = %d, scaling = %.2f\n", __func__, lora_r, lora_alpha, scaling);
- // create a name -> tensor map of the model to accelerate lookups
- // find the max tensor size to estimate the required temporary buffer size
- size_t max_tensor_size = 0;
- std::unordered_map<std::string, struct ggml_tensor*> model_tensors;
- for (const auto & kv : model.tensors_by_name) {
- model_tensors.insert(kv);
- size_t f32_size = ggml_nelements(kv.second) * sizeof(float);
- max_tensor_size = std::max(max_tensor_size, f32_size);
- }
-
- // create a temporary ggml context to store the lora tensors
- // TODO: use ggml-alloc
- size_t lora_ctx_size = max_tensor_size * 3;
- LLAMA_LOG_INFO("%s: allocating %.f MB for lora temporary buffer\n", __func__, lora_ctx_size / 1024.0 / 1024.0);
- std::vector<uint8_t> lora_buf(lora_ctx_size);
-
- struct ggml_init_params params;
- params.mem_size = lora_buf.size();
- params.mem_buffer = lora_buf.data();
- params.no_alloc = false;
-
- using unique_context = std::unique_ptr<ggml_context, decltype(&ggml_free)>;
-
- unique_context lora_ctx(nullptr, ggml_free);
- lora_ctx.reset(ggml_init(params));
- std::unordered_map<std::string, struct ggml_tensor *> lora_tensors;
-
// load base model
std::unique_ptr<llama_model_loader> ml;
-
- if (path_base_model) {
+ if (path_base_model) {
LLAMA_LOG_INFO("%s: loading base model from '%s'\n", __func__, path_base_model);
ml.reset(new llama_model_loader(path_base_model, /*use_mmap*/ true, /*kv_overrides*/ nullptr));
- ml->init_mapping(false); // no prefetching
+ ml->init_mapping(/*prefetch*/ false); // no prefetching
}
- // read tensors and apply
- bool warned = false;
- int n_tensors = 0;
-
- std::vector<uint8_t> work_buffer;
+ struct tensor_meta {
+ std::string name;
+ ggml_type type;
+ int32_t ne[2];
+ size_t offset;
+ };
+ std::map<std::string, tensor_meta> tensor_meta_map;
+ // load all tensor meta
while (true) {
if (fin.tell() == fin.size) {
// eof
fin.read_raw(&n_dims, sizeof(n_dims));
fin.read_raw(&name_len, sizeof(name_len));
- fin.read_raw(&ftype, sizeof(ftype));
+ fin.read_raw(&ftype, sizeof(ftype));
if (n_dims != 1 && n_dims != 2) {
LLAMA_LOG_ERROR("%s: unsupported tensor dimension %d\n", __func__, n_dims);
std::string name;
{
- GGML_ASSERT(name_len <= 1024);
- char buf[1024];
+ GGML_ASSERT(name_len < GGML_MAX_NAME);
+ char buf[GGML_MAX_NAME];
fin.read_raw(buf, name_len);
name = std::string(buf, name_len);
}
- // check for lora suffix and get the type of tensor
- const std::string lora_suffix = ".lora";
- size_t pos = name.rfind(lora_suffix);
- if (pos == std::string::npos) {
- LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
- return 1;
+ // check for lora suffix
+ std::string lora_suffix;
+ if (name.length() > 6) {
+ lora_suffix = name.substr(name.length() - 6);
}
-
- std::string lora_type = name.substr(pos + lora_suffix.length());
- std::string base_name = name;
- base_name.erase(pos);
- // LLAMA_LOG_INFO("%s: %s => %s (lora type %s) \n", __func__, name.c_str(), base_name.c_str(), lora_type.c_str());
-
- if (model_tensors.find(base_name) == model_tensors.end()) {
- LLAMA_LOG_ERROR("%s: unknown tensor '%s' in lora adapter\n", __func__, name.data());
+ if (lora_suffix != ".loraA" && lora_suffix != ".loraB") {
+ LLAMA_LOG_ERROR("%s: error: '%s' is not a lora tensor\n", __func__, name.c_str());
return 1;
}
- // create ggml tensor
+ // tensor type
ggml_type wtype;
switch (ftype) {
case 0: wtype = GGML_TYPE_F32; break;
return false;
}
}
- ggml_tensor * lora_tensor = ggml_new_tensor_2d(lora_ctx.get(), wtype, ne[0], ne[1]);
- ggml_set_name(lora_tensor, name.c_str());
- // load tensor data
+ // data offset
size_t offset = fin.tell();
- size_t tensor_data_size = ggml_nbytes(lora_tensor);
offset = (offset + 31) & -32;
- fin.seek(offset, SEEK_SET);
- fin.read_raw(lora_tensor->data, tensor_data_size);
- lora_tensors[name] = lora_tensor;
+ // skip tensor data
+ fin.seek(offset + ggml_row_size(wtype, ne[0]) * ne[1], SEEK_SET);
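// (illustrative: with the 32-byte alignment above, a header ending at offset 100 places the
//  tensor data at (100 + 31) & -32 == 128; ggml_row_size(wtype, ne[0]) * ne[1] is the total
//  byte size of the tensor being skipped)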
+
+ tensor_meta_map.emplace(name, tensor_meta{ name, wtype, { ne[0], ne[1] }, offset });
+ }
- // check if we have both A and B tensors and apply
- if (lora_tensors.find(base_name + ".loraA") != lora_tensors.end() &&
- lora_tensors.find(base_name + ".loraB") != lora_tensors.end()) {
+ bool warned = false;
+ int n_tensors = 0;
- ggml_tensor * dest_t = model_tensors[base_name];
+ // apply
+ ggml_backend_t backend_cpu = ggml_backend_cpu_init();
+ if (backend_cpu == nullptr) {
+ LLAMA_LOG_ERROR("%s: error: failed to initialize cpu backend\n", __func__);
+ return 1;
+ }
+ ggml_backend_cpu_set_n_threads(backend_cpu, n_threads);
- offload_func_t offload_func = ggml_offload_nop;
- offload_func_t offload_func_force_inplace = ggml_offload_nop;
+ std::vector<no_init<uint8_t>> read_buf;
+ for (const auto & it : model.tensors_by_name) {
+ const std::string & base_name = it.first;
+ ggml_tensor * model_t = it.second;
-#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- if (dest_t->backend == GGML_BACKEND_GPU || dest_t->backend == GGML_BACKEND_GPU_SPLIT) {
- if (dest_t->type != GGML_TYPE_F16) {
- throw std::runtime_error(format(
- "%s: error: the simultaneous use of LoRAs and GPU acceleration is only supported for f16 models. dest_t->type: %d", __func__, dest_t->type));
- }
- offload_func = ggml_cuda_assign_buffers;
- offload_func_force_inplace = ggml_cuda_assign_buffers_force_inplace;
- }
-#endif // GGML_USE_CUBLAS
+ if (tensor_meta_map.find(base_name + ".loraA") == tensor_meta_map.end() ||
+ tensor_meta_map.find(base_name + ".loraB") == tensor_meta_map.end()) {
+ continue;
+ }
- ggml_tensor * base_t;
- if (ml) {
- struct gguf_context * ctx_gguf = ml->ctx_gguf;
+ tensor_meta & metaA = tensor_meta_map.at(base_name + ".loraA");
+ tensor_meta & metaB = tensor_meta_map.at(base_name + ".loraB");
- // load from base model
- if (gguf_find_tensor(ctx_gguf, base_name.c_str()) < 0) {
- LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
- return 1;
- }
+ ggml_init_params lora_init_params = {
+ /* .mem_size */ ggml_tensor_overhead()*128 + ggml_graph_overhead(),
+ /* .mem_buffer */ nullptr,
+ /* .no_alloc */ true,
+ };
+ ggml_context * lora_ctx = ggml_init(lora_init_params);
+ if (lora_ctx == nullptr) {
+ LLAMA_LOG_ERROR("%s: error: failed to initialize lora context\n", __func__);
+ ggml_backend_free(backend_cpu);
+ return 1;
+ }
- base_t = ml->get_tensor_meta(base_name.c_str());
- ml->load_data_for(base_t);
- } else {
- base_t = dest_t;
- }
+ // create tensors
+ ggml_tensor * loraA = ggml_new_tensor_2d(lora_ctx, metaA.type, metaA.ne[0], metaA.ne[1]);
+ ggml_tensor * loraB = ggml_new_tensor_2d(lora_ctx, metaB.type, metaB.ne[0], metaB.ne[1]);
+ ggml_set_name(loraA, metaA.name.c_str());
+ ggml_set_name(loraB, metaB.name.c_str());
- if (ggml_is_quantized(base_t->type)) {
- if (!warned) {
- LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
- "use a f16 or f32 base model with --lora-base\n", __func__);
- warned = true;
- }
+ ggml_tensor * base_t;
+ if (ml) {
+ if (gguf_find_tensor(ml->ctx_gguf, base_name.c_str()) < 0) {
+ LLAMA_LOG_ERROR("%s: error: tensor '%s' not found in base model\n", __func__, base_name.c_str());
+ return 1;
}
+ base_t = ggml_dup_tensor(lora_ctx, ml->get_tensor_meta(base_name.c_str()));
+ } else {
+ base_t = ggml_dup_tensor(lora_ctx, model_t);
+ }
+ ggml_set_name(base_t, base_name.c_str());
- ggml_tensor * loraA = lora_tensors[base_name + ".loraA"];
- GGML_ASSERT(loraA->type == GGML_TYPE_F32);
- ggml_set_name(loraA, "loraA");
+ // allocate in backend buffer
+ ggml_backend_buffer_t lora_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
+ if (lora_buf == nullptr) {
+ LLAMA_LOG_ERROR("%s: error: failed to allocate lora tensors\n", __func__);
+ return 1;
+ }
- ggml_tensor * loraB = lora_tensors[base_name + ".loraB"];
- GGML_ASSERT(loraB->type == GGML_TYPE_F32);
- ggml_set_name(loraB, "loraB");
+ // load tensor data
+ auto load_tensor = [&read_buf, &fin](const tensor_meta & tensor_meta, ggml_tensor * tensor) {
+ read_buf.resize(ggml_nbytes(tensor));
+ fin.seek(tensor_meta.offset, SEEK_SET);
+ fin.read_raw(read_buf.data(), ggml_nbytes(tensor));
+ ggml_backend_tensor_set(tensor, read_buf.data(), 0, read_buf.size());
+ };
+ load_tensor(metaA, loraA);
+ load_tensor(metaB, loraB);
- if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
- LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
- " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
- return 1;
- }
+ // load base model tensor data
+ if (ml) {
+ ml->load_data_for(base_t);
+ } else {
+ ggml_backend_tensor_copy(model_t, base_t);
+ }
+
+ if (ggml_is_quantized(base_t->type) && !warned) {
+ LLAMA_LOG_WARN("%s: warning: using a lora adapter with a quantized model may result in poor quality, "
+ "use a f16 or f32 base model with --lora-base\n", __func__);
+ warned = true;
+ }
+
+ if (base_t->ne[0] != loraA->ne[1] || base_t->ne[1] != loraB->ne[1]) {
+ LLAMA_LOG_ERROR("%s: incompatible tensor dimensions (%" PRId64 " and %" PRId64 ");"
+ " are you sure that this adapter is for this model?\n", __func__, base_t->ne[0], loraA->ne[1]);
+ ggml_free(lora_ctx);
+ ggml_backend_buffer_free(lora_buf);
+ ggml_backend_free(backend_cpu);
+ return 1;
+ }
+ auto build_lora_graph = [&]() {
// w = w + BA*s
- ggml_tensor * BA = ggml_mul_mat(lora_ctx.get(), loraA, loraB);
- offload_func(BA);
+ ggml_tensor * BA = ggml_mul_mat(lora_ctx, loraA, loraB);
ggml_set_name(BA, "BA");
if (scaling != 1.0f) {
- BA = ggml_scale_inplace(lora_ctx.get(), BA, scaling);
- offload_func(BA);
+ BA = ggml_scale(lora_ctx, BA, scaling);
ggml_set_name(BA, "BA_scaled");
}
ggml_tensor * r;
- if (base_t == dest_t) {
- r = ggml_add_inplace(lora_ctx.get(), dest_t, BA);
- offload_func_force_inplace(r);
- ggml_set_name(r, "r_add_inplace");
- }
- else {
- r = ggml_add(lora_ctx.get(), base_t, BA);
- offload_func(r);
- ggml_set_name(r, "r_add");
+ r = ggml_add_inplace(lora_ctx, base_t, BA);
+ ggml_set_name(r, "r_add");
- r = ggml_cpy(lora_ctx.get(), r, dest_t);
- offload_func(r);
- ggml_set_name(r, "r_cpy");
+ if (base_t->type != model_t->type) {
+ // convert the result to the model type
+ r = ggml_cast(lora_ctx, r, model_t->type);
+ ggml_set_name(r, "r_cast");
}
- struct ggml_cgraph * gf = ggml_new_graph(lora_ctx.get());
- ggml_build_forward_expand(gf, r);
+ return r;
+ };
+
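// in math terms, build_lora_graph applies W' = W + s * (B A), where s is the scaling factor
// computed earlier from lora_alpha and lora_r; since ggml_mul_mat(ctx, a, b) yields a tensor
// with ne = { a->ne[1], b->ne[1] }, BA has the same shape as base_t, which is what the
// dimension check above verifies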
+ ggml_cgraph * gf = ggml_new_graph(lora_ctx);
+ ggml_tensor * r = build_lora_graph();
+ ggml_build_forward_expand(gf, r);
- ggml_graph_compute_helper(work_buffer, gf, n_threads);
+ ggml_backend_buffer_t graph_buf = ggml_backend_alloc_ctx_tensors_from_buft(lora_ctx, ggml_backend_cpu_buffer_type());
+ if (graph_buf == nullptr) {
+ LLAMA_LOG_ERROR("%s: error: failed to allocate graph tensors\n", __func__);
+ ggml_free(lora_ctx);
+ ggml_backend_buffer_free(lora_buf);
+ ggml_backend_free(backend_cpu);
+ return 1;
+ }
- // the tensors in the adapter must be sorted such that loraA and loraB of the same tensor are next to each other
- GGML_ASSERT(lora_tensors.size() == 2);
+ ggml_backend_graph_compute(backend_cpu, gf);
- // we won't need these tensors again, reset the context to save memory
- lora_ctx.reset(ggml_init(params));
- lora_tensors.clear();
+ ggml_backend_tensor_set(model_t, r->data, 0, ggml_nbytes(r));
- n_tensors++;
- if (n_tensors % 4 == 0) {
- LLAMA_LOG_INFO(".");
- }
+#if 0
+ // TODO: use scheduler with fallback to CPU for fewer copies between CPU and GPU
+ //ggml_backend_sched_t sched = ggml_backend_sched_new(backends.data(), backends.size(), GGML_DEFAULT_GRAPH_SIZE);
+
+ // sched compute
+ ggml_build_forward_expand(gf, build_lora_graph());
+ ggml_backend_sched_init_measure(sched, gf);
+
+ // create the graph again, since the previous one was destroyed by the measure
+ ggml_graph_clear(gf);
+ ggml_build_forward_expand(gf, build_lora_graph());
+ ggml_backend_sched_graph_compute(sched, gf);
+ ggml_backend_sched_free(sched);
+#endif
+
+ ggml_backend_buffer_free(lora_buf);
+ ggml_backend_buffer_free(graph_buf);
+ ggml_free(lora_ctx);
+
+ n_tensors++;
+ if (n_tensors % 4 == 0) {
+ LLAMA_LOG_INFO(".");
}
}
+ ggml_backend_free(backend_cpu);
+
const int64_t t_lora_us = ggml_time_us() - t_start_lora_us;
LLAMA_LOG_INFO(" done (%.2f ms)\n", t_lora_us / 1000.0);
struct llama_model_params llama_model_default_params() {
struct llama_model_params result = {
/*.n_gpu_layers =*/ 0,
+ /*.split_mode =*/ LLAMA_SPLIT_LAYER,
/*.main_gpu =*/ 0,
/*.tensor_split =*/ nullptr,
/*.progress_callback =*/ nullptr,
};
#ifdef GGML_USE_METAL
- result.n_gpu_layers = 1;
+ // note: we usually have plenty of VRAM, so by default offload all layers to the GPU
+ result.n_gpu_layers = 999;
#endif
return result;
GGML_ASSERT(hparams.n_embd_head_k % ggml_blck_size(type_k) == 0);
GGML_ASSERT(hparams.n_embd_head_v % ggml_blck_size(type_v) == 0);
- // reserve memory for context buffers
if (!hparams.vocab_only) {
- // initialize backend
+ // initialize backends
#ifdef GGML_USE_METAL
if (model->n_gpu_layers > 0) {
- ctx->backend = ggml_backend_metal_init();
- if (ctx->backend == nullptr) {
+ ctx->backend_metal = ggml_backend_metal_init();
+ if (ctx->backend_metal == nullptr) {
LLAMA_LOG_ERROR("%s: failed to initialize Metal backend\n", __func__);
+ llama_free(ctx);
+ return nullptr;
}
+ ctx->backends.push_back(ctx->backend_metal);
}
-#elif defined(GGML_USE_CUBLAS) && defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- // for testing only
+#elif defined(GGML_USE_CUBLAS)
if (model->n_gpu_layers > 0) {
- ctx->backend = ggml_backend_cuda_init(0);
- if (ctx->backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to initialize CUDA backend\n", __func__);
+ // with split_mode LLAMA_SPLIT_NONE or LLAMA_SPLIT_ROW, only the main GPU backend is used
+ if (model->split_mode == LLAMA_SPLIT_NONE || model->split_mode == LLAMA_SPLIT_ROW) {
+ ggml_backend_t backend = ggml_backend_cuda_init(model->main_gpu);
+ if (backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, model->main_gpu);
+ llama_free(ctx);
+ return nullptr;
+ }
+ ctx->backends.push_back(backend);
+ } else {
+ // LLAMA_SPLIT_LAYER requires a backend for each GPU
+ for (int device = 0; device < ggml_backend_cuda_get_device_count(); ++device) {
+ ggml_backend_t backend = ggml_backend_cuda_init(device);
+ if (backend == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize CUDA%d backend\n", __func__, device);
+ llama_free(ctx);
+ return nullptr;
+ }
+ ctx->backends.push_back(backend);
+ }
}
}
#endif
-
- if (ctx->backend == nullptr && ggml_backend_buffer_is_host(model->buf)) {
- ctx->backend = ggml_backend_cpu_init();
- if (ctx->backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
- }
- }
-
- if (ctx->backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to initialize a backend\n", __func__);
- delete ctx;
+ ctx->backend_cpu = ggml_backend_cpu_init();
+ if (ctx->backend_cpu == nullptr) {
+ LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);
+ llama_free(ctx);
return nullptr;
}
+ ctx->backends.push_back(ctx->backend_cpu);
- if (!llama_kv_cache_init(ctx->model.hparams, ctx->kv_self, type_k, type_v,
- cparams.n_ctx, model->n_gpu_layers, cparams.offload_kqv)) {
+ if (!llama_kv_cache_init(ctx->kv_self, ctx->model, type_k, type_v,
+ cparams.n_ctx, cparams.offload_kqv)) {
LLAMA_LOG_ERROR("%s: llama_kv_cache_init() failed for self-attention cache\n", __func__);
llama_free(ctx);
return nullptr;
}
{
- // the compute buffer is used to store the tensor and graph structs, while the allocator buffer is used for the tensor data
+ // buffer types used for the compute buffer of each backend
+ std::vector<ggml_backend_buffer_type_t> backend_buft;
+ for (auto * backend : ctx->backends) {
+ if (ggml_backend_is_cpu(backend)) {
+ // use host buffers for the CPU backend compute buffer
+ backend_buft.push_back(llama_default_buffer_type_cpu(true));
+ } else {
+ backend_buft.push_back(ggml_backend_get_default_buffer_type(backend));
+ }
+ }
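// (a pinned host buffer for the CPU compute buffer typically makes the CPU<->GPU copies
//  issued by the scheduler cheaper when a GPU backend is present)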
+
+ // buffer used to store the computation graph and the tensor meta data
ctx->buf_compute_meta.resize(ggml_tensor_overhead()*LLAMA_MAX_NODES + ggml_graph_overhead());
- // create measure allocator
- ctx->alloc = ggml_allocr_new_measure_from_backend(ctx->backend);
+ ctx->sched = ggml_backend_sched_new(ctx->backends.data(), backend_buft.data(), ctx->backends.size(), LLAMA_MAX_NODES);
+ ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
// build worst-case graph
int n_tokens = (int)std::min(cparams.n_ctx, cparams.n_batch);
llama_token token = llama_token_bos(&ctx->model); // not actually used by llama_build_graph, but required to choose between the token and embedding input graphs
ggml_cgraph * gf = llama_build_graph(*ctx, llama_batch_get_one(&token, n_tokens, n_past, 0));
- // measure memory requirements for the graph
- size_t alloc_size = ggml_allocr_alloc_graph(ctx->alloc, gf);
-
- LLAMA_LOG_INFO("%s: compute buffer total size = %.2f MiB\n", __func__, (ctx->buf_compute_meta.size() + alloc_size) / 1024.0 / 1024.0);
-
- // create allocator again with exact memory requirements
- ggml_allocr_free(ctx->alloc);
-
- ctx->buf_alloc = ggml_backend_alloc_buffer(ctx->backend, alloc_size);
- ctx->alloc = ggml_allocr_new_from_buffer(ctx->buf_alloc);
-#if defined(GGML_USE_CUBLAS) && !defined(LLAMA_GGML_BACKEND_CUDA_TEST)
- if (model->n_gpu_layers > 0) {
- // the CPU buffer adds this padding in case the malloc buffer is not aligned, so we need to do the same for the GPU buffer, since we use the same offsets
- ggml_cuda_set_scratch_size(alloc_size + 64);
- LLAMA_LOG_INFO("%s: VRAM scratch buffer: %.2f MiB\n", __func__, alloc_size / 1024.0 / 1024.0);
-
- // calculate total VRAM usage
- auto add_tensor = [](const ggml_tensor * t, size_t & size) {
- if (t->backend == GGML_BACKEND_GPU || t->backend == GGML_BACKEND_GPU_SPLIT) {
- size += ggml_nbytes(t);
- }
- };
- size_t model_vram_size = 0;
- for (const auto & kv : model->tensors_by_name) {
- add_tensor(kv.second, model_vram_size);
- }
-
- size_t kv_vram_size = 0;
- for (auto & k : ctx->kv_self.k_l) {
- add_tensor(k, kv_vram_size);
- }
- for (auto & v : ctx->kv_self.v_l) {
- add_tensor(v, kv_vram_size);
- }
-
- size_t ctx_vram_size = alloc_size + kv_vram_size;
- size_t total_vram_size = model_vram_size + ctx_vram_size;
+ // initialize scheduler with the worst-case graph
+ ggml_backend_sched_init_measure(ctx->sched, gf);
+ // note: the number of splits during measure is higher than during inference due to the kv shift
+ int n_splits = ggml_backend_sched_get_n_splits(ctx->sched);
+ LLAMA_LOG_INFO("%s: graph splits (measure): %d\n", __func__, n_splits);
+ ctx->alloc = ggml_backend_sched_get_tallocr(ctx->sched, ctx->backend_cpu);
- LLAMA_LOG_INFO("%s: total VRAM used: %.2f MiB (model: %.2f MiB, context: %.2f MiB)\n", __func__,
- total_vram_size / 1024.0 / 1024.0,
- model_vram_size / 1024.0 / 1024.0,
- ctx_vram_size / 1024.0 / 1024.0);
+ for (ggml_backend_t backend : ctx->backends) {
+ ggml_backend_buffer_t buf = ggml_backend_sched_get_buffer(ctx->sched, backend);
+ LLAMA_LOG_INFO("%s: %10s compute buffer size = %8.2f MiB\n", __func__,
+ ggml_backend_buffer_name(buf),
+ ggml_backend_buffer_get_size(buf) / 1024.0 / 1024.0);
}
-#endif
}
}
}
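// a condensed sketch of the scheduler lifecycle set up here (illustrative; `backends`,
// `bufts`, `n_backends` and `build_graph` stand in for the corresponding pieces above):
//
//   ggml_backend_sched_t sched = ggml_backend_sched_new(backends, bufts, n_backends, LLAMA_MAX_NODES);
//   ggml_backend_sched_init_measure(sched, build_graph(worst_case_batch)); // reserves compute buffers once
//
//   // for every decode call:
//   ggml_backend_sched_reset(sched);
//   ggml_backend_sched_graph_compute(sched, build_graph(batch));
//
//   ggml_backend_sched_free(sched); // on llama_free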
int32_t llama_model_desc(const struct llama_model * model, char * buf, size_t buf_size) {
- return snprintf(buf, buf_size, "%s %s%s %s",
+ return snprintf(buf, buf_size, "%s %s %s",
llama_model_arch_name(model->arch).c_str(),
- model->hparams.n_expert > 0 ? (std::to_string(model->hparams.n_expert) + "x").c_str() : "",
llama_model_type_name(model->type),
llama_model_ftype_name(model->ftype).c_str());
}
}
struct ggml_tensor * llama_get_model_tensor(struct llama_model * model, const char * name) {
- return ggml_get_tensor(model->ctx, name);
+ auto it = std::find_if(model->tensors_by_name.begin(), model->tensors_by_name.end(),
+ [name](const std::pair<std::string, struct ggml_tensor *> & it) {
+ return it.first == name;
+ });
+ if (it == model->tensors_by_name.end()) {
+ return nullptr;
+ }
+ return it->second;
}
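// usage sketch (illustrative; the name follows the GGUF tensor naming used by llama.cpp):
//   struct ggml_tensor * tok_embd = llama_get_model_tensor(model, "token_embd.weight");
//   if (tok_embd == nullptr) { /* the model does not contain a tensor with that name */ }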
uint32_t llama_model_quantize(
const size_t s_embedding = ctx->embedding.size() * sizeof(float);
const size_t s_kv_size = sizeof(size_t);
const size_t s_kv_ntok = sizeof(int);
- const size_t s_kv = ggml_backend_buffer_get_size(ctx->kv_self.buf);
+ const size_t s_kv = ctx->kv_self.total_size();
const size_t s_total = (
+ s_rng_size
const auto n_embd_v_gqa = hparams.n_embd_v_gqa();
const auto n_ctx = cparams.n_ctx;
- const size_t kv_buf_size = ggml_backend_buffer_get_size(kv_self.buf);
+ const size_t kv_buf_size = kv_self.total_size();
const uint32_t kv_head = kv_self.head;
const uint32_t kv_size = kv_self.size;
const uint32_t kv_used = kv_self.used;
if (kv_buf_size) {
const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
- ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
- ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
-
- std::vector<struct ggml_tensor *> kout2d(n_layer);
- std::vector<struct ggml_tensor *> vout2d(n_layer);
-
- for (int il = 0; il < (int) n_layer; ++il) {
- kout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
- vout2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);
-
- ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
- n_embd_k_gqa, kv_head,
- elt_size*n_embd_k_gqa, 0);
-
- ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
- kv_head, n_embd_v_gqa,
- elt_size*n_ctx, 0);
-
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, k2d, kout2d[il]));
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, v2d, vout2d[il]));
- }
-
- ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
-
- ggml_backend_graph_compute(ctx->backend, gf);
-
std::vector<uint8_t> tmp_buf;
for (int il = 0; il < (int) n_layer; ++il) {
- tmp_buf.resize(ggml_nbytes(kout2d[il]));
- ggml_backend_tensor_get(kout2d[il], tmp_buf.data(), 0, tmp_buf.size());
+ tmp_buf.resize(elt_size*n_embd_k_gqa*kv_head);
+ ggml_backend_tensor_get(kv_self.k_l[il], tmp_buf.data(), 0, tmp_buf.size());
data_ctx->write(tmp_buf.data(), tmp_buf.size());
- tmp_buf.resize(ggml_nbytes(vout2d[il]));
- ggml_backend_tensor_get(vout2d[il], tmp_buf.data(), 0, tmp_buf.size());
- data_ctx->write(tmp_buf.data(), tmp_buf.size());
+ // v is not contiguous, copy row by row
+ tmp_buf.resize(elt_size*kv_head);
+ for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
+ ggml_backend_tensor_get(kv_self.v_l[il], tmp_buf.data(), ir*elt_size*n_ctx, tmp_buf.size());
+ data_ctx->write(tmp_buf.data(), tmp_buf.size());
+ }
}
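// layout note: k_l[il] stores n_embd_k_gqa contiguous values per cell, so the first kv_head
// cells form one contiguous block of elt_size*n_embd_k_gqa*kv_head bytes; v_l[il] is stored
// transposed, with n_ctx values per row and n_embd_v_gqa rows, so only the first kv_head
// values of each row are in use and row ir starts at byte offset ir*elt_size*n_ctx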
-
- ggml_free(cpy_ctx);
-
- ggml_backend_buffer_free(buf);
}
for (uint32_t i = 0; i < kv_size; ++i) {
memcpy(&kv_used, inp, sizeof(kv_used)); inp += sizeof(kv_used);
if (kv_buf_size) {
- GGML_ASSERT(ggml_backend_buffer_get_size(kv_self.buf) == kv_buf_size);
+ GGML_ASSERT(kv_self.total_size() == kv_buf_size);
const size_t elt_size = ggml_element_size(kv_self.k_l[0]);
- ggml_context * cpy_ctx = ggml_init({ 6*n_layer*ggml_tensor_overhead() + ggml_graph_overhead(), NULL, /* no_alloc */ true });
- ggml_cgraph * gf = ggml_new_graph(cpy_ctx);
-
- std::vector<struct ggml_tensor *> kin2d(n_layer);
- std::vector<struct ggml_tensor *> vin2d(n_layer);
-
- for (int il = 0; il < n_layer; ++il) {
- kin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.k_l[il]->type, n_embd_k_gqa, kv_head);
- vin2d[il] = ggml_new_tensor_2d(cpy_ctx, kv_self.v_l[il]->type, kv_head, n_embd_v_gqa);
-
- ggml_tensor * k2d = ggml_view_2d(cpy_ctx, kv_self.k_l[il],
- n_embd_k_gqa, kv_head,
- elt_size*n_embd_k_gqa, 0);
-
- ggml_tensor * v2d = ggml_view_2d(cpy_ctx, kv_self.v_l[il],
- kv_head, n_embd_v_gqa,
- elt_size*n_ctx, 0);
-
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, kin2d[il], k2d));
- ggml_build_forward_expand(gf, ggml_cpy(cpy_ctx, vin2d[il], v2d));
- }
-
- ggml_backend_buffer_t buf = ggml_backend_alloc_ctx_tensors(cpy_ctx, ctx->backend);
-
- // load data into the tensors
- for (int il = 0; il < n_layer; ++il) {
- ggml_backend_tensor_set(kin2d[il], inp, 0, ggml_nbytes(kin2d[il]));
- inp += ggml_nbytes(kin2d[il]);
-
- ggml_backend_tensor_set(vin2d[il], inp, 0, ggml_nbytes(vin2d[il]));
- inp += ggml_nbytes(vin2d[il]);
+ for (int il = 0; il < (int) n_layer; ++il) {
+ size_t k_size = elt_size*n_embd_k_gqa*kv_head;
+ ggml_backend_tensor_set(kv_self.k_l[il], inp, 0, k_size);
+ inp += k_size;
+
+ // v is not contiguous, copy row by row
+ size_t v_row_size = elt_size*kv_head;
+ for (int ir = 0; ir < (int) n_embd_v_gqa; ++ir) {
+ ggml_backend_tensor_set(kv_self.v_l[il], inp, ir*elt_size*n_ctx, v_row_size);
+ inp += v_row_size;
+ }
}
-
- ggml_backend_graph_compute(ctx->backend, gf);
-
- ggml_free(cpy_ctx);
-
- ggml_backend_buffer_free(buf);
}
ctx->kv_self.head = kv_head;