--- /dev/null
+#include "ggml-blas.h"
+#include "ggml-backend-impl.h"
+
+#include <algorithm>
+#include <future>
+#include <memory>
+#include <vector>
+
+#if defined(GGML_USE_ACCELERATE)
+# include <Accelerate/Accelerate.h>
+#elif defined(GGML_BLAS_USE_MKL)
+# include <mkl.h>
+#else
+# include <cblas.h>
+# ifdef BLIS_ENABLE_CBLAS
+# include <blis.h>
+# endif
+#endif
+
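+// context for the BLAS backend: number of threads used for the src0 -> F32
+// conversion, and a lazily grown scratch buffer that holds the converted data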
+struct ggml_backend_blas_context {
+ int n_threads = GGML_DEFAULT_N_THREADS;
+ std::unique_ptr<char[]> work_data;
+ size_t work_size = 0;
+#ifndef GGML_USE_OPENMP
+ std::vector<std::future<void>> tasks;
+#endif
+};
+
+// helper function to determine if it is better to use BLAS or not
+// for large matrices, BLAS is faster
+static bool ggml_backend_blas_use_blas(const struct ggml_tensor * dst) {
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
+ const int64_t ne10 = src1->ne[0];
+
+ const int64_t ne0 = dst->ne[0];
+ const int64_t ne1 = dst->ne[1];
+
+ // TODO: find the optimal values for these
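+    // the 32/32/32 thresholds restrict BLAS to batched matrix multiplications;
+    // single-row products (e.g. ne1 == 1 during token-by-token decoding) are
+    // rejected so that they are scheduled on another backend (typically the CPU)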
+ if (ggml_is_contiguous(src0) &&
+ ggml_is_contiguous(src1) &&
+ src1->type == GGML_TYPE_F32 &&
+ (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
+ return true;
+ }
+
+ return false;
+}
+
+static void ggml_backend_blas_mul_mat(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+ const enum ggml_type type = src0->type;
+
+ GGML_ASSERT(ne0 == ne01);
+ GGML_ASSERT(ne1 == ne11);
+ GGML_ASSERT(ne2 == ne12);
+ GGML_ASSERT(ne3 == ne13);
+
+ // we don't support permuted src0 or src1
+ GGML_ASSERT(nb00 == ggml_type_size(type));
+ GGML_ASSERT(nb10 == ggml_type_size(src1->type));
+
+ // dst cannot be transposed or permuted
+ GGML_ASSERT(nb0 == sizeof(float));
+ GGML_ASSERT(nb0 <= nb1);
+ GGML_ASSERT(nb1 <= nb2);
+ GGML_ASSERT(nb2 <= nb3);
+
+ // broadcast factors
+ const int64_t r2 = ne12/ne02;
+ const int64_t r3 = ne13/ne03;
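+    // src0 planes are reused across the batch dimensions of src1: plane (i02, i03)
+    // of src0 is matched with planes (i12, i13) of src1 where i02 = i12/r2, i03 = i13/r3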
+
+ const int64_t ne_plane = ne01*ne00;
+ const size_t desired_wsize = type == GGML_TYPE_F32 ? 0 : ne03*ne02*ne_plane*sizeof(float);
+
+ if (ctx->work_size < desired_wsize) {
+ ctx->work_data.reset(new char[desired_wsize]);
+ ctx->work_size = desired_wsize;
+ }
+ void * wdata = ctx->work_data.get();
+
+ // convert src0 to float
+ if (type != GGML_TYPE_F32) {
+ ggml_type_traits_t type_traits = ggml_internal_get_type_traits(type);
+ ggml_to_float_t const to_float = type_traits.to_float;
+
+ for (int64_t i03 = 0; i03 < ne03; i03++) {
+ for (int64_t i02 = 0; i02 < ne02; i02++) {
+ const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
+ float * const wplane = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane;
+
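+                // limit the number of threads so that each one converts at least
+                // min_cols_per_thread elements, otherwise thread startup costs dominate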
+ const int min_cols_per_thread = 4096;
+ const int min_rows_per_thread = std::max((int)(min_cols_per_thread/ne00), 1);
+ const int n_threads = std::max(std::min(ctx->n_threads, (int)(ne01/min_rows_per_thread)), 1);
+
+#ifdef GGML_USE_OPENMP
+ #pragma omp parallel for num_threads(n_threads)
+ for (int64_t i01 = 0; i01 < ne01; i01++) {
+ to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
+ }
+#else
+ for (int i = 1; i < n_threads; i++) {
+ const int64_t start = i*ne01/n_threads;
+ const int64_t end = (i + 1)*ne01/n_threads;
+ if (start < end) {
+ ctx->tasks.push_back(std::async(std::launch::async, [=]() {
+ for (int64_t i01 = start; i01 < end; i01++) {
+ to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
+ }
+ }));
+ }
+ }
+ {
+ // reuse the current thread for the first task
+ const int64_t start = 0;
+ const int64_t end = ne01/n_threads;
+ for (int64_t i01 = start; i01 < end; i01++) {
+ to_float((const char *) x + i01*nb01, wplane + i01*ne00, ne00);
+ }
+ }
+#endif
+ }
+ }
+
+#ifndef GGML_USE_OPENMP
+ // wait for all tasks to finish
+ for (auto & task : ctx->tasks) {
+ task.get();
+ }
+ ctx->tasks.clear();
+#endif
+ }
+
+#if defined(OPENBLAS_VERSION)
+ openblas_set_num_threads(ctx->n_threads);
+#endif
+
+#if defined(BLIS_ENABLE_CBLAS)
+ bli_thread_set_num_threads(ctx->n_threads);
+#endif
+
+ for (int64_t i13 = 0; i13 < ne13; i13++) {
+ for (int64_t i12 = 0; i12 < ne12; i12++) {
+ const int64_t i03 = i13/r3;
+ const int64_t i02 = i12/r2;
+
+ const float * x = (float *) ((char *) src0->data + i02*nb02 + i03*nb03);
+ const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
+ float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
+
+ if (type != GGML_TYPE_F32) {
+ x = (float *) wdata + i02*ne_plane + i03*ne02*ne_plane;
+ }
+
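+            // row-major GEMM: C (ne1 x ne01) = A * B^T, where A is the src1 plane
+            // (ne1 rows of ne10 floats) and B is the src0 plane (ne01 rows of ne00 == ne10 floats)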
+ cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
+ ne1, ne01, ne10,
+ 1.0f, y, ne10,
+ x, ne00,
+ 0.0f, d, ne01);
+ }
+ }
+}
+
+static void ggml_backend_blas_out_prod(ggml_backend_blas_context * ctx, struct ggml_tensor * dst) {
+ const struct ggml_tensor * src0 = dst->src[0];
+ const struct ggml_tensor * src1 = dst->src[1];
+
+ GGML_TENSOR_BINARY_OP_LOCALS
+
+ GGML_ASSERT(ne0 == ne00);
+ GGML_ASSERT(ne1 == ne10);
+ GGML_ASSERT(ne2 == ne02);
+ GGML_ASSERT(ne02 == ne12);
+ GGML_ASSERT(ne3 == ne13);
+ GGML_ASSERT(ne03 == ne13);
+
+ // we don't support permuted src0 or src1
+ GGML_ASSERT(nb00 == sizeof(float));
+
+ // dst cannot be transposed or permuted
+ GGML_ASSERT(nb0 == sizeof(float));
+ // GGML_ASSERT(nb0 <= nb1);
+ // GGML_ASSERT(nb1 <= nb2);
+ // GGML_ASSERT(nb2 <= nb3);
+
+ // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
+ // src0: (k,n)
+ // src1: (k,m)
+ // dst: (m,n)
+ //
+ // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
+ // Also expressed as (major,minor)
+ // a: (m,k): so src1 transposed
+ // b: (k,n): so src0
+ // c: (m,n)
+ //
+ // However, if ggml_is_transposed(src1) is true, then
+ // src1->data already contains a transposed version, so sgemm mustn't
+ // transpose it further.
+
+ int n = src0->ne[0];
+ int k = src0->ne[1];
+ int m = src1->ne[0];
+
+ CBLAS_TRANSPOSE transposeA;
+ int lda;
+
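+    // a non-transposed src1 is stored as k rows of m floats, so sgemm is asked to
+    // transpose it (lda = m); a transposed src1 already stores m rows of k floats
+    // and is used directly (lda = k)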
+ if (!ggml_is_transposed(src1)) {
+ transposeA = CblasTrans;
+ lda = m;
+ } else {
+ transposeA = CblasNoTrans;
+ lda = k;
+ }
+
+ float * a = (float *) ((char *) src1->data);
+ float * b = (float *) ((char *) src0->data);
+ float * c = (float *) ((char *) dst->data);
+
+ cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
+
+ GGML_UNUSED(ctx);
+}
+
+// backend interface
+
+GGML_CALL static const char * ggml_backend_blas_name(ggml_backend_t backend) {
+ return "BLAS";
+
+ GGML_UNUSED(backend);
+}
+
+GGML_CALL static void ggml_backend_blas_free(ggml_backend_t backend) {
+ ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
+ delete ctx;
+ delete backend;
+}
+
+GGML_CALL static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
+ return ggml_backend_cpu_buffer_type();
+
+ GGML_UNUSED(backend);
+}
+
+GGML_CALL static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+ ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
+
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ struct ggml_tensor * node = cgraph->nodes[i];
+
+ switch (node->op) {
+ case GGML_OP_MUL_MAT:
+ ggml_backend_blas_mul_mat(ctx, node);
+ break;
+
+ case GGML_OP_OUT_PROD:
+ ggml_backend_blas_out_prod(ctx, node);
+ break;
+
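+            // these ops only modify the tensor's metadata, there is no data to compute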
+ case GGML_OP_NONE:
+ case GGML_OP_RESHAPE:
+ case GGML_OP_VIEW:
+ case GGML_OP_PERMUTE:
+ case GGML_OP_TRANSPOSE:
+ break;
+
+ default:
+ fprintf(stderr, "%s: unsupported op %s\n", __func__, ggml_op_desc(node));
+ GGML_ASSERT(false);
+ }
+ }
+
+ return GGML_STATUS_SUCCESS;
+
+ GGML_UNUSED(backend);
+}
+
+GGML_CALL static bool ggml_backend_blas_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+ const struct ggml_tensor * src0 = op->src[0];
+ const struct ggml_tensor * src1 = op->src[1];
+
+ return (op->op == GGML_OP_MUL_MAT && ggml_backend_blas_use_blas(op)) ||
+ (op->op == GGML_OP_OUT_PROD && op->src[0]->type == GGML_TYPE_F32 &&
+ op->src[1]->type == GGML_TYPE_F32 &&
+ ggml_is_matrix(src0) &&
+ ggml_is_matrix(src1) &&
+ ggml_is_contiguous(src0) &&
+ (ggml_is_contiguous(src1) || ggml_is_transposed(src1)));
+
+ GGML_UNUSED(backend);
+}
+
+GGML_CALL static bool ggml_backend_blas_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+ return ggml_backend_buft_is_host(buft);
+
+ GGML_UNUSED(backend);
+}
+
+static struct ggml_backend_i blas_backend_i = {
+ /* .get_name = */ ggml_backend_blas_name,
+ /* .free = */ ggml_backend_blas_free,
+ /* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type,
+ /* .set_tensor_async = */ NULL,
+ /* .get_tensor_async = */ NULL,
+ /* .cpy_tensor_async = */ NULL,
+ /* .synchronize = */ NULL,
+ /* .graph_plan_create = */ NULL,
+ /* .graph_plan_free = */ NULL,
+ /* .graph_plan_update = */ NULL,
+ /* .graph_plan_compute = */ NULL,
+ /* .graph_compute = */ ggml_backend_blas_graph_compute,
+ /* .supports_op = */ ggml_backend_blas_supports_op,
+ /* .supports_buft = */ ggml_backend_blas_supports_buft,
+ /* .offload_op = */ NULL,
+ /* .event_new = */ NULL,
+ /* .event_free = */ NULL,
+ /* .event_record = */ NULL,
+ /* .event_wait = */ NULL,
+ /* .event_synchronize = */ NULL,
+};
+
+static ggml_guid_t ggml_backend_blas_guid(void) {
+ static ggml_guid guid = { 0x12, 0xa8, 0xae, 0xf4, 0xc0, 0x1e, 0x61, 0x97, 0x8f, 0xeb, 0x33, 0x04, 0xa1, 0x33, 0x51, 0x2d };
+ return &guid;
+}
+
+ggml_backend_t ggml_backend_blas_init(void) {
+ ggml_backend_blas_context * ctx = new ggml_backend_blas_context;
+
+ ggml_backend_t backend = new ggml_backend {
+ /* .guid = */ ggml_backend_blas_guid(),
+ /* .interface = */ blas_backend_i,
+ /* .context = */ ctx,
+ };
+
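+    // warn about likely thread oversubscription: if ggml uses OpenMP but the BLAS
+    // library spawns its own thread pool, the two sets of threads will compete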
+#if !defined(NDEBUG) && defined(OPENBLAS_VERSION) && defined(GGML_USE_OPENMP)
+ if (openblas_get_parallel() != OPENBLAS_OPENMP) {
+ fprintf(stderr, "%s: warning: ggml is using OpenMP, but OpenBLAS was compiled without OpenMP support\n", __func__);
+ }
+#endif
+
+#if !defined(NDEBUG) && defined(BLIS_ENABLE_CBLAS) && defined(GGML_USE_OPENMP) && !defined(BLIS_ENABLE_OPENMP)
+ fprintf(stderr, "%s: warning: ggml is using OpenMP, but BLIS was compiled without OpenMP support\n", __func__);
+#endif
+
+ return backend;
+}
+
+GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend) {
+ return backend != NULL && ggml_guid_matches(backend->guid, ggml_backend_blas_guid());
+}
+
+void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads) {
+ GGML_ASSERT(ggml_backend_is_blas(backend_blas));
+
+ ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend_blas->context;
+ ctx->n_threads = n_threads;
+}
--- /dev/null
+#pragma once
+
+#include "ggml.h"
+#include "ggml-backend.h"
+
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+// backend API
+GGML_API GGML_CALL ggml_backend_t ggml_backend_blas_init(void);
+
+GGML_API GGML_CALL bool ggml_backend_is_blas(ggml_backend_t backend);
+
+// number of threads used for conversion to float
+// for openblas and blis, this will also set the number of threads used for blas operations
+GGML_API GGML_CALL void ggml_backend_blas_set_n_threads(ggml_backend_t backend_blas, int n_threads);
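+
+// example usage (sketch):
+//
+//   ggml_backend_t backend = ggml_backend_blas_init();
+//   ggml_backend_blas_set_n_threads(backend, 8);
+//   // ... build a graph with tensors allocated in a host buffer ...
+//   ggml_backend_graph_compute(backend, graph);
+//   ggml_backend_free(backend);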
+
+
+#ifdef __cplusplus
+}
+#endif
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
- GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
// buffer
GGML_API enum ggml_status ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API enum ggml_status ggml_backend_graph_compute_async(ggml_backend_t backend, struct ggml_cgraph * cgraph);
GGML_API bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op);
+ GGML_API bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
GGML_API bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op);
// tensor copy between different backends
GGML_API void ggml_backend_event_free (ggml_backend_event_t event);
GGML_API void ggml_backend_event_record (ggml_backend_event_t event);
GGML_API void ggml_backend_event_synchronize(ggml_backend_event_t event);
- GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event); // wait async on event
+ GGML_API void ggml_backend_event_wait (ggml_backend_t backend, ggml_backend_event_t event);
//
// CPU backend
GGML_API size_t ggml_backend_reg_get_count(void);
GGML_API size_t ggml_backend_reg_find_by_name(const char * name);
- GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is name[:params]
+ GGML_API ggml_backend_t ggml_backend_reg_init_backend_from_str(const char * backend_str); // str is backend_name:params (params is optional)
GGML_API const char * ggml_backend_reg_get_name(size_t i);
GGML_API ggml_backend_t ggml_backend_reg_init_backend(size_t i, const char * params); // params is backend-specific
GGML_API ggml_backend_buffer_type_t ggml_backend_reg_get_default_buffer_type(size_t i);
};
struct tensor_alloc {
+ int buffer_id;
size_t offset;
size_t size_max; // 0 = pre-allocated, unused, or view
};
};
struct node_alloc {
- int buffer_id;
struct tensor_alloc dst;
struct tensor_alloc src[GGML_MAX_SRC];
};
for (int i = 0; i < n_bufs; i++) {
galloc->bufts[i] = bufts[i];
galloc->buffers[i] = NULL;
- size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
- galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+
+ // check if the same buffer type is used multiple times and reuse the same allocator
+ for (int j = 0; j < i; j++) {
+ if (bufts[i] == bufts[j]) {
+ galloc->buf_tallocs[i] = galloc->buf_tallocs[j];
+ break;
+ }
+ }
+
+ if (galloc->buf_tallocs[i] == NULL) {
+ size_t alignment = ggml_backend_buft_get_alignment(bufts[i]);
+ galloc->buf_tallocs[i] = ggml_dyn_tallocr_new(alignment);
+ }
}
galloc->n_buffers = n_bufs;
for (int i = 0; i < galloc->n_buffers; i++) {
if (galloc->buffers != NULL) {
- ggml_backend_buffer_free(galloc->buffers[i]);
+ // skip if already freed
+ bool freed = false;
+ for (int j = 0; j < i; j++) {
+ if (galloc->buffers[j] == galloc->buffers[i]) {
+ freed = true;
+ break;
+ }
+ }
+ if (!freed) {
+ ggml_backend_buffer_free(galloc->buffers[i]);
+ }
}
if (galloc->buf_tallocs != NULL) {
- ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+ // skip if already freed
+ bool freed = false;
+ for (int j = 0; j < i; j++) {
+ if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
+ freed = true;
+ break;
+ }
+ }
+ if (!freed) {
+ ggml_dyn_tallocr_free(galloc->buf_tallocs[i]);
+ }
}
}
}
}
-static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node, int buffer_id) {
+static void ggml_gallocr_free_node(ggml_gallocr_t galloc, struct ggml_tensor * node) {
// graph outputs are never freed
if (node->flags & GGML_TENSOR_FLAG_OUTPUT) {
AT_PRINTF("not freeing output %s\n", node->name);
return;
}
- struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
- ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
size_t offset = hn->offset;
+ int buffer_id = hn->buffer_id;
+ struct ggml_dyn_tallocr * alloc = galloc->buf_tallocs[buffer_id];
+ ggml_backend_buffer_type_t buft = galloc->bufts[buffer_id];
size_t size = ggml_backend_buft_get_alloc_size(buft, node);
ggml_dyn_tallocr_free_tensor(alloc, offset, size, node);
hn->allocated = false;
AT_PRINTF("view_src %s: %d children, %d views\n",
view_src->name, view_src_hn->n_children, view_src_hn->n_views);
if (view_src_hn->n_views == 0 && view_src_hn->n_children == 0 && view_src_hn->allocated) {
- ggml_gallocr_free_node(galloc, view_src, buffer_id);
+ ggml_gallocr_free_node(galloc, view_src);
}
}
else if (p_hn->allocated) {
- ggml_gallocr_free_node(galloc, parent, buffer_id);
+ ggml_gallocr_free_node(galloc, parent);
}
}
AT_PRINTF("\n");
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
struct node_alloc * node_alloc = &galloc->node_allocs[i];
- node_alloc->buffer_id = get_node_buffer_id(node_buffer_ids, i);
if (node->view_src || node->data) {
+ node_alloc->dst.buffer_id = -1;
node_alloc->dst.offset = SIZE_MAX;
node_alloc->dst.size_max = 0;
} else {
struct hash_node * hn = ggml_gallocr_hash_get(galloc, node);
- node_alloc->dst.offset = hn->offset;
- node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
+ node_alloc->dst.buffer_id = hn->buffer_id;
+ node_alloc->dst.offset = hn->offset;
+ node_alloc->dst.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], node);
}
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (!src || src->view_src || src->data) {
+ node_alloc->src[j].buffer_id = -1;
node_alloc->src[j].offset = SIZE_MAX;
node_alloc->src[j].size_max = 0;
} else {
struct hash_node * hn = ggml_gallocr_hash_get(galloc, src);
+ node_alloc->src[j].buffer_id = hn->buffer_id;
node_alloc->src[j].offset = hn->offset;
node_alloc->src[j].size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], src);
}
struct hash_node * hn = ggml_gallocr_hash_get(galloc, leaf);
galloc->leaf_allocs[i].buffer_id = hn->buffer_id;
if (leaf->view_src || leaf->data) {
+ galloc->leaf_allocs[i].leaf.buffer_id = -1;
galloc->leaf_allocs[i].leaf.offset = SIZE_MAX;
galloc->leaf_allocs[i].leaf.size_max = 0;
} else {
+ galloc->leaf_allocs[i].leaf.buffer_id = hn->buffer_id;
galloc->leaf_allocs[i].leaf.offset = hn->offset;
galloc->leaf_allocs[i].leaf.size_max = ggml_backend_buft_get_alloc_size(galloc->bufts[hn->buffer_id], leaf);
}
// reallocate buffers if needed
for (int i = 0; i < galloc->n_buffers; i++) {
+ // if the buffer type is used multiple times, we reuse the same buffer
+ for (int j = 0; j < i; j++) {
+ if (galloc->buf_tallocs[j] == galloc->buf_tallocs[i]) {
+ galloc->buffers[i] = galloc->buffers[j];
+ break;
+ }
+ }
+
size_t cur_size = galloc->buffers[i] ? ggml_backend_buffer_get_size(galloc->buffers[i]) : 0;
size_t new_size = ggml_dyn_tallocr_max_size(galloc->buf_tallocs[i]);
#ifndef NDEBUG
fprintf(stderr, "%s: reallocating %s buffer from size %.02f MiB to %.02f MiB\n", __func__, ggml_backend_buft_name(galloc->bufts[i]), cur_size / 1024.0 / 1024.0, new_size / 1024.0 / 1024.0);
#endif
+
ggml_backend_buffer_free(galloc->buffers[i]);
galloc->buffers[i] = ggml_backend_buft_alloc_buffer(galloc->bufts[i], new_size);
if (galloc->buffers[i] == NULL) {
return ggml_gallocr_reserve_n(galloc, graph, NULL, NULL);
}
-static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, int buffer_id, struct tensor_alloc * tensor_alloc) {
+static void ggml_gallocr_init_tensor(ggml_gallocr_t galloc, struct ggml_tensor * tensor, struct tensor_alloc * tensor_alloc) {
+ int buffer_id = tensor_alloc->buffer_id;
assert(tensor->data || tensor->view_src || ggml_backend_buffer_get_alloc_size(galloc->buffers[buffer_id], tensor) <= tensor_alloc->size_max);
if (tensor->view_src != NULL) {
}
}
-static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct node_alloc * nalloc, struct tensor_alloc * talloc) {
- ggml_backend_buffer_type_t buft = galloc->bufts[nalloc->buffer_id];
+static bool ggml_gallocr_node_needs_realloc(ggml_gallocr_t galloc, struct ggml_tensor * node, struct tensor_alloc * talloc) {
+ ggml_backend_buffer_type_t buft = talloc->buffer_id != -1 ? galloc->bufts[talloc->buffer_id] : NULL;
size_t node_size = (node->data || node->view_src) ? 0 : ggml_backend_buft_get_alloc_size(buft, node);
return talloc->size_max >= node_size;
}
struct ggml_tensor * node = graph->nodes[i];
struct node_alloc * node_alloc = &galloc->node_allocs[i];
- if (!ggml_gallocr_node_needs_realloc(galloc, node, node_alloc, &node_alloc->dst)) {
+ if (!ggml_gallocr_node_needs_realloc(galloc, node, &node_alloc->dst)) {
#ifndef NDEBUG
fprintf(stderr, "%s: node %s is not valid\n", __func__, node->name);
#endif
if (src == NULL) {
continue;
}
- if (!ggml_gallocr_node_needs_realloc(galloc, src, node_alloc, &node_alloc->src[j])) {
+ if (!ggml_gallocr_node_needs_realloc(galloc, src, &node_alloc->src[j])) {
#ifndef NDEBUG
fprintf(stderr, "%s: src %d (%s) of node %s is not valid\n", __func__, j, src->name, node->name);
#endif
for (int i = 0; i < graph->n_leafs; i++) {
struct ggml_tensor * leaf = graph->leafs[i];
struct leaf_alloc * leaf_alloc = &galloc->leaf_allocs[i];
- ggml_gallocr_init_tensor(galloc, leaf, leaf_alloc->buffer_id, &leaf_alloc->leaf);
+ ggml_gallocr_init_tensor(galloc, leaf, &leaf_alloc->leaf);
}
// nodes
for (int i = 0; i < graph->n_nodes; i++) {
if (src == NULL) {
continue;
}
- ggml_gallocr_init_tensor(galloc, src, node_alloc->buffer_id, &node_alloc->src[j]);
+ ggml_gallocr_init_tensor(galloc, src, &node_alloc->src[j]);
}
- ggml_gallocr_init_tensor(galloc, node, node_alloc->buffer_id, &node_alloc->dst);
+ ggml_gallocr_init_tensor(galloc, node, &node_alloc->dst);
}
return true;
if (galloc->buffers[buffer_id] == NULL) {
return 0;
}
+
+ for (int i = 0; i < buffer_id; i++) {
+ if (galloc->buffers[i] == galloc->buffers[buffer_id]) {
+ // this buffer is the same as a previous one due to the same buffer type being used multiple times
+ // only return the buffer size the first time it appears to avoid double counting
+ return 0;
+ }
+ }
+
return ggml_backend_buffer_get_size(galloc->buffers[buffer_id]);
}
struct ggml_backend_buffer_type_i {
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
+ // allocate a buffer of this type
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
- size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
- size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
- size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
- bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
+ // tensor alignment
+ size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft);
+ // max buffer size that can be allocated
+ size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft);
+ // data size needed to allocate the tensor, including padding
+ size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
// check if tensor data is in host memory
- // should be equivalent to supports_backend(buft, ggml_backend_cpu_init())
bool (*GGML_CALL is_host) (ggml_backend_buffer_type_t buft);
};
void (*GGML_CALL synchronize)(ggml_backend_t backend);
// compute graph with a plan (not used currently)
+ // create a new plan for a graph
ggml_backend_graph_plan_t (*GGML_CALL graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*GGML_CALL graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
-    // compute graph with a plan
-    enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+    // update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
+    void (*GGML_CALL graph_plan_update)    (ggml_backend_t backend, ggml_backend_graph_plan_t plan, const struct ggml_cgraph * cgraph);
+    // compute the graph with the plan
+    enum ggml_status (*GGML_CALL graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// compute graph without a plan (async)
enum ggml_status (*GGML_CALL graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
- // check if the backend supports an operation
+ // check if the backend can compute an operation
bool (*GGML_CALL supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+ // check if the backend can use tensors allocated in a buffer type
+ bool (*GGML_CALL supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
+
// check if the backend wants to run an operation, even if the weights are allocated in a CPU buffer
// these should be expensive operations with large batch sizes that may benefit from running on this backend
// even if the weight has to be copied from the CPU temporarily
bool (*GGML_CALL offload_op)(ggml_backend_t backend, const struct ggml_tensor * op);
// (optional) event synchronization
+ // create a new event that can record events on this backend instance
ggml_backend_event_t (*GGML_CALL event_new) (ggml_backend_t backend);
void (*GGML_CALL event_free) (ggml_backend_event_t event);
+ // record an event on the backend instance that created it
void (*GGML_CALL event_record) (ggml_backend_event_t event);
+    // wait for an event on a different backend instance
void (*GGML_CALL event_wait) (ggml_backend_t backend, ggml_backend_event_t event);
+ // block until an event is recorded
void (*GGML_CALL event_synchronize) (ggml_backend_event_t event);
};
return ggml_nbytes(tensor);
}
-bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- return buft->iface.supports_backend(buft, backend);
-}
-
bool ggml_backend_buft_is_host(ggml_backend_buffer_type_t buft) {
if (buft->iface.is_host) {
return buft->iface.is_host(buft);
return backend->iface.supports_op(backend, op);
}
+bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+ return backend->iface.supports_buft(backend, buft);
+}
+
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
if (backend->iface.offload_op != NULL) {
return backend->iface.offload_op(backend, op);
GGML_UNUSED(buft);
}
-GGML_CALL static bool ggml_backend_cpu_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- return ggml_backend_is_cpu(backend);
-
- GGML_UNUSED(buft);
-}
-
GGML_CALL static bool ggml_backend_cpu_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return true;
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
- /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
},
/* .context = */ NULL,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
- /* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
},
/* .context = */ NULL,
GGML_UNUSED(backend);
}
+GGML_CALL static bool ggml_backend_cpu_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+ return ggml_backend_buft_is_host(buft);
+
+ GGML_UNUSED(backend);
+}
+
static struct ggml_backend_i cpu_backend_i = {
/* .get_name = */ ggml_backend_cpu_name,
/* .free = */ ggml_backend_cpu_free,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
/* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
+ /* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
/* .supports_op = */ ggml_backend_cpu_supports_op,
+ /* .supports_buft = */ ggml_backend_cpu_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
int * node_backend_ids; // [graph_size]
int * leaf_backend_ids; // [graph_size]
+ int * prev_node_backend_ids; // [graph_size]
+ int * prev_leaf_backend_ids; // [graph_size]
+
// copy of the graph with modified inputs
struct ggml_cgraph * graph;
ggml_backend_sched_eval_callback callback_eval;
void * callback_eval_user_data;
+ bool debug;
+
// align context_buffer to GGML_MEM_ALIGN
#ifdef _MSC_VER
__declspec(align(GGML_MEM_ALIGN))
return -1;
}
-static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor) {
+static int ggml_backend_sched_backend_from_buffer(ggml_backend_sched_t sched, const struct ggml_tensor * tensor, const struct ggml_tensor * op) {
ggml_backend_buffer_t buffer = tensor->buffer;
if (buffer == NULL) {
return -1;
}
- // find highest prio backend that supports the buffer type
+ // find highest prio backend that supports the buffer type and the op
for (int i = 0; i < sched->n_backends; i++) {
- if (ggml_backend_buft_supports_backend(buffer->buft, sched->backends[i])) {
+ if (ggml_backend_supports_buft(sched->backends[i], buffer->buft) &&
+ ggml_backend_supports_op(sched->backends[i], op)) {
return i;
}
}
- fprintf(stderr, "%s: error: no backend supports buffer type %s used in tensor %s\n",
- __func__, ggml_backend_buffer_name(buffer), tensor->name);
- GGML_ASSERT(false);
+#ifndef NDEBUG
+ fprintf(stderr, "%s: warning: no backend supports op %s with a weight with buffer type %s used in tensor %s, the weight will need to be copied\n",
+ __func__, ggml_op_desc(tensor), ggml_backend_buffer_name(buffer), tensor->name);
+#endif
return -1;
}
// TODO: use supports_op to check if the backend supports the op
// assign pre-allocated nodes to their backend
- int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor);
+ int cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor, tensor);
if (cur_backend_id != -1) {
SET_CAUSE(tensor, "1.dst");
return cur_backend_id;
// view_src
if (tensor->view_src != NULL) {
- cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src);
+ cur_backend_id = ggml_backend_sched_backend_from_buffer(sched, tensor->view_src, tensor);
if (cur_backend_id != -1) {
SET_CAUSE(tensor, "1.vsrc");
return cur_backend_id;
continue;
}
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
- int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src);
+ int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
// check if a backend with higher prio wants to offload the op
if (src_backend_id == sched->n_backends - 1) {
for (int b = 0; b < src_backend_id; b++) {
}
}
-//#define DEBUG_PASS1
-//#define DEBUG_PASS2
-//#define DEBUG_PASS3
-//#define DEBUG_PASS4
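+// check if the tensor is allocated in a buffer type that the given backend can use;
+// if the tensor is not allocated yet, check the buffer type of the backend it is
+// assigned to instead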
+static bool ggml_backend_sched_buffer_supported(ggml_backend_sched_t sched, struct ggml_tensor * t, int backend_id) {
+ ggml_backend_buffer_t buf = t->view_src ? t->view_src->buffer : t->buffer;
+ ggml_backend_buffer_type_t buft = NULL;
+
+ if (buf) {
+ // the tensor is already allocated
+ buft = buf->buft;
+ } else {
+ // see if the tensor already has a backend assigned, and use the buffer type of that backend
+ int tensor_backend_id = tensor_backend_id(t);
+ if (tensor_backend_id == -1 && t->view_src) {
+ tensor_backend_id = tensor_backend_id(t->view_src);
+ }
+ if (tensor_backend_id != -1) {
+ buft = sched->bufts[tensor_backend_id];
+ }
+ }
+
+ return buft != NULL && ggml_backend_supports_buft(sched->backends[backend_id], buft);
+}
+
+static void ggml_backend_sched_set_if_supported(ggml_backend_sched_t sched, struct ggml_tensor * node, int cur_backend_id, int * node_backend_id) {
+ if (ggml_backend_supports_op(sched->backends[cur_backend_id], node)) {
+ *node_backend_id = cur_backend_id;
+ SET_CAUSE(node, "2.sup");
+ }
+}
// assigns backends to ops and splits the graph into subgraphs that can be computed on the same backend
static void ggml_backend_sched_split_graph(ggml_backend_sched_t sched, struct ggml_cgraph * graph) {
}
}
}
-#ifdef DEBUG_PASS1
- fprintf(stderr, "PASS 1 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
// pass 2: expand current backend assignments
// assign the same backend to adjacent nodes
// expand gpu backends (i.e. non last prio) up and down, ignoring cpu (the lowest priority backend)
// thus, cpu will never be used unless weights are on cpu, or there are no gpu ops between cpu ops
-
-
- // pass 2.2 expand gpu down
+    // ops unsupported by the backend being expanded will be left unassigned so that they can be assigned later, when the locations of their inputs are known
+ // expand gpu down
{
int cur_backend_id = -1;
for (int i = 0; i < graph->n_nodes; i++) {
} else {
cur_backend_id = *node_backend_id;
}
- } else {
- *node_backend_id = cur_backend_id;
- SET_CAUSE(node, "2.2");
+ } else if (cur_backend_id != -1) {
+ ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
}
}
}
- // pass 2.1 expand gpu up
+ // expand gpu up
{
int cur_backend_id = -1;
for (int i = graph->n_nodes - 1; i >= 0; i--) {
} else {
cur_backend_id = *node_backend_id;
}
- } else {
- *node_backend_id = cur_backend_id;
- SET_CAUSE(node, "2.1");
+ } else if (cur_backend_id != -1) {
+ ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
}
}
}
- // pass 2.4 expand rest down
+ // expand rest down
{
int cur_backend_id = -1;
for (int i = 0; i < graph->n_nodes; i++) {
int * node_backend_id = &tensor_backend_id(node);
if (*node_backend_id != -1) {
cur_backend_id = *node_backend_id;
- } else {
- *node_backend_id = cur_backend_id;
- SET_CAUSE(node, "2.4");
+ } else if (cur_backend_id != -1) {
+ ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
}
}
}
- // pass 2.3 expand rest up
+ // expand rest up
{
int cur_backend_id = -1;
for (int i = graph->n_nodes - 1; i >= 0; i--) {
int * node_backend_id = &tensor_backend_id(node);
if (*node_backend_id != -1) {
cur_backend_id = *node_backend_id;
- } else {
- *node_backend_id = cur_backend_id;
- SET_CAUSE(node, "2.3");
+ } else if (cur_backend_id != -1) {
+ ggml_backend_sched_set_if_supported(sched, node, cur_backend_id, node_backend_id);
}
}
}
-#ifdef DEBUG_PASS2
- fprintf(stderr, "PASS 2 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
+ // pass 3: upgrade nodes to higher prio backends with compatible buffer types
+ // if the tensor is already in the same buffer type (*) as another higher priority backend, we should move it there
+ // however, we also need to verify that the sources are in compatible buffer types
+ // (*) the actual requirement is more relaxed, the buffer type of the backend should be supported by all the users of this tensor further down the graph
+ // however, this is slow to verify, so we have a more strict requirement that the buffer type is the same
+    // this is not uncommon since multiple backends can use host memory, with the same buffer type (e.g. BLAS and CPU)
+ // additionally, set remaining unassigned nodes to the backend with the most supported inputs
+ // only nodes that could not be assigned during expansion due to the backend not supporting the op should be unassigned at this point
+ for (int i = 0; i < graph->n_nodes; i++) {
+ struct ggml_tensor * node = graph->nodes[i];
+ if (ggml_is_view_op(node->op)) {
+ continue;
+ }
+ int * node_backend_id = &tensor_backend_id(node);
+ if (*node_backend_id == -1) {
+ // unassigned node: find the backend with the most supported inputs
+ int n_supported_best = -1;
+ for (int b = 0; b < sched->n_backends; b++) {
+ if (ggml_backend_supports_op(sched->backends[b], node)) {
+ int n_supported = 0;
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * src = node->src[j];
+ if (src == NULL) {
+ continue;
+ }
+ if ((tensor_backend_id(src) != -1 || tensor_backend_id(src->view_src) != -1) && ggml_backend_sched_buffer_supported(sched, src, b)) {
+ n_supported++;
+ }
+ }
+ if (n_supported > n_supported_best) {
+ n_supported_best = n_supported;
+ *node_backend_id = b;
+ SET_CAUSE(node, "3.best");
+ }
+ }
+ }
+ } else {
+ // assigned node: upgrade to higher prio backend if possible
+ for (int b = 0; b < *node_backend_id; b++) {
+ if (sched->bufts[b] == sched->bufts[*node_backend_id] && ggml_backend_supports_op(sched->backends[b], node)) {
+ bool supported = true;
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * src = node->src[j];
+ if (src == NULL) {
+ continue;
+ }
+ if (!ggml_backend_sched_buffer_supported(sched, src, b)) {
+ supported = false;
+ break;
+ }
+ }
+ if (supported) {
+ *node_backend_id = b;
+ SET_CAUSE(node, "3.upg");
+ break;
+ }
+ }
+ }
+ }
+ }
- // pass 3: assign backends to remaining src from dst and view_src
+ // pass 4: assign backends to remaining src from dst and view_src
for (int i = 0; i < graph->n_nodes; i++) {
struct ggml_tensor * node = graph->nodes[i];
int * cur_backend_id = &tensor_backend_id(node);
if (node->view_src != NULL && *cur_backend_id == -1) {
*cur_backend_id = tensor_backend_id(node->view_src);
- SET_CAUSE(node, "3.vsrc");
+ SET_CAUSE(node, "4.vsrc");
}
for (int j = 0; j < GGML_MAX_SRC; j++) {
struct ggml_tensor * src = node->src[j];
if (src->view_src != NULL) {
// views are always on the same backend as the source
*src_backend_id = tensor_backend_id(src->view_src);
- SET_CAUSE(src, "3.vsrc");
+ SET_CAUSE(src, "4.vsrc");
} else {
*src_backend_id = *cur_backend_id;
- SET_CAUSE(src, "3.cur");
+ SET_CAUSE(src, "4.cur");
}
}
}
}
-#ifdef DEBUG_PASS3
- fprintf(stderr, "PASS 3 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
-    // pass 4: split graph, find tensors that need to be copied
+    // pass 5: split graph, find tensors that need to be copied
{
}
}
// check if the split has too many inputs
+ // FIXME: count the number of inputs instead of only checking when full
if (split->n_inputs == GGML_SCHED_MAX_SPLIT_INPUTS) {
const size_t id = hash_id(src);
int src_backend_id = sched->tensor_backend_id[id];
- if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL) {
+ bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
+ if (src_backend_id != cur_backend_id && sched->tensor_copies[hash_id(src)][cur_backend_id][0] == NULL && !supported) {
//printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
need_new_split = true;
break;
const int src_backend_id = tensor_backend_id(src);
assert(src_backend_id != -1); // all inputs should be assigned by now
- if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
+ if (src->flags & GGML_TENSOR_FLAG_INPUT && sched->n_copies > 1) {
size_t id = hash_id(src);
if (sched->tensor_copies[id][src_backend_id][0] == NULL) {
ggml_backend_t backend = sched->backends[src_backend_id];
}
}
- if (src_backend_id != node_backend_id) {
+ bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
+ if (src_backend_id != cur_backend_id && !supported) {
// create a copy of the input in the split's backend
const size_t id = hash_id(src);
if (sched->tensor_copies[id][cur_backend_id][0] == NULL) {
split->i_end = graph->n_nodes;
sched->n_splits = i_split + 1;
}
-#ifdef DEBUG_PASS4
- fprintf(stderr, "PASS 4 ASSIGNMENTS\n"); ggml_backend_sched_print_assignments(sched, graph);
-#endif
+
+ if (sched->debug) {
+ ggml_backend_sched_print_assignments(sched, graph);
+ }
+
+ // swap node_backend_ids and leaf_backend_ids and prevs
+ {
+ int * tmp = sched->node_backend_ids;
+ sched->node_backend_ids = sched->prev_node_backend_ids;
+ sched->prev_node_backend_ids = tmp;
+
+ tmp = sched->leaf_backend_ids;
+ sched->leaf_backend_ids = sched->prev_leaf_backend_ids;
+ sched->prev_leaf_backend_ids = tmp;
+ }
// create copies of the graph for each split
// TODO: avoid this copy
}
static bool ggml_backend_sched_alloc_splits(ggml_backend_sched_t sched) {
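+    // if the backend assignment of any node changed since the previous graph, the
+    // buffers must be reallocated even if the tensors would still fit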
+ bool backend_ids_changed = false;
+ for (int i = 0; i < sched->graph->n_nodes; i++) {
+ if (sched->node_backend_ids[i] != sched->prev_node_backend_ids[i]) {
+ backend_ids_changed = true;
+ break;
+ }
+ }
+ if (!backend_ids_changed) {
+ for (int i = 0; i < sched->graph->n_leafs; i++) {
+ if (sched->leaf_backend_ids[i] != sched->prev_leaf_backend_ids[i]) {
+ backend_ids_changed = true;
+ break;
+ }
+ }
+ }
+
// allocate graph
- if (!ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
+ if (backend_ids_changed || !ggml_gallocr_alloc_graph(sched->galloc, sched->graph)) {
// the re-allocation may cause the split inputs to be moved to a different address
ggml_backend_sched_synchronize(sched);
#ifndef NDEBUG
struct ggml_backend_sched * sched = calloc(1, sizeof(struct ggml_backend_sched));
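+
+    // the GGML_SCHED_DEBUG environment variable enables printing the backend
+    // assignments after the graph is split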
+ sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
+
// initialize hash table
sched->hash_set = ggml_hash_set_new(graph_size);
sched->tensor_backend_id = calloc(sched->hash_set.size, sizeof(sched->tensor_backend_id[0]));
const size_t nodes_size = graph_size + GGML_SCHED_MAX_SPLITS*GGML_SCHED_MAX_SPLIT_INPUTS*2;
sched->node_backend_ids = calloc(nodes_size, sizeof(sched->node_backend_ids[0]));
sched->leaf_backend_ids = calloc(nodes_size, sizeof(sched->leaf_backend_ids[0]));
+ sched->prev_node_backend_ids = calloc(nodes_size, sizeof(sched->prev_node_backend_ids[0]));
+ sched->prev_leaf_backend_ids = calloc(nodes_size, sizeof(sched->prev_leaf_backend_ids[0]));
sched->n_backends = n_backends;
for (int b = 0; b < n_backends; b++) {
sched->backends[b] = backends[b];
sched->bufts[b] = bufts ? bufts[b] : ggml_backend_get_default_buffer_type(backends[b]);
- GGML_ASSERT(ggml_backend_buft_supports_backend(sched->bufts[b], backends[b]));
+ GGML_ASSERT(ggml_backend_supports_buft(backends[b], sched->bufts[b]));
if (sched->n_copies > 1) {
for (int c = 0; c < sched->n_copies; c++) {
sched->events[b][c] = ggml_backend_event_new(backends[b]);
free(sched->tensor_copies);
free(sched->node_backend_ids);
free(sched->leaf_backend_ids);
+ free(sched->prev_node_backend_ids);
+ free(sched->prev_leaf_backend_ids);
free(sched);
}
int backend_index = ggml_backend_sched_backend_id(sched, backend);
GGML_ASSERT(backend_index >= 0 && backend_index < sched->n_backends);
tensor_backend_id(node) = backend_index;
+ SET_CAUSE(node, "usr");
}
ggml_backend_t ggml_backend_sched_get_tensor_backend(ggml_backend_sched_t sched, struct ggml_tensor * node) {
return ctx->name.c_str();
}
+static bool ggml_backend_buft_is_cuda(ggml_backend_buffer_type_t buft) {
+ return buft->iface.get_name == ggml_backend_cuda_buffer_type_name;
+}
+
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
GGML_UNUSED(buft);
}
-GGML_CALL static bool ggml_backend_cuda_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- if (!ggml_backend_is_cuda(backend)) {
- return false;
- }
-
- ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
- return buft_ctx->device == cuda_ctx->device;
-}
-
static ggml_backend_buffer_type_i ggml_backend_cuda_buffer_type_interface = {
/* .get_name = */ ggml_backend_cuda_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
- /* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
/* .is_host = */ NULL,
};
GGML_UNUSED(buft);
}
+static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
+ return buft->iface.get_name == ggml_backend_cuda_split_buffer_type_name;
+}
+
GGML_CALL static ggml_backend_buffer_t ggml_backend_cuda_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
// instead, we allocate them for each tensor separately in init_tensor
return total_size;
}
-GGML_CALL static bool ggml_backend_cuda_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- return ggml_backend_is_cuda(backend);
-
- GGML_UNUSED(buft);
-}
-
GGML_CALL static bool ggml_backend_cuda_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return false;
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
- /* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
};
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
- /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
},
/* .context = */ nullptr,
GGML_UNUSED(backend);
}
+GGML_CALL static bool ggml_backend_cuda_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+ if (ggml_backend_buft_is_cuda_split(buft)) {
+ return true;
+ }
+
+ if (ggml_backend_buft_is_cuda(buft)) {
+ ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
+ ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
+ return buft_ctx->device == cuda_ctx->device;
+ }
+
+ return false;
+}
+
GGML_CALL static bool ggml_backend_cuda_offload_op(ggml_backend_t backend, const ggml_tensor * op) {
const int min_batch_size = 32;
/* .synchronize = */ ggml_backend_cuda_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
+ /* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
/* .supports_op = */ ggml_backend_cuda_supports_op,
+ /* .supports_buft = */ ggml_backend_cuda_supports_buft,
/* .offload_op = */ ggml_backend_cuda_offload_op,
/* .event_new = */ ggml_backend_cuda_event_new,
/* .event_free = */ ggml_backend_cuda_event_free,
return ctx->max_alloc;
}
-static bool ggml_backend_kompute_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- GGML_UNUSED(buft);
- return ggml_backend_is_kompute(backend);
-}
-
static ggml_backend_buffer_type_i ggml_backend_kompute_buffer_type_interface = {
/* .get_name = */ ggml_backend_kompute_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_kompute_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_kompute_buffer_type_get_alignment,
/* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
- /* .supports_backend = */ ggml_backend_kompute_buffer_type_supports_backend,
/* .is_host = */ NULL,
};
return ggml_vk_supports_op(op);
}
+static bool ggml_backend_kompute_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+ GGML_UNUSED(backend);
+ return buft->iface.get_name == ggml_backend_kompute_buffer_type_get_name;
+}
+
static struct ggml_backend_i kompute_backend_i = {
/* .get_name = */ ggml_backend_kompute_name,
/* .free = */ ggml_backend_kompute_free,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
+ /* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_kompute_graph_compute,
/* .supports_op = */ ggml_backend_kompute_supports_op,
+ /* .supports_buft = */ ggml_backend_kompute_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
UNUSED(buft);
}
-GGML_CALL static bool ggml_backend_metal_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- return ggml_backend_is_metal(backend) || ggml_backend_is_cpu(backend);
-
- UNUSED(buft);
-}
-
GGML_CALL static bool ggml_backend_metal_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return true;
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
/* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
- /* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_metal_buffer_type_is_host,
},
/* .context = */ NULL,
return ggml_metal_supports_op(metal_ctx, op);
}
+GGML_CALL static bool ggml_backend_metal_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+ return buft->iface.get_name == ggml_backend_metal_buffer_type_get_name;
+
+ UNUSED(backend);
+}
+
static struct ggml_backend_i ggml_backend_metal_i = {
/* .get_name = */ ggml_backend_metal_name,
/* .free = */ ggml_backend_metal_free,
/* .synchronize = */ NULL,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
+ /* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_metal_graph_compute,
/* .supports_op = */ ggml_backend_metal_supports_op,
+ /* .supports_buft = */ ggml_backend_metal_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
return ggml_nbytes(tensor);
}
-GGML_CALL static bool ggml_backend_rpc_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- if (!ggml_backend_is_rpc(backend)) {
- return false;
- }
- ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
- ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
- return buft_ctx->endpoint == rpc_ctx->endpoint;
-}
-
static ggml_backend_buffer_type_i ggml_backend_rpc_buffer_type_interface = {
/* .get_name = */ ggml_backend_rpc_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_rpc_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_rpc_buffer_type_get_alignment,
/* .get_max_size = */ ggml_backend_rpc_get_max_size,
/* .get_alloc_size = */ ggml_backend_rpc_buffer_type_get_alloc_size,
- /* .supports_backend = */ ggml_backend_rpc_buffer_type_supports_backend,
/* .is_host = */ NULL,
};
return false;
}
+GGML_CALL static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+    if (buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
+ return false;
+ }
+ ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
+ ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
+ return buft_ctx->endpoint == rpc_ctx->endpoint;
+}
+
static ggml_backend_i ggml_backend_rpc_interface = {
/* .get_name = */ ggml_backend_rpc_name,
/* .free = */ ggml_backend_rpc_free,
/* .synchronize = */ ggml_backend_rpc_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
+ /* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_rpc_graph_compute,
/* .supports_op = */ ggml_backend_rpc_supports_op,
+ /* .supports_buft = */ ggml_backend_rpc_supports_buft,
/* .offload_op = */ NULL,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
UNUSED(buft);
}
-GGML_CALL static bool ggml_backend_sycl_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- if (!ggml_backend_is_sycl(backend)) {
- return false;
- }
- ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
- ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
- return buft_ctx->device == sycl_ctx->device;
-}
-
static ggml_backend_buffer_type_i ggml_backend_sycl_buffer_type_interface = {
/* .get_name = */ ggml_backend_sycl_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_sycl_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_sycl_buffer_type_get_alignment,
/* .get_max_size = */ ggml_backend_sycl_buffer_type_get_max_size,
/* .get_alloc_size = */ ggml_backend_sycl_buffer_type_get_alloc_size,
- /* .supports_backend = */ ggml_backend_sycl_buffer_type_supports_backend,
/* .is_host = */ nullptr,
};
return total_size;
}
-GGML_CALL static bool ggml_backend_sycl_split_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- return ggml_backend_is_sycl(backend);
-
- UNUSED(buft);
-}
-
GGML_CALL static bool ggml_backend_sycl_split_buffer_type_is_host(ggml_backend_buffer_type_t buft) {
return false;
/* .get_alignment = */ ggml_backend_sycl_split_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_sycl_split_buffer_type_get_alloc_size,
- /* .supports_backend = */ ggml_backend_sycl_split_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_sycl_split_buffer_type_is_host,
};
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
/* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
- /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
},
/* .context = */ nullptr,
GGML_UNUSED(backend);
}
+GGML_CALL static bool ggml_backend_sycl_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+ if (buft->iface.get_name != ggml_backend_sycl_buffer_type_name) {
+ return false;
+ }
+ ggml_backend_sycl_buffer_type_context * buft_ctx = (ggml_backend_sycl_buffer_type_context *)buft->context;
+ ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
+ return buft_ctx->device == sycl_ctx->device;
+}
+
static ggml_backend_i ggml_backend_sycl_interface = {
/* .get_name = */ ggml_backend_sycl_name,
/* .synchronize = */ ggml_backend_sycl_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
+ /* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_sycl_graph_compute,
/* .supports_op = */ ggml_backend_sycl_supports_op,
+ /* .supports_buft = */ ggml_backend_sycl_supports_buft,
/* .offload_op = */ ggml_backend_sycl_offload_op,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
UNUSED(buft);
}
-GGML_CALL static bool ggml_backend_vk_buffer_type_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend) {
- if (!ggml_backend_is_vk(backend)) {
- return false;
- }
-
- ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
- ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
-
- return buft_ctx->ctx->idx == ctx->idx;
-}
-
static ggml_backend_buffer_type_i ggml_backend_vk_buffer_type_interface = {
/* .get_name = */ ggml_backend_vk_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_vk_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_vk_buffer_type_get_alignment,
/* .get_max_size = */ ggml_backend_vk_buffer_type_get_max_size,
/* .get_alloc_size = */ ggml_backend_vk_buffer_type_get_alloc_size,
- /* .supports_backend = */ ggml_backend_vk_buffer_type_supports_backend,
/* .is_host = */ NULL,
};
/* .get_alignment = */ ggml_backend_vk_host_buffer_type_get_alignment,
/* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
- /* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
},
/* .context = */ nullptr,
UNUSED(backend);
}
+GGML_CALL static bool ggml_backend_vk_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
+ if (buft->iface.get_name != ggml_backend_vk_buffer_type_name) {
+ return false;
+ }
+
+ ggml_backend_vk_buffer_type_context * buft_ctx = (ggml_backend_vk_buffer_type_context *)buft->context;
+ ggml_backend_vk_context * ctx = (ggml_backend_vk_context *)backend->context;
+
+ return buft_ctx->ctx->idx == ctx->idx;
+}
+
// TODO: enable async and synchronize
static ggml_backend_i ggml_backend_vk_interface = {
/* .get_name = */ ggml_backend_vk_name,
/* .synchronize = */ NULL, // ggml_backend_vk_synchronize,
/* .graph_plan_create = */ NULL,
/* .graph_plan_free = */ NULL,
+ /* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_vk_graph_compute,
/* .supports_op = */ ggml_backend_vk_supports_op,
+ /* .supports_buft = */ ggml_backend_vk_supports_buft,
/* .offload_op = */ ggml_backend_vk_offload_op,
/* .event_new = */ NULL,
/* .event_free = */ NULL,
#if defined(GGML_USE_ACCELERATE)
#include <Accelerate/Accelerate.h>
-#elif defined(GGML_USE_OPENBLAS)
-#if defined(GGML_BLAS_USE_MKL)
-#include <mkl.h>
-#else
-#include <cblas.h>
-#endif
#endif
// floating point type used to accumulate sums
// ggml_compute_forward_mul_mat
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-// helper function to determine if it is better to use BLAS or not
-// for large matrices, BLAS is faster
-static bool ggml_compute_forward_mul_mat_use_blas(struct ggml_tensor * dst) {
- const struct ggml_tensor * src0 = dst->src[0];
- const struct ggml_tensor * src1 = dst->src[1];
-
- //const int64_t ne00 = src0->ne[0];
- //const int64_t ne01 = src0->ne[1];
-
- const int64_t ne10 = src1->ne[0];
-
- const int64_t ne0 = dst->ne[0];
- const int64_t ne1 = dst->ne[1];
-
- // NOTE: with GGML_OP_MUL_MAT_ID we don't want to go through the BLAS branch because it will dequantize (to_float)
- // all the experts for each batch element and the processing would become incredibly slow
- // TODO: find the optimal values for these
- if (dst->op != GGML_OP_MUL_MAT_ID &&
- ggml_is_contiguous(src0) &&
- ggml_is_contiguous(src1) &&
- //src0->type == GGML_TYPE_F32 &&
- src1->type == GGML_TYPE_F32 &&
- (ne0 >= 32 && ne1 >= 32 && ne10 >= 32)) {
-
- /*printf("BLAS: %d %d %d %d %d\n", ne0, ne1, ne10, ne00, ne01);*/
- return true;
- }
-
- return false;
-}
-#endif
-
static void ggml_compute_forward_mul_mat_one_chunk(
const struct ggml_compute_params * params,
struct ggml_tensor * dst,
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
- if (ggml_compute_forward_mul_mat_use_blas(dst)) {
- const int64_t ne_plane = ne01*ne00;
- const size_t desired_wsize = ne13*ne12*ne_plane*sizeof(float);
- UNUSED(desired_wsize);
-
- if (params->type == GGML_TASK_TYPE_INIT) {
- if (type != GGML_TYPE_F32) {
- assert(params->wsize >= desired_wsize);
- // parallelize by src0 rows
- for (int64_t i13 = 0; i13 < ne13; i13++) {
- for (int64_t i12 = 0; i12 < ne12; i12++) {
- // broadcast src0 into src1 across 2nd,3rd dimension
- const int64_t i03 = i13/r3;
- const int64_t i02 = i12/r2;
-
- const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
- float * const wdata = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
- ggml_to_float_t const to_float = type_traits[type].to_float;
-
- for (int64_t i01 = ith; i01 < ne01; i01 += nth) {
- to_float((const char *) x + i01*nb01, wdata + i01*ne00, ne00);
- }
- }
- }
- }
- return;
- }
-
- if (params->type == GGML_TASK_TYPE_FINALIZE) {
- return;
- }
-
- // perform sgemm, parallelization controlled by blas lib
- if (ith != 0) {
- return;
- }
-
- //const int64_t tgemm0 = ggml_perf_time_us();
- for (int64_t i13 = 0; i13 < ne13; i13++) {
- for (int64_t i12 = 0; i12 < ne12; i12++) {
- const int64_t i03 = i13/r3;
- const int64_t i02 = i12/r2;
-
- const void * x = (char *) src0->data + i02*nb02 + i03*nb03;
- const float * y = (float *) ((char *) src1->data + i12*nb12 + i13*nb13);
- float * d = (float *) ((char *) dst->data + i12*nb2 + i13*nb3);
-
- if (type != GGML_TYPE_F32) {
- x = (float *) params->wdata + i13*ne12*ne_plane + i12*ne_plane;
- }
-
- cblas_sgemm(CblasRowMajor, CblasNoTrans, CblasTrans,
- ne1, ne01, ne10,
- 1.0f, y, ne10,
- x, ne00,
- 0.0f, d, ne01);
- }
- }
- //printf("cblas_sgemm = %.3f ms, %lld flops\n", (ggml_perf_time_us() - tgemm0)/1000.0, ne13*ne12*ne1*ne01*ne10*2);
-
- //printf("CBLAS = %f ms, %d x %d x %d x %d\n", (ggml_perf_time_us() - t0)/1000.0, ne0, ne1, ne2, ne3);
-
- return;
- }
-#endif
-
#if GGML_USE_LLAMAFILE
const bool src1_cont = ggml_is_contiguous(src1);
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
- bool use_blas = ggml_is_matrix(src0) &&
- ggml_is_matrix(src1) &&
- ggml_is_contiguous(src0) &&
- (ggml_is_contiguous(src1) || ggml_is_transposed(src1));
-#endif
-
if (params->type == GGML_TASK_TYPE_INIT) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) // gemm beta will zero dst
- if (use_blas) {
- return;
- }
-#endif
if (ith != 0) {
return;
}
return;
}
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
- if (use_blas) {
- if (params->ith != 0) { // All threads other than the first do no work.
- return;
- }
- // Arguments to ggml_compute_forward_out_prod (expressed as major,minor)
- // src0: (k,n)
- // src1: (k,m)
- // dst: (m,n)
- //
- // Arguments to sgemm (see https://github.com/Reference-LAPACK/lapack/blob/master/BLAS/SRC/sgemm.f)
- // Also expressed as (major,minor)
- // a: (m,k): so src1 transposed
- // b: (k,n): so src0
- // c: (m,n)
- //
- // However, if ggml_is_transposed(src1) is true, then
- // src1->data already contains a transposed version, so sgemm mustn't
- // transpose it further.
-
- int n = src0->ne[0];
- int k = src0->ne[1];
- int m = src1->ne[0];
-
- int transposeA, lda;
-
- if (!ggml_is_transposed(src1)) {
- transposeA = CblasTrans;
- lda = m;
- } else {
- transposeA = CblasNoTrans;
- lda = k;
- }
-
- float * a = (float *) ((char *) src1->data);
- float * b = (float *) ((char *) src0->data);
- float * c = (float *) ((char *) dst->data);
-
- cblas_sgemm(CblasRowMajor, transposeA, CblasNoTrans, m, n, k, 1.0, a, lda, b, n, 0.0, c, n);
-
- return;
- }
-#endif
-
// dst[:,:,:,:] = 0
// for i2,i3:
// for i1:
// nb01 >= nb00 - src0 is not transposed
// compute by src0 rows
- // TODO: #if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
-
if (params->type == GGML_TASK_TYPE_INIT) {
if (ith != 0) {
return;
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
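+ // i01 is a row index into src0 gathered from src1; bound-check it in debug builds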
+ assert(i01 >= 0 && i01 < ne01);
+
dequantize_row_q(
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+ assert(i01 >= 0 && i01 < ne01);
+
ggml_fp16_to_fp32_row(
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+ assert(i01 >= 0 && i01 < ne01);
+
ggml_bf16_to_fp32_row(
(const void *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03),
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3), nc);
}
const int64_t i10 = (i - i12*ne11*ne10 - i11*ne10);
const int64_t i01 = *(int32_t *) ((char *) src1->data + i10*nb10 + i11*nb11 + i12*nb12);
+ assert(i01 >= 0 && i01 < ne01);
+
ggml_vec_cpy_f32(nc,
(float *) ((char *) dst->data + i10*nb1 + i11*nb2 + i12*nb3),
(float *) ((char *) src0->data + i01*nb01 + i11*nb02 + i12*nb03));
switch (node->op) {
case GGML_OP_CPY:
case GGML_OP_DUP:
+ case GGML_OP_CONT:
case GGML_OP_ADD:
case GGML_OP_ADD1:
case GGML_OP_ACC:
} break;
case GGML_OP_SCALE:
case GGML_OP_SET:
- case GGML_OP_CONT:
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
sched_yield();
}
- * node_n = atomic_load(&state->shared->node_n);
- if (* node_n != last_node_n) break;
+ *node_n = atomic_load(&state->shared->node_n);
+ if (*node_n != last_node_n) {
+ break;
+ }
+
#if defined(__SSE3__)
// Tell the processor we're spinning; this hint improves spin-wait loop efficiency.
_mm_pause();
static void ggml_graph_compute_thread_sync_task(int * task_phase, struct ggml_compute_state * state, const bool do_yield) {
// wait for other threads to finish
- const int last_task_phase = * task_phase;
+ const int last_task_phase = *task_phase;
while (true) {
if (do_yield) {
sched_yield();
}
- * task_phase = atomic_load(&state->shared->node_task);
- if (* task_phase != last_task_phase) break;
+ *task_phase = atomic_load(&state->shared->node_task);
+ if (*task_phase != last_task_phase) {
+ break;
+ }
+
#if defined(__SSE3__)
// Tell the processor we're spinning; this hint improves spin-wait loop efficiency.
_mm_pause();
{
const enum ggml_type vec_dot_type = type_traits[node->src[0]->type].vec_dot_type;
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS)
- if (ggml_compute_forward_mul_mat_use_blas(node)) {
- if (node->src[0]->type != GGML_TYPE_F32) {
- // here we need memory for fully dequantized matrix from src0
- // take into account that src0 can be broadcasted into src1[2,3]
- cur = ggml_type_size(GGML_TYPE_F32)
- * node->src[0]->ne[0]*node->src[0]->ne[1]
- * node->src[1]->ne[2]*node->src[1]->ne[3];
- }
- } else
-#endif
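+ // reserve work space to convert src1 to the vec_dot type used by src0's kernels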
if (node->src[1]->type != vec_dot_type) {
cur = ggml_row_size(vec_dot_type, ggml_nelements(node->src[1]));
}
}
int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_BLAS) || defined(GGML_USE_CUDA) || defined(GGML_USE_VULKAN) || defined(GGML_USE_SYCL)
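+ // GGML_USE_BLAS is defined whenever the BLAS backend is built (Accelerate, MKL, BLIS or OpenBLAS)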
return 1;
#else
return 0;