#include "ggml.h"
-#include "ggml-cpu.h"
-#include "ggml-alloc.h"
#include "ggml-backend.h"
-#ifdef GGML_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
#include <cassert>
#include <cmath>
#include <cstdio>
#include <cstring>
#include <fstream>
-#include <map>
-#include <string>
#include <vector>
static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
// This is a simple model with two tensors a and b
struct simple_model {
- struct ggml_tensor * a;
- struct ggml_tensor * b;
+ struct ggml_tensor * a {};
+ struct ggml_tensor * b {};
// the backend to perform the computation (CPU, CUDA, METAL)
- ggml_backend_t backend = NULL;
-
- // the backend buffer to storage the tensors data of a and b
- ggml_backend_buffer_t buffer;
+ ggml_backend_t backend {};
+ ggml_backend_t cpu_backend {};
+ ggml_backend_sched_t sched {};
- // the context to define the tensor information (dimensions, size, memory address)
- struct ggml_context * ctx;
+ // host memory that backs the ggml context (graph structure and tensor metadata)
+ std::vector<uint8_t> buf;
};
-// initialize the tensors of the model in this case two matrices 2x2
-void load_model(simple_model & model, float * a, float * b, int rows_A, int cols_A, int rows_B, int cols_B) {
- ggml_log_set(ggml_log_callback_default, nullptr);
-
- // initialize the backend
-#ifdef GGML_USE_CUDA
- fprintf(stderr, "%s: using CUDA backend\n", __func__);
- model.backend = ggml_backend_cuda_init(0); // init device 0
- if (!model.backend) {
- fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
- }
-#endif
+// data for the matrices used in the matrix multiplication
+const int rows_A = 4, cols_A = 2;
-#ifdef GGML_USE_METAL
- fprintf(stderr, "%s: using Metal backend\n", __func__);
- model.backend = ggml_backend_metal_init();
- if (!model.backend) {
- fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
- }
-#endif
-
- // if there aren't GPU Backends fallback to CPU backend
- if (!model.backend) {
- model.backend = ggml_backend_cpu_init();
- }
+float matrix_A[rows_A * cols_A] = {
+ 2, 8,
+ 5, 1,
+ 4, 2,
+ 8, 6
+};
- int num_tensors = 2;
+const int rows_B = 3, cols_B = 2;
+/* matrix_B is stored as the transpose of
+ [ 10, 9, 5,
+   5, 9, 4 ]  (2 rows, 3 cols) */
+float matrix_B[rows_B * cols_B] = {
+ 10, 5,
+ 9, 9,
+ 5, 4
+};
- struct ggml_init_params params {
- /*.mem_size =*/ ggml_tensor_overhead() * num_tensors,
- /*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ true,
- };
- // create context
- model.ctx = ggml_init(params);
+// initialize the model: load the backends and create the scheduler
+void init_model(simple_model & model) {
+ ggml_log_set(ggml_log_callback_default, nullptr);
- // create tensors
- model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_A, rows_A);
- model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_B, rows_B);
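+ // register/load all available backends (CPU, CUDA, Metal, ...), whether built in or shipped as dynamic libraries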
+ ggml_backend_load_all();
- // create a backend buffer (backend memory) and alloc the tensors from the context
- model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend);
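+ // pick the best available device (a GPU if present, otherwise the CPU), plus an explicit CPU backend as fallback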
+ model.backend = ggml_backend_init_best();
+ model.cpu_backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
- // load data from cpu memory to backend buffer
- ggml_backend_tensor_set(model.a, a, 0, ggml_nbytes(model.a));
- ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b));
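+ // the scheduler splits the graph across the given backends; operations the
+ // primary backend does not support are run on the CPU backend instead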
+ ggml_backend_t backends[2] = { model.backend, model.cpu_backend };
+ model.sched = ggml_backend_sched_new(backends, nullptr, 2, GGML_DEFAULT_GRAPH_SIZE, false, true);
}
// build the compute graph to perform a matrix multiplication
-struct ggml_cgraph * build_graph(const simple_model& model) {
- static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
- static std::vector<uint8_t> buf(buf_size);
+struct ggml_cgraph * build_graph(simple_model& model) {
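+ // enough host memory for the graph structure and up to GGML_DEFAULT_GRAPH_SIZE tensor structs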
+ size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+ model.buf.resize(buf_size);
struct ggml_init_params params0 = {
/*.mem_size =*/ buf_size,
- /*.mem_buffer =*/ buf.data(),
- /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
+ /*.mem_buffer =*/ model.buf.data(),
+ /*.no_alloc =*/ true, // the tensors will be allocated later by ggml_backend_sched_alloc_graph()
};
- // create a temporally context to build the graph
- struct ggml_context * ctx0 = ggml_init(params0);
+ // create a context to build the graph
+ struct ggml_context * ctx = ggml_init(params0);
- struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+ struct ggml_cgraph * gf = ggml_new_graph(ctx);
+
+ // create tensors
+ model.a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, cols_A, rows_A);
+ model.b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, cols_B, rows_B);
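+ // note: ggml orders dimensions as ne[0] = columns (the contiguous dimension), ne[1] = rows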
// result = a*b^T
- struct ggml_tensor * result = ggml_mul_mat(ctx0, model.a, model.b);
+ struct ggml_tensor * result = ggml_mul_mat(ctx, model.a, model.b);
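+ // ggml_mul_mat requires a->ne[0] == b->ne[0]; here the shared inner dimension is cols_A == cols_B == 2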
// build operations nodes
ggml_build_forward_expand(gf, result);
- // delete the temporally context used to build the graph
- ggml_free(ctx0);
+ ggml_free(ctx);
+
return gf;
}
// compute with backend
-struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t allocr) {
- // reset the allocator to free all the memory allocated during the previous inference
-
- struct ggml_cgraph * gf = build_graph(model);
-
- // allocate tensors
- ggml_gallocr_alloc_graph(allocr, gf);
-
- int n_threads = 1; // number of threads to perform some operations with multi-threading
+struct ggml_tensor * compute(simple_model & model, struct ggml_cgraph * gf) {
+ ggml_backend_sched_reset(model.sched);
+ ggml_backend_sched_alloc_graph(model.sched, gf);
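+ // alloc_graph assigns every node to a backend and allocates the tensor
+ // buffers; the tensors have no backing memory before this point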
- if (ggml_backend_is_cpu(model.backend)) {
- ggml_backend_cpu_set_n_threads(model.backend, n_threads);
- }
+ // copy the input data from host (CPU) memory into the backend buffers
+ ggml_backend_tensor_set(model.a, matrix_A, 0, ggml_nbytes(model.a));
+ ggml_backend_tensor_set(model.b, matrix_B, 0, ggml_nbytes(model.b));
- ggml_backend_graph_compute(model.backend, gf);
+ // compute the graph
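+ // (each split runs on its assigned backend; the scheduler copies tensors between backends as needed)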
+ ggml_backend_sched_graph_compute(model.sched, gf);
// in this case, the output tensor is the last one in the graph
return ggml_graph_node(gf, -1);
int main(void) {
ggml_time_init();
- // initialize data of matrices to perform matrix multiplication
- const int rows_A = 4, cols_A = 2;
-
- float matrix_A[rows_A * cols_A] = {
- 2, 8,
- 5, 1,
- 4, 2,
- 8, 6
- };
-
- const int rows_B = 3, cols_B = 2;
- /* Transpose([
- 10, 9, 5,
- 5, 9, 4
- ]) 2 rows, 3 cols */
- float matrix_B[rows_B * cols_B] = {
- 10, 5,
- 9, 9,
- 5, 4
- };
-
simple_model model;
- load_model(model, matrix_A, matrix_B, rows_A, cols_A, rows_B, cols_B);
-
- // calculate the temporaly memory required to compute
- ggml_gallocr_t allocr = NULL;
-
- {
- allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
-
- // create the worst case graph for memory usage estimation
- struct ggml_cgraph * gf = build_graph(model);
- ggml_gallocr_reserve(allocr, gf);
- size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);
+ init_model(model);
- fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0);
- }
+ struct ggml_cgraph * gf = build_graph(model);
// perform computation
- struct ggml_tensor * result = compute(model, allocr);
+ struct ggml_tensor * result = compute(model, gf);
// create an array to hold the result
std::vector<float> out_data(ggml_nelements(result));
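+ // the result may live in device memory; ggml_backend_tensor_get copies it back to out_data before printing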
}
printf(" ]\n");
- // release backend memory used for computation
- ggml_gallocr_free(allocr);
-
- // free memory
- ggml_free(model.ctx);
-
// free the scheduler, then release the backends
- ggml_backend_buffer_free(model.buffer);
+ ggml_backend_sched_free(model.sched);
ggml_backend_free(model.backend);
+ ggml_backend_free(model.cpu_backend);
return 0;
}