From: Jeff Bolz
Date: Wed, 29 Oct 2025 17:10:19 +0000 (-0500)
Subject: Rewrite simple-backend to use sched and ggml_backend_load_all (#1376)
X-Git-Tag: upstream/0.9.4.185~97
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=e02fb860ccbba8967905bceff23b677e88105280;p=pkg%2Fggml%2Fsources%2Fggml

Rewrite simple-backend to use sched and ggml_backend_load_all (#1376)

* Rewrite simple-backend to use sched and ggml_backend_load_all

* address slaren's feedback

* move the storage to the model class
---

diff --git a/examples/simple/simple-backend.cpp b/examples/simple/simple-backend.cpp
index ffbe1ad7..c7292b90 100644
--- a/examples/simple/simple-backend.cpp
+++ b/examples/simple/simple-backend.cpp
@@ -1,23 +1,11 @@
 #include "ggml.h"
-#include "ggml-cpu.h"
-#include "ggml-alloc.h"
 #include "ggml-backend.h"
 
-#ifdef GGML_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
 #include <cassert>
 #include <cmath>
 #include <cstdio>
 #include <cstring>
 #include <fstream>
-#include <map>
-#include <string>
 #include <vector>
 
 static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
@@ -29,111 +17,95 @@ static void ggml_log_callback_default(ggml_log_level level, const char * text, v
 
 // This is a simple model with two tensors a and b
 struct simple_model {
-    struct ggml_tensor * a;
-    struct ggml_tensor * b;
+    struct ggml_tensor * a {};
+    struct ggml_tensor * b {};
 
     // the backend to perform the computation (CPU, CUDA, METAL)
-    ggml_backend_t backend = NULL;
-
-    // the backend buffer to storage the tensors data of a and b
-    ggml_backend_buffer_t buffer;
+    ggml_backend_t backend {};
+    ggml_backend_t cpu_backend {};
+    ggml_backend_sched_t sched {};
 
-    // the context to define the tensor information (dimensions, size, memory address)
-    struct ggml_context * ctx;
+    // storage for the graph and tensors
+    std::vector<uint8_t> buf;
 };
 
-// initialize the tensors of the model in this case two matrices 2x2
-void load_model(simple_model & model, float * a, float * b, int rows_A, int cols_A, int rows_B, int cols_B) {
-    ggml_log_set(ggml_log_callback_default, nullptr);
-
-    // initialize the backend
-#ifdef GGML_USE_CUDA
-    fprintf(stderr, "%s: using CUDA backend\n", __func__);
-    model.backend = ggml_backend_cuda_init(0); // init device 0
-    if (!model.backend) {
-        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-    }
-#endif
+// initialize data of matrices to perform matrix multiplication
+const int rows_A = 4, cols_A = 2;
 
-#ifdef GGML_USE_METAL
-    fprintf(stderr, "%s: using Metal backend\n", __func__);
-    model.backend = ggml_backend_metal_init();
-    if (!model.backend) {
-        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
-    }
-#endif
-
-    // if there aren't GPU Backends fallback to CPU backend
-    if (!model.backend) {
-        model.backend = ggml_backend_cpu_init();
-    }
+float matrix_A[rows_A * cols_A] = {
+    2, 8,
+    5, 1,
+    4, 2,
+    8, 6
+};
 
-    int num_tensors = 2;
+const int rows_B = 3, cols_B = 2;
+/* Transpose([
+   10, 9, 5,
+   5, 9, 4
+]) 2 rows, 3 cols */
+float matrix_B[rows_B * cols_B] = {
+    10, 5,
+    9, 9,
+    5, 4
+};
 
-    struct ggml_init_params params {
-            /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-    };
-
-    // create context
-    model.ctx = ggml_init(params);
+// initialize the tensors of the model in this case two matrices 2x2
+void init_model(simple_model & model) {
+    ggml_log_set(ggml_log_callback_default, nullptr);
 
-    // create tensors
-    model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_A, rows_A);
-    model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_B, rows_B);
+    ggml_backend_load_all();
 
-    // create a backend buffer (backend memory) and alloc the tensors from the context
-    model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend);
+    model.backend = ggml_backend_init_best();
+    model.cpu_backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
 
-    // load data from cpu memory to backend buffer
-    ggml_backend_tensor_set(model.a, a, 0, ggml_nbytes(model.a));
-    ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b));
+    ggml_backend_t backends[2] = { model.backend, model.cpu_backend };
+    model.sched = ggml_backend_sched_new(backends, nullptr, 2, GGML_DEFAULT_GRAPH_SIZE, false, true);
 }
 
 // build the compute graph to perform a matrix multiplication
-struct ggml_cgraph * build_graph(const simple_model& model) {
-    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
-    static std::vector<uint8_t> buf(buf_size);
+struct ggml_cgraph * build_graph(simple_model& model) {
+    size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+    model.buf.resize(buf_size);
 
     struct ggml_init_params params0 = {
         /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf.data(),
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
+        /*.mem_buffer =*/ model.buf.data(),
+        /*.no_alloc   =*/ true, // the tensors will be allocated later
     };
 
-    // create a temporally context to build the graph
-    struct ggml_context * ctx0 = ggml_init(params0);
+    // create a context to build the graph
+    struct ggml_context * ctx = ggml_init(params0);
 
-    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+    struct ggml_cgraph * gf = ggml_new_graph(ctx);
+
+    // create tensors
+    model.a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, cols_A, rows_A);
+    model.b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, cols_B, rows_B);
 
     // result = a*b^T
-    struct ggml_tensor * result = ggml_mul_mat(ctx0, model.a, model.b);
+    struct ggml_tensor * result = ggml_mul_mat(ctx, model.a, model.b);
 
     // build operations nodes
     ggml_build_forward_expand(gf, result);
 
-    // delete the temporally context used to build the graph
-    ggml_free(ctx0);
+    ggml_free(ctx);
+
     return gf;
 }
 
 // compute with backend
-struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t allocr) {
-    // reset the allocator to free all the memory allocated during the previous inference
-
-    struct ggml_cgraph * gf = build_graph(model);
-
-    // allocate tensors
-    ggml_gallocr_alloc_graph(allocr, gf);
-
-    int n_threads = 1; // number of threads to perform some operations with multi-threading
+struct ggml_tensor * compute(simple_model & model, struct ggml_cgraph * gf) {
+    ggml_backend_sched_reset(model.sched);
+    ggml_backend_sched_alloc_graph(model.sched, gf);
 
-    if (ggml_backend_is_cpu(model.backend)) {
-        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
-    }
+    // load data from cpu memory to backend buffer
+    ggml_backend_tensor_set(model.a, matrix_A, 0, ggml_nbytes(model.a));
+    ggml_backend_tensor_set(model.b, matrix_B, 0, ggml_nbytes(model.b));
 
-    ggml_backend_graph_compute(model.backend, gf);
+    // compute the graph
+    ggml_backend_sched_graph_compute(model.sched, gf);
 
     // in this case, the output tensor is the last one in the graph
     return ggml_graph_node(gf, -1);
@@ -142,46 +114,13 @@ struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t allocr)
 int main(void) {
     ggml_time_init();
 
-    // initialize data of matrices to perform matrix multiplication
-    const int rows_A = 4, cols_A = 2;
-
-    float matrix_A[rows_A * cols_A] = {
-        2, 8,
-        5, 1,
-        4, 2,
-        8, 6
-    };
-
-    const int rows_B = 3, cols_B = 2;
-    /* Transpose([
-        10, 9, 5,
-        5, 9, 4
-    ]) 2 rows, 3 cols */
-    float matrix_B[rows_B * cols_B] = {
-        10, 5,
-        9, 9,
-        5, 4
-    };
-
     simple_model model;
-    load_model(model, matrix_A, matrix_B, rows_A, cols_A, rows_B, cols_B);
-
-    // calculate the temporaly memory required to compute
-    ggml_gallocr_t allocr = NULL;
-
-    {
-        allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
-
-        // create the worst case graph for memory usage estimation
-        struct ggml_cgraph * gf = build_graph(model);
-        ggml_gallocr_reserve(allocr, gf);
-        size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);
+    init_model(model);
 
-        fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0);
-    }
+    struct ggml_cgraph * gf = build_graph(model);
 
     // perform computation
-    struct ggml_tensor * result = compute(model, allocr);
+    struct ggml_tensor * result = compute(model, gf);
 
     // create a array to print result
     std::vector<float> out_data(ggml_nelements(result));
@@ -206,14 +145,9 @@ int main(void) {
     }
     printf(" ]\n");
 
-    // release backend memory used for computation
-    ggml_gallocr_free(allocr);
-
-    // free memory
-    ggml_free(model.ctx);
-    // release backend memory and free backend
-    ggml_backend_buffer_free(model.buffer);
+    ggml_backend_sched_free(model.sched);
     ggml_backend_free(model.backend);
+    ggml_backend_free(model.cpu_backend);
 
     return 0;
 }
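
For readers following the API change: the main behavioral subtlety is that ggml_backend_tensor_set now runs inside compute(), after ggml_backend_sched_alloc_graph, rather than at model-load time. The tensors are created in a no_alloc context, so they have no backing memory until the scheduler allocates the graph; uploading before that point would write into nothing. The per-inference ordering the patch establishes is therefore fixed (a summary of the code above, not additional API):

    ggml_backend_sched_reset(model.sched);            // drop the previous graph's backend assignments and allocations
    ggml_backend_sched_alloc_graph(model.sched, gf);  // give every tensor in gf a backend buffer
    ggml_backend_tensor_set(model.a, matrix_A, 0, ggml_nbytes(model.a));  // only now is the upload valid
    ggml_backend_tensor_set(model.b, matrix_B, 0, ggml_nbytes(model.b));
    ggml_backend_sched_graph_compute(model.sched, gf);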
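
It is also easy to get turned around on the ggml_mul_mat convention here: both operands must share ne[0] (the inner dimension), which is why matrix_B stores the transpose of the logical 2x3 matrix B. As a quick hand-check of the data above (a standalone sketch, not part of the patch), the plain row-major product A*B is:

    #include <cstdio>

    // Computes A (4x2) times B (2x3) with plain loops so the output of
    // simple-backend can be checked by eye.
    int main() {
        const int M = 4, K = 2, N = 3;
        const float A[4][2] = { {2, 8}, {5, 1}, {4, 2}, {8, 6} };
        const float B[2][3] = { {10, 9, 5}, {5, 9, 4} };  // B before transposition

        for (int i = 0; i < M; i++) {
            for (int j = 0; j < N; j++) {
                float acc = 0.0f;
                for (int k = 0; k < K; k++) {
                    acc += A[i][k] * B[k][j];
                }
                printf("%8.2f", acc);
            }
            printf("\n");
        }
        return 0;
    }

This prints 60 90 42 / 55 54 29 / 50 54 28 / 110 126 64. The example's own printout (unchanged by this patch) shows the same values in transposed order, since it walks the result tensor in ggml's memory layout, where ne[0] = rows_A is the innermost dimension.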