Rewrite simple-backend to use sched and ggml_backend_load_all (#1376)

author Jeff Bolz <redacted>

Wed, 29 Oct 2025 17:10:19 +0000 (12:10 -0500)

committer GitHub <redacted>

Wed, 29 Oct 2025 17:10:19 +0000 (18:10 +0100)
author Jeff Bolz <redacted>
Wed, 29 Oct 2025 17:10:19 +0000 (12:10 -0500)
committer GitHub <redacted>
Wed, 29 Oct 2025 17:10:19 +0000 (18:10 +0100)
diff --git a/examples/simple/simple-backend.cpp b/examples/simple/simple-backend.cpp

index ffbe1ad771a6a2a818abc89eaa47ff6d3edbb00f..c7292b90177b949e65f90d62b8437030b37d1400 100644 (file)
--- a/examples/simple/simple-backend.cpp
+++ b/examples/simple/simple-backend.cpp
@@ -1,23 +1,11 @@
  #include "ggml.h"
-#include "ggml-cpu.h"
-#include "ggml-alloc.h"
  #include "ggml-backend.h"
  
-#ifdef GGML_USE_CUDA
-#include "ggml-cuda.h"
-#endif
-
-#ifdef GGML_USE_METAL
-#include "ggml-metal.h"
-#endif
-
  #include <cassert>
  #include <cmath>
  #include <cstdio>
  #include <cstring>
  #include <fstream>
-#include <map>
-#include <string>
  #include <vector>
  
  static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
@@ -29,111 +17,95 @@ static void ggml_log_callback_default(ggml_log_level level, const char * text, v
  
  // This is a simple model with two tensors a and b
  struct simple_model {
-    struct ggml_tensor * a;
-    struct ggml_tensor * b;
+    struct ggml_tensor * a {};
+    struct ggml_tensor * b {};
  
      // the backend to perform the computation (CPU, CUDA, METAL)
-    ggml_backend_t backend = NULL;
-
-    // the backend buffer to storage the tensors data of a and b
-    ggml_backend_buffer_t buffer;
+    ggml_backend_t backend {};
+    ggml_backend_t cpu_backend {};
+    ggml_backend_sched_t sched {};
  
-    // the context to define the tensor information (dimensions, size, memory address)
-    struct ggml_context * ctx;
+    // storage for the graph and tensor metadata (no tensor data is allocated here)
+    std::vector<uint8_t> buf;
  };
  
-// initialize the tensors of the model in this case two matrices 2x2
-void load_model(simple_model & model, float * a, float * b, int rows_A, int cols_A, int rows_B, int cols_B) {
-    ggml_log_set(ggml_log_callback_default, nullptr);
-
-    // initialize the backend
-#ifdef GGML_USE_CUDA
-    fprintf(stderr, "%s: using CUDA backend\n", __func__);
-    model.backend = ggml_backend_cuda_init(0); // init device 0
-    if (!model.backend) {
-        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
-    }
-#endif
+// initialize data of matrices to perform matrix multiplication
+const int rows_A = 4, cols_A = 2;
  
-#ifdef GGML_USE_METAL
-    fprintf(stderr, "%s: using Metal backend\n", __func__);
-    model.backend = ggml_backend_metal_init();
-    if (!model.backend) {
-        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
-    }
-#endif
-
-    // if there aren't GPU Backends fallback to CPU backend
-    if (!model.backend) {
-        model.backend = ggml_backend_cpu_init();
-    }
+float matrix_A[rows_A * cols_A] = {
+    2, 8,
+    5, 1,
+    4, 2,
+    8, 6
+};
  
-    int num_tensors = 2;
+const int rows_B = 3, cols_B = 2;
+/* Transpose([
+    10, 9, 5,
+    5, 9, 4
+]) 2 rows, 3 cols */
+float matrix_B[rows_B * cols_B] = {
+    10, 5,
+    9, 9,
+    5, 4
+};
  
-    struct ggml_init_params params {
-            /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
-            /*.mem_buffer =*/ NULL,
-            /*.no_alloc   =*/ true,
-    };
  
-    // create context
-    model.ctx = ggml_init(params);
+// load the available backends and create the scheduler used to run the graph
+void init_model(simple_model & model) {
+    ggml_log_set(ggml_log_callback_default, nullptr);
  
-    // create tensors
-    model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_A, rows_A);
-    model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_B, rows_B);
+    ggml_backend_load_all();
  
-    // create a backend buffer (backend memory) and alloc the tensors from the context
-    model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend);
+    model.backend = ggml_backend_init_best();
+    model.cpu_backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, nullptr);
  
-    // load data from cpu memory to backend buffer
-    ggml_backend_tensor_set(model.a, a, 0, ggml_nbytes(model.a));
-    ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b));
+    ggml_backend_t backends[2] = { model.backend, model.cpu_backend };
+    model.sched = ggml_backend_sched_new(backends, nullptr, 2, GGML_DEFAULT_GRAPH_SIZE, false, true);
  }
  
  // build the compute graph to perform a matrix multiplication
-struct ggml_cgraph * build_graph(const simple_model& model) {
-    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
-    static std::vector<uint8_t> buf(buf_size);
+struct ggml_cgraph * build_graph(simple_model& model) {
+    size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+    model.buf.resize(buf_size);
  
      struct ggml_init_params params0 = {
          /*.mem_size   =*/ buf_size,
-        /*.mem_buffer =*/ buf.data(),
-        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
+        /*.mem_buffer =*/ model.buf.data(),
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_backend_sched_alloc_graph()
      };
  
-    // create a temporally context to build the graph
-    struct ggml_context * ctx0 = ggml_init(params0);
+    // create a context to build the graph
+    struct ggml_context * ctx = ggml_init(params0);
  
-    struct ggml_cgraph  * gf = ggml_new_graph(ctx0);
+    struct ggml_cgraph  * gf = ggml_new_graph(ctx);
+
+    // create tensors
+    model.a = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, cols_A, rows_A);
+    model.b = ggml_new_tensor_2d(ctx, GGML_TYPE_F32, cols_B, rows_B);
  
      // result = a*b^T
-    struct ggml_tensor * result = ggml_mul_mat(ctx0, model.a, model.b);
+    struct ggml_tensor * result = ggml_mul_mat(ctx, model.a, model.b);
  
      // build operations nodes
      ggml_build_forward_expand(gf, result);
  
-    // delete the temporally context used to build the graph
-    ggml_free(ctx0);
+    ggml_free(ctx);
+
      return gf;
  }
  
  // compute with backend
-struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t allocr) {
-    // reset the allocator to free all the memory allocated during the previous inference
-
-    struct ggml_cgraph * gf = build_graph(model);
-
-    // allocate tensors
-    ggml_gallocr_alloc_graph(allocr, gf);
-
-    int n_threads = 1; // number of threads to perform some operations with multi-threading
+struct ggml_tensor * compute(simple_model & model, struct ggml_cgraph * gf) {
+    ggml_backend_sched_reset(model.sched);
+    ggml_backend_sched_alloc_graph(model.sched, gf);
  
-    if (ggml_backend_is_cpu(model.backend)) {
-        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
-    }
+    // load data from cpu memory to backend buffer
+    ggml_backend_tensor_set(model.a, matrix_A, 0, ggml_nbytes(model.a));
+    ggml_backend_tensor_set(model.b, matrix_B, 0, ggml_nbytes(model.b));
  
-    ggml_backend_graph_compute(model.backend, gf);
+    // compute the graph
+    ggml_backend_sched_graph_compute(model.sched, gf);
  
      // in this case, the output tensor is the last one in the graph
      return ggml_graph_node(gf, -1);
@@ -142,46 +114,13 @@ struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t allocr)
  int main(void) {
      ggml_time_init();
  
-    // initialize data of matrices to perform matrix multiplication
-    const int rows_A = 4, cols_A = 2;
-
-    float matrix_A[rows_A * cols_A] = {
-        2, 8,
-        5, 1,
-        4, 2,
-        8, 6
-    };
-
-    const int rows_B = 3, cols_B = 2;
-    /* Transpose([
-        10, 9, 5,
-        5, 9, 4
-    ]) 2 rows, 3 cols */
-    float matrix_B[rows_B * cols_B] = {
-        10, 5,
-        9, 9,
-        5, 4
-    };
-
      simple_model model;
-    load_model(model, matrix_A, matrix_B, rows_A, cols_A, rows_B, cols_B);
-
-    // calculate the temporaly memory required to compute
-    ggml_gallocr_t allocr = NULL;
-
-    {
-        allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
-
-        // create the worst case graph for memory usage estimation
-        struct ggml_cgraph * gf = build_graph(model);
-        ggml_gallocr_reserve(allocr, gf);
-        size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);
+    init_model(model);
  
-        fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0);
-    }
+    struct ggml_cgraph * gf = build_graph(model);
  
      // perform computation
-    struct ggml_tensor * result = compute(model, allocr);
+    struct ggml_tensor * result = compute(model, gf);
  
      // create a array to print result
      std::vector<float> out_data(ggml_nelements(result));
@@ -206,14 +145,9 @@ int main(void) {
      }
      printf(" ]\n");
  
-    // release backend memory used for computation
-    ggml_gallocr_free(allocr);
-
-    // free memory
-    ggml_free(model.ctx);
-
      // release backend memory and free backend
-    ggml_backend_buffer_free(model.buffer);
+    ggml_backend_sched_free(model.sched);
      ggml_backend_free(model.backend);
+    ggml_backend_free(model.cpu_backend);
      return 0;
  }
author	Jeff Bolz <redacted>
	Wed, 29 Oct 2025 17:10:19 +0000 (12:10 -0500)
committer	GitHub <redacted>
	Wed, 29 Oct 2025 17:10:19 +0000 (18:10 +0100)