From: Steward Garcia
Date: Wed, 28 Feb 2024 16:40:12 +0000 (-0500)
Subject: ggml : add simple example (#713)
X-Git-Tag: upstream/0.0.1642~897
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=d9cd6b57099711eabeeda2eee8a116c64b7542dd;p=pkg%2Fggml%2Fsources%2Fggml

ggml : add simple example (#713)

* add simple example to explain memory management and basic operations of ggml
---

diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index d3bf460b..66682161 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -24,4 +24,5 @@ add_subdirectory(whisper)
 add_subdirectory(mnist)
 add_subdirectory(sam)
 add_subdirectory(yolo)
+add_subdirectory(simple)
 add_subdirectory(magika)
diff --git a/examples/simple/CMakeLists.txt b/examples/simple/CMakeLists.txt
new file mode 100644
index 00000000..4fbdf0f5
--- /dev/null
+++ b/examples/simple/CMakeLists.txt
@@ -0,0 +1,21 @@
+#
+# simple-ctx
+
+set(TEST_TARGET simple-ctx)
+add_executable(${TEST_TARGET} simple-ctx.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml)
+
+#
+# simple-backend
+
+set(TEST_TARGET simple-backend)
+add_executable(${TEST_TARGET} simple-backend.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml)
+
+if (GGML_CUBLAS)
+    add_compile_definitions(GGML_USE_CUBLAS)
+endif()
+
+if (GGML_METAL)
+    add_compile_definitions(GGML_USE_METAL)
+endif()
diff --git a/examples/simple/README.md b/examples/simple/README.md
new file mode 100644
index 00000000..ba3a4fd1
--- /dev/null
+++ b/examples/simple/README.md
@@ -0,0 +1,25 @@
+## Simple
+
+This example performs a matrix multiplication, solely to demonstrate basic usage of ggml and backend handling. The code is commented to help understand what each part does.
+
+$$
+\begin{bmatrix}
+2 & 8 \\
+5 & 1 \\
+4 & 2 \\
+8 & 6 \\
+\end{bmatrix}
+\times
+\begin{bmatrix}
+10 & 9 & 5 \\
+5 & 9 & 4 \\
+\end{bmatrix}
+=
+\begin{bmatrix}
+60 & 90 & 42 \\
+55 & 54 & 29 \\
+50 & 54 & 28 \\
+110 & 126 & 64 \\
+\end{bmatrix}
+$$
+
+`simple-ctx` runs only on the CPU; `simple-backend` demonstrates how to use GPU backends such as CUDA and Metal.
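+
+As a quick check of the values above, the entry in the first row and first column of the result is $2 \cdot 10 + 8 \cdot 5 = 60$.
+
+Note that both programs print the transposed result, i.e. a $3 \times 4$ matrix.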
diff --git a/examples/simple/simple-backend.cpp b/examples/simple/simple-backend.cpp
new file mode 100644
index 00000000..989de7a6
--- /dev/null
+++ b/examples/simple/simple-backend.cpp
@@ -0,0 +1,216 @@
+#include "ggml.h"
+#include "ggml/ggml-alloc.h"
+#include "ggml/ggml-backend.h"
+
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+
+// This is a simple model with two tensors a and b
+struct simple_model {
+    struct ggml_tensor * a;
+    struct ggml_tensor * b;
+
+    // the backend to perform the computation (CPU, CUDA, METAL)
+    ggml_backend_t backend = NULL;
+
+    // the backend buffer to store the data of tensors a and b
+    ggml_backend_buffer_t buffer;
+
+    // the context to define the tensor information (dimensions, size, memory address)
+    struct ggml_context * ctx;
+};
+
+// initialize the tensors of the model, in this case two matrices (4x2 and 3x2)
+void load_model(simple_model & model, float * a, float * b, int rows_A, int cols_A, int rows_B, int cols_B) {
+    // initialize the backend
+#ifdef GGML_USE_CUBLAS
+    fprintf(stderr, "%s: using CUDA backend\n", __func__);
+    model.backend = ggml_backend_cuda_init(0); // init device 0
+    if (!model.backend) {
+        fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+    }
+#endif
+
+#ifdef GGML_USE_METAL
+    fprintf(stderr, "%s: using Metal backend\n", __func__);
+    ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
+    model.backend = ggml_backend_metal_init();
+    if (!model.backend) {
+        fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+    }
+#endif
+
+    // if there is no GPU backend, fall back to the CPU backend
+    if (!model.backend) {
+        model.backend = ggml_backend_cpu_init();
+    }
+
+    int num_tensors = 2;
+
+    struct ggml_init_params params {
+            /*.mem_size   =*/ ggml_tensor_overhead() * num_tensors,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ true,
+    };
+
+    // create context
+    model.ctx = ggml_init(params);
+
+    // create tensors
+    model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_A, rows_A);
+    model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_B, rows_B);
+
+    // create a backend buffer (backend memory) and alloc the tensors from the context
+    model.buffer = ggml_backend_alloc_ctx_tensors(model.ctx, model.backend);
+
+    // load data from cpu memory to backend buffer
+    ggml_backend_tensor_set(model.a, a, 0, ggml_nbytes(model.a));
+    ggml_backend_tensor_set(model.b, b, 0, ggml_nbytes(model.b));
+}
+
+// build the compute graph to perform a matrix multiplication
+struct ggml_cgraph * build_graph(const simple_model& model) {
+    static size_t buf_size = ggml_tensor_overhead()*GGML_DEFAULT_GRAPH_SIZE + ggml_graph_overhead();
+    static std::vector<uint8_t> buf(buf_size);
+
+    struct ggml_init_params params0 = {
+        /*.mem_size   =*/ buf_size,
+        /*.mem_buffer =*/ buf.data(),
+        /*.no_alloc   =*/ true, // the tensors will be allocated later by ggml_gallocr_alloc_graph()
+    };
+
+    // create a temporary context to build the graph
+    struct ggml_context * ctx0 = ggml_init(params0);
+
+    struct ggml_cgraph * gf = ggml_new_graph(ctx0);
+
+    // result = a*b^T
+    struct ggml_tensor * result = ggml_mul_mat(ctx0, model.a, model.b);
+
+    // build operations nodes
+    ggml_build_forward_expand(gf, result);
+
+    // delete the temporary context used to build the graph
+    ggml_free(ctx0);
+    return gf;
+}
+
+// compute with backend
+struct ggml_tensor * compute(const simple_model & model, ggml_gallocr_t allocr) {
+    // reset the allocator to free all the memory allocated during the previous inference
+
+    struct ggml_cgraph * gf = build_graph(model);
+
+    // allocate tensors
+    ggml_gallocr_alloc_graph(allocr, gf);
+
+    int n_threads = 1; // number of threads to perform some operations with multi-threading
+
+    if (ggml_backend_is_cpu(model.backend)) {
+        ggml_backend_cpu_set_n_threads(model.backend, n_threads);
+    }
+
+#ifdef GGML_USE_METAL
+    if (ggml_backend_is_metal(model.backend)) {
+        ggml_backend_metal_set_n_cb(model.backend, n_threads);
+    }
+#endif
+
+    ggml_backend_graph_compute(model.backend, gf);
+
+    // in this case, the output tensor is the last one in the graph
+    return gf->nodes[gf->n_nodes - 1];
+}
+
+int main(void) {
+    ggml_time_init();
+
+    // initialize data of matrices to perform matrix multiplication
+    const int rows_A = 4, cols_A = 2;
+
+    float matrix_A[rows_A * cols_A] = {
+        2, 8,
+        5, 1,
+        4, 2,
+        8, 6
+    };
+
+    const int rows_B = 3, cols_B = 2;
+    /* Transpose([
+        10, 9, 5,
+        5, 9, 4
+       ]) 2 rows, 3 cols */
+    float matrix_B[rows_B * cols_B] = {
+        10, 5,
+        9, 9,
+        5, 4
+    };
+
+    simple_model model;
+    load_model(model, matrix_A, matrix_B, rows_A, cols_A, rows_B, cols_B);
+
+    // calculate the temporary memory required for the computation
+    ggml_gallocr_t allocr = NULL;
+
+    {
+        allocr = ggml_gallocr_new(ggml_backend_get_default_buffer_type(model.backend));
+
+        // create the worst case graph for memory usage estimation
+        struct ggml_cgraph * gf = build_graph(model);
+        ggml_gallocr_reserve(allocr, gf);
+        size_t mem_size = ggml_gallocr_get_buffer_size(allocr, 0);
+
+        fprintf(stderr, "%s: compute buffer size: %.4f KB\n", __func__, mem_size/1024.0);
+    }
+
+    // perform computation
+    struct ggml_tensor * result = compute(model, allocr);
+
+    // create an array to hold the result data
+    std::vector<float> out_data(ggml_nelements(result));
+
+    // copy the data from the backend memory
+    ggml_backend_tensor_get(result, out_data.data(), 0, ggml_nbytes(result));
+
+    // expected result:
+    // [ 60.00 55.00 50.00 110.00
+    //   90.00 54.00 54.00 126.00
+    //   42.00 29.00 28.00 64.00 ]
+
+    printf("mul mat (%d x %d) (transposed result):\n[", (int) result->ne[0], (int) result->ne[1]);
+    for (int j = 0; j < result->ne[1] /* rows */; j++) {
+        if (j > 0) {
+            printf("\n");
+        }
+
+        for (int i = 0; i < result->ne[0] /* cols */; i++) {
+            printf(" %.2f", out_data[j * result->ne[0] + i]);
+        }
+    }
+    printf(" ]\n");
+
+    // release backend memory used for computation
+    ggml_gallocr_free(allocr);
+
+    // free memory
+    ggml_free(model.ctx);
+
+    // release backend memory and free backend
+    ggml_backend_buffer_free(model.buffer);
+    ggml_backend_free(model.backend);
+    return 0;
+}
diff --git a/examples/simple/simple-ctx.cpp b/examples/simple/simple-ctx.cpp
new file mode 100644
index 00000000..e8410010
--- /dev/null
+++ b/examples/simple/simple-ctx.cpp
@@ -0,0 +1,126 @@
+#include "ggml.h"
+
+#include <cassert>
+#include <cmath>
+#include <cstdio>
+#include <cstring>
+#include <fstream>
+#include <map>
+#include <string>
+#include <vector>
+
+// This is a simple model with two tensors a and b
+struct simple_model {
+    struct ggml_tensor * a;
+    struct ggml_tensor * b;
+
+    // the context to define the tensor information (dimensions, size, memory data)
+    struct ggml_context * ctx;
+};
+
+// initialize the tensors of the model, in this case two matrices (4x2 and 3x2)
+void load_model(simple_model & model, float * a, float * b, int rows_A, int cols_A, int rows_B, int cols_B) {
+    size_t ctx_size = 0;
+    {
+        ctx_size += rows_A * cols_A * ggml_type_size(GGML_TYPE_F32); // tensor a
+        ctx_size += rows_B * cols_B * ggml_type_size(GGML_TYPE_F32); // tensor b
+        ctx_size += 2 * ggml_tensor_overhead(); // tensors
+        ctx_size += ggml_graph_overhead(); // compute graph
+        ctx_size += 1024; // some overhead
+    }
+
+    struct ggml_init_params params {
+            /*.mem_size   =*/ ctx_size,
+            /*.mem_buffer =*/ NULL,
+            /*.no_alloc   =*/ false, // NOTE: this should be false when using the legacy API
+    };
+
+    // create context
+    model.ctx = ggml_init(params);
+
+    // create tensors
+    model.a = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_A, rows_A);
+    model.b = ggml_new_tensor_2d(model.ctx, GGML_TYPE_F32, cols_B, rows_B);
+
+    memcpy(model.a->data, a, ggml_nbytes(model.a));
+    memcpy(model.b->data, b, ggml_nbytes(model.b));
+}
+
+// build the compute graph to perform a matrix multiplication
+struct ggml_cgraph * build_graph(const simple_model& model) {
+    struct ggml_cgraph * gf = ggml_new_graph(model.ctx);
+
+    // result = a*b^T
+    struct ggml_tensor * result = ggml_mul_mat(model.ctx, model.a, model.b);
+
+    ggml_build_forward_expand(gf, result);
+    return gf;
+}
+
+// compute the graph using the ggml context (CPU only)
+struct ggml_tensor * compute(const simple_model & model) {
+    struct ggml_cgraph * gf = build_graph(model);
+
+    int n_threads = 1; // number of threads to perform some operations with multi-threading
+
+    ggml_graph_compute_with_ctx(model.ctx, gf, n_threads);
+
+    // in this case, the output tensor is the last one in the graph
+    return gf->nodes[gf->n_nodes - 1];
+}
+
+int main(void) {
+    ggml_time_init();
+
+    // initialize data of matrices to perform matrix multiplication
+    const int rows_A = 4, cols_A = 2;
+
+    float matrix_A[rows_A * cols_A] = {
+        2, 8,
+        5, 1,
+        4, 2,
+        8, 6
+    };
+
+    const int rows_B = 3, cols_B = 2;
+    /* Transpose([
+        10, 9, 5,
+        5, 9, 4
+       ]) 2 rows, 3 cols */
+    float matrix_B[rows_B * cols_B] = {
+        10, 5,
+        9, 9,
+        5, 4
+    };
+
+    simple_model model;
+    load_model(model, matrix_A, matrix_B, rows_A, cols_A, rows_B, cols_B);
+
+    // perform the computation on the CPU
+    struct ggml_tensor * result = compute(model);
+
+    // copy the result data into a float array to print
+    std::vector<float> out_data(ggml_nelements(result));
+    memcpy(out_data.data(), result->data, ggml_nbytes(result));
+
+    // expected result:
+    // [ 60.00 55.00 50.00 110.00
+    //   90.00 54.00 54.00 126.00
+    //   42.00 29.00 28.00 64.00 ]
+
+    printf("mul mat (%d x %d) (transposed result):\n[", (int) result->ne[0], (int) result->ne[1]);
+    for (int j = 0; j < result->ne[1] /* rows */; j++) {
+        if (j > 0) {
+            printf("\n");
+        }
+
+        for (int i = 0; i < result->ne[0] /* cols */; i++) {
+            printf(" %.2f", out_data[j * result->ne[0] + i]);
+        }
+    }
+    printf(" ]\n");
+
+    // free memory
+    ggml_free(model.ctx);
+    return 0;
+}