add_executable(${TEST_TARGET} main.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml common)
+#
+# mnist-cpu
+
+set(TEST_TARGET mnist-cpu)
+add_executable(${TEST_TARGET} main-cpu.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml)
+
+if (APPLE)
+ #
+ # mnist-mtl
+
+ find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+ find_library(METAL_FRAMEWORK Metal REQUIRED)
+ find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
+ find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
+
+ set(TEST_TARGET mnist-mtl)
+ add_executable(${TEST_TARGET} main-mtl.cpp main-mtl.h main-mtl.m)
+ target_link_libraries(${TEST_TARGET} PRIVATE
+ ggml
+ ${FOUNDATION_LIBRARY}
+ ${METAL_FRAMEWORK}
+ ${METALKIT_FRAMEWORK}
+ ${METALPERFORMANCE_FRAMEWORK}
+ )
+endif()
--- /dev/null
+// Use a pre-generated MNIST compute graph for inference on the CPU
+//
+// You can generate a compute graph using the "mnist" tool:
+//
+// $ ./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
+//
+// This command creates the "mnist.ggml" file, which contains the generated compute graph.
+// Now, you can re-use the compute graph with the "mnist-cpu" tool:
+//
+// $ ./bin/mnist-cpu ./models/mnist/mnist.ggml ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
+//
+
+#include "ggml/ggml.h"
+
+#include <algorithm>
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <vector>
+
+// evaluate the MNIST compute graph
+//
+// - fname_cgraph: path to the compute graph
+// - n_threads: number of threads to use
+// - digit: 784 pixel values
+//
+// returns the predicted digit (0 - 9)
+int mnist_eval(
+ const char * fname_cgraph,
+ const int n_threads,
+ std::vector<float> digit
+ ) {
+ // load the compute graph
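+ // (ggml_graph_import() creates the two contexts: ctx_data holds the raw file contents,
+ //  ctx_eval holds the graph tensors - leafs point into the data buffer, nodes get fresh memory)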
+ struct ggml_context * ctx_data = NULL;
+ struct ggml_context * ctx_eval = NULL;
+
+ struct ggml_cgraph gfi = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
+ gfi.n_threads = n_threads;
+
+ // allocate eval context
+ // needed during ggml_graph_compute() to allocate a work tensor
+ static size_t buf_size = gfi.work_size; // TODO
+ static void * buf = malloc(buf_size);
+
+ struct ggml_init_params params = {
+ .mem_size = buf_size,
+ .mem_buffer = buf,
+ .no_alloc = false,
+ };
+
+ struct ggml_context * ctx0 = ggml_init(params);
+
+ struct ggml_tensor * input = ggml_graph_get_tensor(&gfi, "input");
+ memcpy(input->data, digit.data(), ggml_nbytes(input));
+
+ ggml_graph_compute(ctx0, &gfi);
+
+ const float * probs_data = ggml_get_data_f32(ggml_graph_get_tensor(&gfi, "probs"));
+
+ const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data;
+
+ ggml_free(ctx0);
+ ggml_free(ctx_data);
+ ggml_free(ctx_eval);
+
+ return prediction;
+}
+
+int main(int argc, char ** argv) {
+ srand(time(NULL));
+ ggml_time_init();
+
+ if (argc != 3) {
+ fprintf(stderr, "Usage: %s models/mnist/mnist.ggml models/mnist/t10k-images.idx3-ubyte\n", argv[0]);
+ exit(0);
+ }
+
+ uint8_t buf[784];
+ std::vector<float> digit;
+
+ // read a random digit from the test set
+ {
+ std::ifstream fin(argv[2], std::ios::binary);
+ if (!fin) {
+ fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]);
+ return 1;
+ }
+
+ // seek to a random digit: 16-byte header + 28*28 * (random index 0 - 9999)
+ fin.seekg(16 + 784 * (rand() % 10000));
+ fin.read((char *) &buf, sizeof(buf));
+ }
+
+ // render the digit in ASCII
+ {
+ digit.resize(sizeof(buf));
+
+ for (int row = 0; row < 28; row++) {
+ for (int col = 0; col < 28; col++) {
+ fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? '*' : '_');
+ digit[row*28 + col] = ((float)buf[row*28 + col]);
+ }
+
+ fprintf(stderr, "\n");
+ }
+
+ fprintf(stderr, "\n");
+ }
+
+ const int prediction = mnist_eval(argv[1], 1, digit);
+
+ fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction);
+
+ return 0;
+}
--- /dev/null
+// Use a pre-generated MNIST compute graph for inference on the M1 GPU via MPS
+//
+// You can generate a compute graph using the "mnist" tool:
+//
+// $ ./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
+//
+// This command creates the "mnist.ggml" file, which contains the generated compute graph.
+// Now, you can re-use the compute graph on the GPU with the "mnist-mtl" tool:
+//
+// $ ./bin/mnist-mtl ./models/mnist/mnist.ggml ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
+//
+
+#include "ggml/ggml.h"
+
+#include "main-mtl.h"
+
+#include <cmath>
+#include <cstdint>
+#include <cstdio>
+#include <cstdlib>
+#include <cstring>
+#include <ctime>
+#include <fstream>
+#include <vector>
+
+// evaluate the MNIST compute graph
+//
+// - fname_cgraph: path to the compute graph
+// - n_threads: number of threads to use
+// - digit: 784 pixel values
+//
+// returns the predicted digit (0 - 9)
+int mnist_eval(
+ const char * fname_cgraph,
+ const int n_threads,
+ std::vector<float> digit
+ ) {
+ // load the compute graph
+ struct ggml_context * ctx_data = NULL;
+ struct ggml_context * ctx_eval = NULL;
+
+ struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval);
+ gf.n_threads = n_threads;
+
+ // allocate eval context
+ // needed during ggml_graph_compute() to allocate a work tensor
+ static size_t buf_size = gf.work_size; // TODO
+ static void * buf = malloc(buf_size);
+
+ struct ggml_init_params params = {
+ .mem_size = buf_size,
+ .mem_buffer = buf,
+ .no_alloc = false,
+ };
+
+ struct ggml_context * ctx_work = ggml_init(params);
+
+ // this allocates all Metal resources and memory buffers
+ auto ctx_mtl = mnist_mtl_init(ctx_data, ctx_eval, ctx_work, &gf);
+
+ int prediction = -1;
+
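+ // the loop is set up so the evaluation can be repeated, alternating the real digit with a
+ // zeroed input (useful for testing); currently it runs only once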
+ for (int i = 0; i < 1; ++i) {
+ struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "input");
+
+ if (i % 2 == 0) {
+ memcpy(input->data, digit.data(), ggml_nbytes(input));
+ } else {
+ memset(input->data, 0, ggml_nbytes(input));
+ }
+
+ // the actual inference happens here
+ prediction = mnist_mtl_eval(ctx_mtl, &gf);
+ }
+
+ mnist_mtl_free(ctx_mtl);
+
+ ggml_free(ctx_work);
+ ggml_free(ctx_data);
+ ggml_free(ctx_eval);
+
+ return prediction;
+}
+
+int main(int argc, char ** argv) {
+ srand(time(NULL));
+ ggml_time_init();
+
+ if (argc != 3) {
+ fprintf(stderr, "Usage: %s models/mnist/mnist.ggml models/mnist/t10k-images.idx3-ubyte\n", argv[0]);
+ exit(0);
+ }
+
+ uint8_t buf[784];
+ std::vector<float> digit;
+
+ // read a random digit from the test set
+ {
+ std::ifstream fin(argv[2], std::ios::binary);
+ if (!fin) {
+ fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]);
+ return 1;
+ }
+
+ // seek to a random digit: 16-byte header + 28*28 * (random index 0 - 9999)
+ fin.seekg(16 + 784 * (rand() % 10000));
+ fin.read((char *) &buf, sizeof(buf));
+ }
+
+ // render the digit in ASCII
+ {
+ digit.resize(sizeof(buf));
+
+ for (int row = 0; row < 28; row++) {
+ for (int col = 0; col < 28; col++) {
+ fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? '*' : '_');
+ digit[row*28 + col] = ((float)buf[row*28 + col]);
+ }
+
+ fprintf(stderr, "\n");
+ }
+
+ fprintf(stderr, "\n");
+ }
+
+ const int prediction = mnist_eval(argv[1], 1, digit);
+
+ fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction);
+
+ return 0;
+}
--- /dev/null
+#pragma once
+
+struct ggml_context;
+struct ggml_cgraph;
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+struct ggml_mtl_context;
+
+struct ggml_mtl_context * mnist_mtl_init(
+ struct ggml_context * ctx_data,
+ struct ggml_context * ctx_eval,
+ struct ggml_context * ctx_work,
+ struct ggml_cgraph * gf);
+
+void mnist_mtl_free(struct ggml_mtl_context * ctx);
+
+int mnist_mtl_eval(
+ struct ggml_mtl_context * ctx,
+ struct ggml_cgraph * gf);
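+
+// Typical usage, as in main-mtl.cpp (a minimal sketch, assuming the graph was loaded with
+// ggml_graph_import() and ctx_work was created for the work buffer):
+//
+//   struct ggml_mtl_context * ctx_mtl = mnist_mtl_init(ctx_data, ctx_eval, ctx_work, &gf);
+//   const int prediction = mnist_mtl_eval(ctx_mtl, &gf);
+//   mnist_mtl_free(ctx_mtl);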
+
+#ifdef __cplusplus
+}
+#endif
--- /dev/null
+#import "main-mtl.h"
+
+#import "ggml/ggml.h"
+
+#import <Foundation/Foundation.h>
+#import <Metal/Metal.h>
+#import <MetalPerformanceShaders/MetalPerformanceShaders.h>
+
+// TODO: couldn't get this to work
+//#define GGML_MTL_HEAP
+
+struct ggml_mtl_context {
+ struct ggml_context * ctx_data;
+ struct ggml_context * ctx_eval;
+ struct ggml_context * ctx_work;
+
+ id<MTLDevice> device;
+ id<MTLCommandQueue> queue;
+ id<MTLLibrary> library;
+
+#ifdef GGML_MTL_HEAP
+ id<MTLHeap> heap_data;
+ id<MTLHeap> heap_eval;
+#else
+ id<MTLBuffer> buffer_data;
+ id<MTLBuffer> buffer_eval;
+#endif
+
+ id<MTLBuffer> out;
+
+ // custom kernels
+ id<MTLFunction> function_add;
+ id<MTLComputePipelineState> pipeline_add;
+
+ id<MTLFunction> function_relu;
+ id<MTLComputePipelineState> pipeline_relu;
+
+ id<MTLFunction> function_soft_max;
+ id<MTLComputePipelineState> pipeline_soft_max;
+};
+
+// MSL code
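+// (custom element-wise kernels used by mnist_mtl_eval: add, relu and a soft_max over
+//  k_digits values; compiled at runtime from this source string in mnist_mtl_init)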
+NSString * const msl_library_mnist = @"\
+#include <metal_stdlib> \n\
+using namespace metal; \n\
+ \n\
+#define MAX(x, y) ((x) > (y) ? (x) : (y)) \n\
+ \n\
+constant int k_digits [[function_constant(0)]]; \n\
+ \n\
+kernel void kernel_add( \n\
+ device const float * src0, \n\
+ device const float * src1, \n\
+ device float * dst, \n\
+ uint gid[[thread_position_in_grid]]) { \n\
+ dst[gid] = src0[gid] + src1[gid]; \n\
+} \n\
+ \n\
+kernel void kernel_relu( \n\
+ device const float * src, \n\
+ device float * dst, \n\
+ uint gid[[thread_position_in_grid]]) { \n\
+ dst[gid] = max(0.0f, src[gid]); \n\
+} \n\
+ \n\
+kernel void kernel_soft_max( \n\
+ device const float * src, \n\
+ device float * dst, \n\
+ uint gid[[thread_position_in_grid]]) { \n\
+ float max = 0.0f; \n\
+ for (int i = 0; i < k_digits; i++) { \n\
+ max = MAX(max, src[i]); \n\
+ } \n\
+ float sum = 0.0f; \n\
+ for (int i = 0; i < k_digits; i++) { \n\
+ dst[i] = exp(src[i] - max); \n\
+ sum += dst[i]; \n\
+ } \n\
+ for (int i = 0; i < k_digits; i++) { \n\
+ dst[i] /= sum; \n\
+ } \n\
+} \n\
+";
+
+struct ggml_mtl_context * mnist_mtl_init(
+ struct ggml_context * ctx_data,
+ struct ggml_context * ctx_eval,
+ struct ggml_context * ctx_work,
+ struct ggml_cgraph * gf) {
+ fprintf(stderr, "%s: allocating\n", __func__);
+
+ struct ggml_mtl_context * ctx = malloc(sizeof(struct ggml_mtl_context));
+
+ ctx->ctx_data = ctx_data;
+ ctx->ctx_eval = ctx_eval;
+ ctx->ctx_work = ctx_work;
+
+ ctx->device = MTLCreateSystemDefaultDevice();
+ ctx->queue = [ctx->device newCommandQueue];
+
+ // determine if we can use MPS
+ if (MPSSupportsMTLDevice(ctx->device)) {
+ fprintf(stderr, "%s: using MPS\n", __func__);
+ } else {
+ fprintf(stderr, "%s: not using MPS\n", __func__);
+ GGML_ASSERT(false && "MPS not supported");
+ }
+
+ // compile from source string and show compile log
+ {
+ NSError * error = nil;
+ ctx->library = [ctx->device newLibraryWithSource:msl_library_mnist options:nil error:&error];
+ if (error) {
+ fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]);
+ exit(1);
+ }
+ }
+
+ // load kernels
+ {
+ const int k_digits = ggml_graph_get_tensor(gf, "probs")->ne[0];
+
+ MTLFunctionConstantValues * constants = [MTLFunctionConstantValues new];
+ [constants setConstantValue:&k_digits type:MTLDataTypeInt withName:@"k_digits"];
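+ // k_digits (the number of output classes) is baked into kernel_soft_max at
+ // pipeline-creation time via a Metal function constant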
+
+ ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"];
+ ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil];
+ fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, ctx->pipeline_add);
+
+ ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"];
+ ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil];
+ fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, ctx->pipeline_relu);
+
+ ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max" constantValues:constants error:nil];
+ ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil];
+ fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, ctx->pipeline_soft_max);
+ }
+
+#ifdef GGML_MTL_HEAP
+ // MTLHeap approach
+
+ // pin ctx_data memory to GPU
+ // use MTLStorageModeShared to allow us to initialize the weights from the CPU
+ // TODO: how to use MTLStorageModeManaged?
+ // TODO: see if we can avoid this copy somehow
+ {
+ const void * mem_buffer = ggml_get_mem_buffer(ctx_data);
+ const size_t mem_size = ggml_get_mem_size(ctx_data);
+
+ MTLHeapDescriptor * heap_desc = [MTLHeapDescriptor new];
+ heap_desc.storageMode = MTLStorageModeShared;
+ heap_desc.size = mem_size;
+
+ printf("heap_desc.size = %zu\n", mem_size);
+
+ ctx->heap_data = [ctx->device newHeapWithDescriptor:heap_desc];
+ [ctx->heap_data setPurgeableState:MTLPurgeableStateNonVolatile]; // TODO: is this needed?
+ ctx->heap_data.label = @"heap_data";
+
+ printf("ctx->heap_data.size = %zu\n", [ctx->heap_data size]);
+
+ id<MTLBuffer> buffer = [ctx->heap_data newBufferWithLength:mem_size options:MTLResourceStorageModeShared];
+ if (!buffer) {
+ fprintf(stderr, "%s: error: failed to allocate buffer\n", __func__);
+ exit(1);
+ }
+
+ // copy data from CPU to GPU
+ memcpy([buffer contents], mem_buffer, mem_size);
+
+ fprintf(stderr, "%s: allocated data heap, size = %zu\n", __func__, mem_size);
+ }
+
+ // pin ctx_eval memory to GPU
+ // this heap will be used for the intermediate results of the evaluation
+ {
+ const size_t mem_size = ggml_get_mem_size(ctx_eval);
+
+ MTLHeapDescriptor * heap_desc = [MTLHeapDescriptor new];
+ heap_desc.storageMode = MTLStorageModePrivate; // GPU only
+ heap_desc.size = mem_size;
+
+ ctx->heap_eval = [ctx->device newHeapWithDescriptor:heap_desc];
+ [ctx->heap_eval setPurgeableState:MTLPurgeableStateNonVolatile]; // TODO: is this needed?
+
+ fprintf(stderr, "%s: allocated eval heap, size = %zu\n", __func__, mem_size);
+ }
+#else
+ // MTLBuffer approach
+
+ // pin ctx_data memory to GPU
+ // use MTLStorageModeShared to allow us to initialize the weights from the CPU
+ // TODO: how to use MTLStorageModeManaged?
+ // TODO: see if we can avoid this copy somehow
+ {
+ const void * mem_buffer = ggml_get_mem_buffer(ctx_data);
+ const size_t mem_size = ggml_get_mem_size(ctx_data);
+
+ ctx->buffer_data = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared];
+
+ fprintf(stderr, "%s: allocated data buffer, size = %zu\n", __func__, mem_size);
+ }
+
+ // pin ctx_eval memory to GPU
+ // this buffer will be used for the intermediate results of the evaluation
+ {
+ const size_t mem_size = ggml_get_mem_size(ctx_eval);
+
+ ctx->buffer_eval = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModePrivate];
+
+ fprintf(stderr, "%s: allocated eval buffer, size = %zu\n", __func__, mem_size);
+ }
+#endif
+
+ // allocate buffer for result extraction
+ {
+ const size_t mem_size = ggml_nbytes(gf->nodes[gf->n_nodes - 1]);
+
+ ctx->out = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModeShared];
+
+ fprintf(stderr, "%s: allocated out buffer, size = %zu\n", __func__, mem_size);
+ }
+
+ return ctx;
+}
+
+void mnist_mtl_free(struct ggml_mtl_context * ctx) {
+ fprintf(stderr, "%s: deallocating\n", __func__);
+
+ free(ctx);
+}
+
+#ifdef GGML_MTL_HEAP
+
+// make a view of the respective MTL heap
+id<MTLBuffer> mnist_mtl_get_buffer_on_heap(struct ggml_mtl_context * ctx, struct ggml_tensor * t) {
+ const int64_t offs_data = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_data);
+ const int64_t offs_eval = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_eval);
+
+ const bool is_data = (offs_eval < 0) || (offs_data >= 0 && offs_data < offs_eval);
+
+ const size_t t_size = ggml_nbytes(t);
+ const size_t t_offs = is_data ? offs_data : offs_eval;
+
+ id<MTLBuffer> result;
+
+ if (is_data) {
+ fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
+ result = [ctx->heap_data newBufferWithLength:t_size options:MTLResourceStorageModeShared offset:t_offs];
+ } else {
+ fprintf(stderr, "%s: eval tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
+ result = [ctx->heap_eval newBufferWithLength:t_size options:MTLResourceStorageModePrivate offset:t_offs];
+ }
+
+ if (result == nil) {
+ fprintf(stderr, "%s: error: buffer is nil\n", __func__);
+ GGML_ASSERT(false);
+ }
+
+ return result;
+}
+
+#else
+
+// get data / eval buffer + offset
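+// a tensor lives either in ctx_data (weights) or ctx_eval (intermediate results); decide which
+// by comparing its data pointer against the start of each context's memory buffer and return
+// the corresponding MTLBuffer together with the tensor's offset inside it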
+id<MTLBuffer> mnist_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) {
+ const int64_t offs_data = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_data);
+ const int64_t offs_eval = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_eval);
+
+ const bool is_data = (offs_eval < 0) || (offs_data >= 0 && offs_data < offs_eval);
+
+ const size_t t_size = ggml_nbytes(t);
+ const size_t t_offs = is_data ? offs_data : offs_eval;
+
+ id<MTLBuffer> result;
+
+ if (is_data) {
+ fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
+ result = ctx->buffer_data;
+ } else {
+ fprintf(stderr, "%s: eval tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size);
+ result = ctx->buffer_eval;
+ }
+
+ if (result == nil) {
+ fprintf(stderr, "%s: error: buffer is nil\n", __func__);
+ GGML_ASSERT(false);
+ }
+
+ if (offs != nil) {
+ *offs = t_offs;
+ }
+
+ return result;
+}
+
+#endif
+
+int mnist_mtl_eval(
+ struct ggml_mtl_context * ctx,
+ struct ggml_cgraph * gf) {
+ fprintf(stderr, "%s: evaluating\n", __func__);
+
+ id<MTLCommandBuffer> command_buffer = [ctx->queue commandBuffer];
+ id<MTLComputeCommandEncoder> encoder = nil;
+
+ size_t offs_src0;
+ size_t offs_src1;
+ size_t offs_dst;
+
+ // copy the input data to the GPU
+ {
+ struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "input");
+
+ id<MTLBuffer> id_dst = mnist_mtl_get_buffer(ctx, inp, &offs_src0);
+
+ memcpy((char *) id_dst.contents + offs_src0, inp->data, ggml_nbytes(inp));
+ }
+
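+ // encode the graph node by node: ADD, RELU and SOFT_MAX use the custom Metal kernels,
+ // MUL_MAT is dispatched through MPSMatrixMultiplication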
+ for (int i = 0; i < gf->n_nodes; ++i) {
+ fprintf(stderr, "%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
+
+ switch (gf->nodes[i]->op) {
+ case GGML_OP_ADD:
+ {
+ if (encoder == nil) {
+ encoder = [command_buffer computeCommandEncoder];
+ }
+
+ id<MTLBuffer> id_src0 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0);
+ id<MTLBuffer> id_src1 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1);
+ id<MTLBuffer> id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst);
+
+ [encoder setComputePipelineState:ctx->pipeline_add];
+ [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0];
+ [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1];
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:2];
+
+ const int64_t n = ggml_nelements(gf->nodes[i]);
+
+ [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+ } break;
+ case GGML_OP_RELU:
+ {
+ if (encoder == nil) {
+ encoder = [command_buffer computeCommandEncoder];
+ }
+
+ id<MTLBuffer> id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0);
+ id<MTLBuffer> id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst);
+
+ [encoder setComputePipelineState:ctx->pipeline_relu];
+ [encoder setBuffer:id_src offset:offs_src0 atIndex:0];
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+
+ const int64_t n = ggml_nelements(gf->nodes[i]);
+
+ [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+ } break;
+ case GGML_OP_SOFT_MAX:
+ {
+#if 0
+ // NOTE: MPSMatrixSoftMax is not working properly, probably there is a bug
+
+ if (encoder != nil) {
+ [encoder endEncoding];
+ encoder = nil;
+ }
+
+ // use MPSMatrixSoftMax
+ id<MTLBuffer> id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0);
+ id<MTLBuffer> id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst);
+
+ MPSMatrixDescriptor * desc = [MPSMatrixDescriptor
+ matrixDescriptorWithRows:1 columns:gf->nodes[i]->ne[0] rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32];
+
+ MPSMatrix * mat_src = [[MPSMatrix alloc] initWithBuffer:id_src offset:offs_src0 descriptor:desc];
+ MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst descriptor:desc];
+
+ MPSMatrixSoftMax * softmax = [[MPSMatrixSoftMax alloc] initWithDevice:ctx->device];
+
+ [softmax encodeToCommandBuffer:command_buffer inputMatrix:mat_src resultMatrix:mat_dst];
+#else
+ if (encoder == nil) {
+ encoder = [command_buffer computeCommandEncoder];
+ }
+
+ id<MTLBuffer> id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0);
+ id<MTLBuffer> id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst);
+
+ [encoder setComputePipelineState:ctx->pipeline_soft_max];
+ [encoder setBuffer:id_src offset:offs_src0 atIndex:0];
+ [encoder setBuffer:id_dst offset:offs_dst atIndex:1];
+
+ [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)];
+#endif
+ } break;
+ case GGML_OP_MUL_MAT:
+ {
+ if (encoder != nil) {
+ [encoder endEncoding];
+ encoder = nil;
+ }
+
+ // use MPSMatrixMultiplication
+ id<MTLBuffer> id_src0 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0);
+ id<MTLBuffer> id_src1 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1);
+ id<MTLBuffer> id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst);
+
+ const int64_t ncols0 = gf->nodes[i]->src0->ne[0];
+ const int64_t nrows0 = gf->nodes[i]->src0->ne[1];
+
+ const int64_t ncols1 = gf->nodes[i]->src1->ne[0];
+ const int64_t nrows1 = gf->nodes[i]->src1->ne[1];
+
+ const int64_t ncols2 = gf->nodes[i]->ne[0];
+ const int64_t nrows2 = gf->nodes[i]->ne[1];
+
+ GGML_ASSERT(ncols0 == ncols1);
+
+ MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor
+ matrixDescriptorWithRows:nrows0 columns:ncols0 rowBytes:gf->nodes[i]->src0->nb[1] dataType:MPSDataTypeFloat32];
+ MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor
+ matrixDescriptorWithRows:nrows1 columns:ncols1 rowBytes:gf->nodes[i]->src1->nb[1] dataType:MPSDataTypeFloat32];
+ MPSMatrixDescriptor * desc2 = [MPSMatrixDescriptor
+ matrixDescriptorWithRows:nrows2 columns:ncols2 rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32];
+
+ MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0 descriptor:desc0];
+ MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1 descriptor:desc1];
+ MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst descriptor:desc2];
+
+ MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc] initWithDevice:ctx->device
+ transposeLeft:false transposeRight:true resultRows:nrows1 resultColumns:nrows0 interiorColumns:ncols0 alpha:1.0 beta:0.0];
+
+ [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst];
+ } break;
+ default:
+ fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op));
+ GGML_ASSERT(false);
+ return -1;
+ }
+ }
+
+ // extract results from the GPU
+ {
+ if (encoder != nil) {
+ [encoder endEncoding];
+ encoder = nil;
+ }
+
+ struct ggml_tensor * out = gf->nodes[gf->n_nodes - 1];
+
+ id<MTLBuffer> id_src = mnist_mtl_get_buffer(ctx, out, &offs_src0);
+ id<MTLBuffer> id_dst = ctx->out;
+
+ id<MTLBlitCommandEncoder> encoder_blit = [command_buffer blitCommandEncoder];
+ [encoder_blit copyFromBuffer:id_src sourceOffset:offs_src0 toBuffer:id_dst destinationOffset:0 size:ggml_nbytes(out)];
+ [encoder_blit endEncoding];
+ }
+
+ [command_buffer commit];
+ [command_buffer waitUntilCompleted];
+
+ {
+ const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime];
+ fprintf(stderr, "%s: time elapsed = %f\n", __func__, time_elapsed);
+ }
+
+ // select the most probable digit
+ int result = 0;
+ {
+ const float * probs = ctx->out.contents;
+
+ float prob = probs[0];
+
+ for (int i = 0; i < 10; ++i) {
+ fprintf(stderr, "%s: probs[%2d] = %f\n", __func__, i, probs[i]);
+
+ if (probs[i] > prob) {
+ result = i;
+ prob = probs[i];
+ }
+ }
+ }
+
+ return result;
+}
int mnist_eval(
const mnist_model & model,
const int n_threads,
- std::vector<float> digit
+ std::vector<float> digit,
+ const char * fname_cgraph
) {
const auto & hparams = model.hparams;
struct ggml_init_params params = {
.mem_size = buf_size,
.mem_buffer = buf,
+ .no_alloc = false,
};
struct ggml_context * ctx0 = ggml_init(params);
// soft max
ggml_tensor * probs = ggml_soft_max(ctx0, fc2);
+ ggml_set_name(probs, "probs");
- // run the computation
+ // build / export / run the computation graph
ggml_build_forward_expand(&gf, probs);
ggml_graph_compute (ctx0, &gf);
//ggml_graph_print (&gf);
ggml_graph_dump_dot(&gf, NULL, "mnist.dot");
+ if (fname_cgraph) {
+ // export the compute graph for later use
+ // see the "mnist-cpu" example
+ ggml_graph_export(&gf, "mnist.ggml");
+
+ fprintf(stderr, "%s: exported compute graph to '%s'\n", __func__, fname_cgraph);
+ }
+
const float * probs_data = ggml_get_data_f32(probs);
const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data;
extern "C" {
#endif
-int wasm_eval(uint8_t *digitPtr)
-{
+int wasm_eval(uint8_t * digitPtr) {
mnist_model model;
if (!mnist_model_load("models/mnist/ggml-model-f32.bin", model)) {
fprintf(stderr, "error loading model\n");
return -1;
}
std::vector<float> digit(digitPtr, digitPtr + 784);
- int result = mnist_eval(model, 1, digit);
+ int result = mnist_eval(model, 1, digit, nullptr);
ggml_free(model.ctx);
+
return result;
}
-int wasm_random_digit(char *digitPtr)
-{
+int wasm_random_digit(char * digitPtr) {
auto fin = std::ifstream("models/mnist/t10k-images.idx3-ubyte", std::ios::binary);
if (!fin) {
fprintf(stderr, "failed to open digits file\n");
return 0;
}
srand(time(NULL));
+
// Seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000)
fin.seekg(16 + 784 * (rand() % 10000));
fin.read(digitPtr, 784);
+
return 1;
}
fprintf(stderr, "\n");
}
- fprintf(stdout, "%s: predicted digit is %d\n", __func__, mnist_eval(model, 1, digit));
+ const int prediction = mnist_eval(model, 1, digit, "mnist.ggml");
+
+ fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction);
ggml_free(model.ctx);
GGML_API size_t ggml_used_mem(const struct ggml_context * ctx);
- GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch);
+ GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch);
GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc);
+ GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx);
+ GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx);
+
GGML_API struct ggml_tensor * ggml_new_tensor(
struct ggml_context * ctx,
enum ggml_type type,
GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src);
GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src);
+ GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name);
+
GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor);
GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value);
GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value);
GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph);
GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph);
- GGML_API struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name);
+ GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name);
+
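+ // export/import a computation graph to/from a file;
+ // ggml_graph_import() returns the graph and creates two contexts: ctx_data holds the raw
+ // file data (leaf tensor data points into it) and ctx_eval holds the graph tensors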
+ GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname);
+ GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval);
// print info and performance information for the graph
GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph);
ctx->no_alloc = no_alloc;
}
+void * ggml_get_mem_buffer(struct ggml_context * ctx) {
+ return ctx->mem_buffer;
+}
+
+size_t ggml_get_mem_size(struct ggml_context * ctx) {
+ return ctx->mem_size;
+}
+
// IMPORTANT:
// when creating "opt" tensors, always save and load the scratch buffer
// this is an error prone process, but it is necessary to support inplace
return result;
}
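+// find a tensor in a context by name, via a linear scan over all objects allocated in it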
+struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) {
+ struct ggml_object * obj = ctx->objects_begin;
+
+ char * const mem_buffer = ctx->mem_buffer;
+
+ while (obj != NULL) {
+ struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs);
+ if (strcmp(cur->name, name) == 0) {
+ return cur;
+ }
+
+ obj = obj->next;
+ }
+
+ return NULL;
+}
+
////////////////////////////////////////////////////////////////////////////////
// ggml_dup
}
}
-struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name) {
+struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) {
for (int i = 0; i < cgraph->n_leafs; i++) {
struct ggml_tensor * leaf = cgraph->leafs[i];
return NULL;
}
+static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) {
+ const int64_t * ne = tensor->ne;
+ const size_t * nb = tensor->nb;
+
+ fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n",
+ ggml_type_name(tensor->type),
+ ggml_op_name (tensor->op),
+ tensor->n_dims,
+ ne[0], ne[1], ne[2], ne[3],
+ nb[0], nb[1], nb[2], nb[3],
+ tensor->data,
+ tensor->name);
+}
+
+static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) {
+ const int64_t * ne = tensor->ne;
+ const size_t * nb = tensor->nb;
+
+ fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n",
+ arg,
+ ggml_type_name(tensor->type),
+ ggml_op_name (tensor->op),
+ tensor->n_dims,
+ ne[0], ne[1], ne[2], ne[3],
+ nb[0], nb[1], nb[2], nb[3],
+ tensor->n_tasks,
+ tensor->data,
+ tensor->name);
+}
+
+void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) {
+ assert(cgraph->work == NULL);
+ assert(cgraph->work_size == 0);
+
+ uint64_t size_eval = 0;
+
+ // compute size of intermediate results
+ // TODO: does not take into account scratch buffers !!!!
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
+ size_eval += ggml_nbytes(cgraph->nodes[i]);
+ }
+
+ // print
+ {
+ FILE * fout = stdout;
+
+ fprintf(fout, "\n");
+ fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC);
+ fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION);
+ fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs);
+ fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes);
+ fprintf(fout, "%-16s %8llu\n", "eval", size_eval);
+
+ // header
+ fprintf(fout, "\n");
+ fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n",
+ "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME");
+
+ for (int i = 0; i < cgraph->n_leafs; ++i) {
+ ggml_graph_export_leaf(cgraph->leafs[i], fout);
+
+ GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE);
+ GGML_ASSERT(cgraph->leafs[i]->src0 == NULL);
+ GGML_ASSERT(cgraph->leafs[i]->src1 == NULL);
+ }
+
+ // header
+ fprintf(fout, "\n");
+ fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n",
+ "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME");
+
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
+ ggml_graph_export_node(cgraph->nodes[i], "DST", fout);
+
+ if (cgraph->nodes[i]->src0) {
+ ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout);
+ }
+
+ if (cgraph->nodes[i]->src1) {
+ ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout);
+ }
+
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
+ if (cgraph->nodes[i]->opt[j]) {
+ ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout);
+ }
+ }
+
+ fprintf(fout, "\n");
+ }
+
+ fprintf(fout, "\n");
+ }
+
+ // write binary data
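+ // (file layout: header with magic, version, n_leafs, n_nodes and the eval buffer size,
+ //  followed by one record per leaf - type, op, dims, strides, data pointer, name, raw data -
+ //  and one record per node with the same metadata (no raw data) plus its source tensor indices)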
+ {
+ FILE * fout = fopen(fname, "wb");
+
+ if (!fout) {
+ fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+ return;
+ }
+
+ // header
+ {
+ const uint32_t magic = GGML_FILE_MAGIC;
+ const uint32_t version = GGML_FILE_VERSION;
+ const uint32_t n_leafs = cgraph->n_leafs;
+ const uint32_t n_nodes = cgraph->n_nodes;
+
+ fwrite(&magic, sizeof(uint32_t), 1, fout);
+ fwrite(&version, sizeof(uint32_t), 1, fout);
+ fwrite(&n_leafs, sizeof(uint32_t), 1, fout);
+ fwrite(&n_nodes, sizeof(uint32_t), 1, fout);
+ fwrite(&size_eval, sizeof(uint64_t), 1, fout);
+ }
+
+ // leafs
+ {
+ for (int i = 0; i < cgraph->n_leafs; ++i) {
+ const struct ggml_tensor * tensor = cgraph->leafs[i];
+
+ const uint32_t type = tensor->type;
+ const uint32_t op = tensor->op;
+ const uint32_t n_dims = tensor->n_dims;
+
+ fwrite(&type, sizeof(uint32_t), 1, fout);
+ fwrite(&op, sizeof(uint32_t), 1, fout);
+ fwrite(&n_dims, sizeof(uint32_t), 1, fout);
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ const uint64_t ne = tensor->ne[j];
+ const uint64_t nb = tensor->nb[j];
+
+ fwrite(&ne, sizeof(uint64_t), 1, fout);
+ fwrite(&nb, sizeof(uint64_t), 1, fout);
+ }
+
+ // store the pointer address
+ {
+ const uint64_t ptr = (uint64_t) tensor->data;
+
+ fwrite(&ptr, sizeof(uint64_t), 1, fout);
+ }
+
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+
+ // dump the data
+ // TODO: pad this to 32 byte boundary
+ {
+ const size_t size = ggml_nbytes(tensor);
+
+ fwrite(tensor->data, sizeof(char), size, fout);
+ }
+ }
+ }
+
+ // nodes
+ {
+ for (int i = 0; i < cgraph->n_nodes; ++i) {
+ const struct ggml_tensor * tensor = cgraph->nodes[i];
+
+ const uint32_t type = tensor->type;
+ const uint32_t op = tensor->op;
+ const uint32_t n_dims = tensor->n_dims;
+
+ fwrite(&type, sizeof(uint32_t), 1, fout);
+ fwrite(&op, sizeof(uint32_t), 1, fout);
+ fwrite(&n_dims, sizeof(uint32_t), 1, fout);
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ const uint64_t ne = tensor->ne[j];
+ const uint64_t nb = tensor->nb[j];
+
+ fwrite(&ne, sizeof(uint64_t), 1, fout);
+ fwrite(&nb, sizeof(uint64_t), 1, fout);
+ }
+
+ // store the pointer address
+ {
+ const uint64_t ptr = (uint64_t) tensor->data;
+
+ fwrite(&ptr, sizeof(uint64_t), 1, fout);
+ }
+
+ fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout);
+
+ // output the op arguments
+ {
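+ // each argument is written as an int32_t index: a leaf is referenced by its leaf index,
+ // a node by GGML_MAX_NODES + its node index, and -1 marks an unused slot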
+ struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL };
+
+ args[0] = tensor->src0;
+ args[1] = tensor->src1;
+
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
+ args[2 + j] = tensor->opt[j];
+ }
+
+ for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+ if (args[j]) {
+ int32_t idx = -1;
+
+ // check if leaf
+ {
+ for (int k = 0; k < cgraph->n_leafs; ++k) {
+ if (args[j] == cgraph->leafs[k]) {
+ idx = k;
+ break;
+ }
+ }
+ }
+
+ // check if node
+ if (idx == -1) {
+ for (int k = 0; k < cgraph->n_nodes; ++k) {
+ if (args[j] == cgraph->nodes[k]) {
+ idx = GGML_MAX_NODES + k;
+ break;
+ }
+ }
+ }
+
+ if (idx == -1) {
+ fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i);
+ return;
+ }
+
+ fwrite(&idx, sizeof(int32_t), 1, fout);
+ } else {
+ const int32_t nul = -1;
+
+ fwrite(&nul, sizeof(int32_t), 1, fout);
+ }
+ }
+ }
+ }
+ }
+
+ fclose(fout);
+ }
+}
+
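+// note: the whole file is read into a single I8 tensor in a newly created ctx_data; leaf tensors
+// are then created in ctx_eval with their data pointing directly into that buffer, while node
+// tensors are allocated in ctx_eval (sized from the exported eval size)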
+struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) {
+ assert(*ctx_data == NULL);
+ assert(*ctx_eval == NULL);
+
+ struct ggml_cgraph result = { 0 };
+
+ struct ggml_tensor * data = NULL;
+
+ // read file into data
+ {
+ FILE * fin = fopen(fname, "rb");
+
+ if (!fin) {
+ fprintf(stderr, "%s: failed to open %s\n", __func__, fname);
+ return result;
+ }
+
+ size_t fsize = 0;
+
+ fseek(fin, 0, SEEK_END);
+ fsize = ftell(fin);
+ fseek(fin, 0, SEEK_SET);
+
+ // create the data context
+ {
+ const size_t overhead = 1*ggml_tensor_overhead();
+
+ struct ggml_init_params params = {
+ .mem_size = fsize + overhead,
+ .mem_buffer = NULL,
+ .no_alloc = false,
+ };
+
+ *ctx_data = ggml_init(params);
+
+ if (!*ctx_data) {
+ fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+ return result;
+ }
+ }
+
+ data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, fsize);
+
+ fread(data->data, sizeof(char), fsize, fin);
+
+ fclose(fin);
+ }
+
+ // populate result
+ {
+ char * ptr = (char *) data->data;
+
+ const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic);
+
+ if (magic != GGML_FILE_MAGIC) {
+ fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic);
+ return result;
+ }
+
+ const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version);
+
+ if (version != GGML_FILE_VERSION) {
+ fprintf(stderr, "%s: invalid version number\n", __func__);
+ return result;
+ }
+
+ const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs);
+ const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes);
+ const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval);
+
+ result.n_leafs = n_leafs;
+ result.n_nodes = n_nodes;
+
+ // create the data context
+ {
+ const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead();
+
+ struct ggml_init_params params = {
+ .mem_size = size_eval + overhead,
+ .mem_buffer = NULL,
+ .no_alloc = true,
+ };
+
+ *ctx_eval = ggml_init(params);
+
+ if (!*ctx_eval) {
+ fprintf(stderr, "%s: failed to create ggml context\n", __func__);
+ return result;
+ }
+ }
+
+ // leafs
+ {
+ uint32_t type;
+ uint32_t op;
+ uint32_t n_dims;
+
+ for (uint32_t i = 0; i < n_leafs; ++i) {
+ type = *(const uint32_t *) ptr; ptr += sizeof(type);
+ op = *(const uint32_t *) ptr; ptr += sizeof(op);
+ n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+
+ int64_t ne[GGML_MAX_DIMS];
+ size_t nb[GGML_MAX_DIMS];
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ uint64_t ne_cur;
+ uint64_t nb_cur;
+
+ ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
+ nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
+
+ ne[j] = ne_cur;
+ nb[j] = nb_cur;
+ }
+
+ struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+ tensor->op = (enum ggml_op) op;
+
+ uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+
+ memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+
+ tensor->data = (void *) ptr;
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ tensor->nb[j] = nb[j];
+ }
+
+ result.leafs[i] = tensor;
+
+ ptr += ggml_nbytes(tensor);
+
+ fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+ }
+ }
+
+ ggml_set_no_alloc(*ctx_eval, false);
+
+ // nodes
+ {
+ uint32_t type;
+ uint32_t op;
+ uint32_t n_dims;
+
+ for (uint32_t i = 0; i < n_nodes; ++i) {
+ type = *(const uint32_t *) ptr; ptr += sizeof(type);
+ op = *(const uint32_t *) ptr; ptr += sizeof(op);
+ n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims);
+
+ int64_t ne[GGML_MAX_DIMS];
+ size_t nb[GGML_MAX_DIMS];
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ uint64_t ne_cur;
+ uint64_t nb_cur;
+
+ ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur);
+ nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur);
+
+ ne[j] = ne_cur;
+ nb[j] = nb_cur;
+ }
+
+ struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne);
+
+ tensor->op = (enum ggml_op) op;
+
+ uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur);
+
+ memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME;
+
+ for (int j = 0; j < GGML_MAX_DIMS; ++j) {
+ tensor->nb[j] = nb[j];
+ }
+
+ // parse args
+ {
+ struct ggml_tensor ** args[2 + GGML_MAX_OPT] = {
+ &tensor->src0,
+ &tensor->src1,
+ };
+
+ for (int j = 0; j < GGML_MAX_OPT; ++j) {
+ args[2 + j] = &tensor->opt[j];
+ }
+
+ for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) {
+ const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx);
+
+ if (arg_idx == -1) {
+ continue;
+ }
+
+ if (arg_idx < GGML_MAX_NODES) {
+ *args[j] = result.leafs[arg_idx];
+ } else {
+ *args[j] = result.nodes[arg_idx - GGML_MAX_NODES];
+ }
+ }
+ }
+
+ result.nodes[i] = tensor;
+
+ fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor));
+ }
+ }
+ }
+
+ return result;
+}
+
void ggml_graph_print(const struct ggml_cgraph * cgraph) {
int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};