From: Georgi Gerganov
Date: Mon, 29 May 2023 16:28:07 +0000 (+0300)
Subject: ggml : cgraph export/import/eval example + GPU support (#108)
X-Git-Tag: upstream/0.0.1642~1426
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=3b697a2264c5dd132abb3268f6b1091536f3f9ff;p=pkg%2Fggml%2Fsources%2Fggml

ggml : cgraph export/import/eval example + GPU support (#108)

* ggml : cgraph export brainstorming

* mnist : code style

* mnist : minor

* ggml : initial cgraph export

* ggml : initial graph import (wip)

* ggml : import op args correctly

* ggml : add ggml_get_tensor_by_name()

* mnist : add compute graph evaluation on CPU example

* ggml : add ggml_tensor_overhead()

* ggml : rename new functions to ggml_cgraph_...

* mnist : add Metal inference skeleton (WIP)

* mnist : working on the Metal pipeline (WIP)

* mnist : prepare the Metal encoder (WIP)

* mnist : first Metal kernel for F32 ADD

* mnist : looks like MTLHeap does not work

* mnist : initial full pass of MNIST on the GPU (not verified)

* mnist : minor cleanup

* mnist : full GPU inference works

* mnist : use custom soft_max kernel since MPSMatrixSoftMax is bugged

* mnist : use constant for soft_max instead of hardcoded 10

* mnist : check multiple predictions (Metal)

* mnist : minor

* ggml : move cgraph import / export to ggml

* mnist : remove common dependencies

* mnist : fix soft_max threadgroup size

* mnist : init no_alloc member

* ggml : improve "get tensor" API
---

diff --git a/examples/mnist/CMakeLists.txt b/examples/mnist/CMakeLists.txt
index 91b802ae..3ce09249 100644
--- a/examples/mnist/CMakeLists.txt
+++ b/examples/mnist/CMakeLists.txt
@@ -5,3 +5,29 @@ set(TEST_TARGET mnist)
 add_executable(${TEST_TARGET} main.cpp)
 target_link_libraries(${TEST_TARGET} PRIVATE ggml common)
 
+#
+# mnist-cpu
+
+set(TEST_TARGET mnist-cpu)
+add_executable(${TEST_TARGET} main-cpu.cpp)
+target_link_libraries(${TEST_TARGET} PRIVATE ggml)
+
+if (APPLE)
+    #
+    # mnist-mtl
+
+    find_library(FOUNDATION_LIBRARY Foundation REQUIRED)
+    find_library(METAL_FRAMEWORK Metal REQUIRED)
+    find_library(METALKIT_FRAMEWORK MetalKit REQUIRED)
+    find_library(METALPERFORMANCE_FRAMEWORK MetalPerformanceShaders REQUIRED)
+
+    set(TEST_TARGET mnist-mtl)
+    add_executable(${TEST_TARGET} main-mtl.cpp main-mtl.h main-mtl.m)
+    target_link_libraries(${TEST_TARGET} PRIVATE
+        ggml
+        ${FOUNDATION_LIBRARY}
+        ${METAL_FRAMEWORK}
+        ${METALKIT_FRAMEWORK}
+        ${METALPERFORMANCE_FRAMEWORK}
+    )
+endif()
diff --git a/examples/mnist/main-cpu.cpp b/examples/mnist/main-cpu.cpp
new file mode 100644
index 00000000..48e0ae60
--- /dev/null
+++ b/examples/mnist/main-cpu.cpp
@@ -0,0 +1,116 @@
+// Use a pre-generated MNIST compute graph for inference on the CPU
+//
+// You can generate a compute graph using the "mnist" tool:
+//
+//   $ ./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte
+//
+// This command creates the "mnist.ggml" file, which contains the generated compute graph.
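+//
+// The exported graph can be re-loaded and evaluated with nothing but the core
+// ggml API. A minimal sketch of that flow (the tensor names "input" and "probs"
+// are the ones assigned by the exporter in main.cpp; error handling and the
+// work-buffer context ctx_work, set up as in mnist_eval() below, are omitted):
+//
+//   struct ggml_context * ctx_data = NULL;
+//   struct ggml_context * ctx_eval = NULL;
+//
+//   struct ggml_cgraph gf = ggml_graph_import("mnist.ggml", &ctx_data, &ctx_eval);
+//
+//   struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "input");
+//   // ... fill input->data with 28*28 float pixel values ...
+//
+//   ggml_graph_compute(ctx_work, &gf);
+//
+//   const float * probs = ggml_get_data_f32(ggml_graph_get_tensor(&gf, "probs"));
+//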
+// Now, you can re-use the compute graph with the "mnist-cpu" tool: +// +// $ ./bin/mnist-cpu ./models/mnist/mnist.ggml ../examples/mnist/models/mnist/t10k-images.idx3-ubyte +// + +#include "ggml/ggml.h" + +#include +#include +#include +#include +#include +#include + +// evaluate the MNIST compute graph +// +// - fname_cgraph: path to the compute graph +// - n_threads: number of threads to use +// - digit: 784 pixel values +// +// returns 0 - 9 prediction +int mnist_eval( + const char * fname_cgraph, + const int n_threads, + std::vector digit + ) { + // load the compute graph + struct ggml_context * ctx_data = NULL; + struct ggml_context * ctx_eval = NULL; + + struct ggml_cgraph gfi = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); + gfi.n_threads = n_threads; + + // allocate eval context + // needed during ggml_graph_compute() to allocate a work tensor + static size_t buf_size = gfi.work_size; // TODO + static void * buf = malloc(buf_size); + + struct ggml_init_params params = { + .mem_size = buf_size, + .mem_buffer = buf, + .no_alloc = false, + }; + + struct ggml_context * ctx0 = ggml_init(params); + + struct ggml_tensor * input = ggml_graph_get_tensor(&gfi, "input"); + memcpy(input->data, digit.data(), ggml_nbytes(input)); + + ggml_graph_compute(ctx0, &gfi); + + const float * probs_data = ggml_get_data_f32(ggml_graph_get_tensor(&gfi, "probs")); + + const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; + + ggml_free(ctx0); + ggml_free(ctx_data); + ggml_free(ctx_eval); + + return prediction; +} + +int main(int argc, char ** argv) { + srand(time(NULL)); + ggml_time_init(); + + if (argc != 3) { + fprintf(stderr, "Usage: %s models/mnist/mnist.ggml models/mnist/t10k-images.idx3-ubyte\n", argv[0]); + exit(0); + } + + uint8_t buf[784]; + std::vector digit; + + // read a random digit from the test set + { + std::ifstream fin(argv[2], std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]); + return 1; + } + + // seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000) + fin.seekg(16 + 784 * (rand() % 10000)); + fin.read((char *) &buf, sizeof(buf)); + } + + // render the digit in ASCII + { + digit.resize(sizeof(buf)); + + for (int row = 0; row < 28; row++) { + for (int col = 0; col < 28; col++) { + fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? '*' : '_'); + digit[row*28 + col] = ((float)buf[row*28 + col]); + } + + fprintf(stderr, "\n"); + } + + fprintf(stderr, "\n"); + } + + const int prediction = mnist_eval(argv[1], 1, digit); + + fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction); + + return 0; +} diff --git a/examples/mnist/main-mtl.cpp b/examples/mnist/main-mtl.cpp new file mode 100644 index 00000000..fafe8e61 --- /dev/null +++ b/examples/mnist/main-mtl.cpp @@ -0,0 +1,129 @@ +// Use a pre-generated MNIST compute graph for inference on the M1 GPU via MPS +// +// You can generate a compute graph using the "mnist" tool: +// +// $ ./bin/mnist ./models/mnist/ggml-model-f32.bin ../examples/mnist/models/mnist/t10k-images.idx3-ubyte +// +// This command creates the "mnist.ggml" file, which contains the generated compute graph. 
+// Now, you can re-use the compute graph on the GPU with the "mnist-mtl" tool: +// +// $ ./bin/mnist-mtl ./models/mnist/mnist.ggml ../examples/mnist/models/mnist/t10k-images.idx3-ubyte +// + +#include "ggml/ggml.h" + +#include "main-mtl.h" + +#include +#include +#include +#include +#include +#include + +// evaluate the MNIST compute graph +// +// - fname_cgraph: path to the compute graph +// - n_threads: number of threads to use +// - digit: 784 pixel values +// +// returns 0 - 9 prediction +int mnist_eval( + const char * fname_cgraph, + const int n_threads, + std::vector digit + ) { + // load the compute graph + struct ggml_context * ctx_data = NULL; + struct ggml_context * ctx_eval = NULL; + + struct ggml_cgraph gf = ggml_graph_import(fname_cgraph, &ctx_data, &ctx_eval); + gf.n_threads = n_threads; + + // allocate eval context + // needed during ggml_graph_compute() to allocate a work tensor + static size_t buf_size = gf.work_size; // TODO + static void * buf = malloc(buf_size); + + struct ggml_init_params params = { + .mem_size = buf_size, + .mem_buffer = buf, + .no_alloc = false, + }; + + struct ggml_context * ctx_work = ggml_init(params); + + // this allocates all Metal resources and memory buffers + auto ctx_mtl = mnist_mtl_init(ctx_data, ctx_eval, ctx_work, &gf); + + int prediction = -1; + + for (int i = 0; i < 1; ++i) { + struct ggml_tensor * input = ggml_graph_get_tensor(&gf, "input"); + + if (i % 2 == 0) { + memcpy(input->data, digit.data(), ggml_nbytes(input)); + } else { + memset(input->data, 0, ggml_nbytes(input)); + } + + // the actual inference happens here + prediction = mnist_mtl_eval(ctx_mtl, &gf); + } + + mnist_mtl_free(ctx_mtl); + + ggml_free(ctx_work); + ggml_free(ctx_data); + ggml_free(ctx_eval); + + return prediction; +} + +int main(int argc, char ** argv) { + srand(time(NULL)); + ggml_time_init(); + + if (argc != 3) { + fprintf(stderr, "Usage: %s models/mnist/mnist.ggml models/mnist/t10k-images.idx3-ubyte\n", argv[0]); + exit(0); + } + + uint8_t buf[784]; + std::vector digit; + + // read a random digit from the test set + { + std::ifstream fin(argv[2], std::ios::binary); + if (!fin) { + fprintf(stderr, "%s: failed to open '%s'\n", __func__, argv[2]); + return 1; + } + + // seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000) + fin.seekg(16 + 784 * (rand() % 10000)); + fin.read((char *) &buf, sizeof(buf)); + } + + // render the digit in ASCII + { + digit.resize(sizeof(buf)); + + for (int row = 0; row < 28; row++) { + for (int col = 0; col < 28; col++) { + fprintf(stderr, "%c ", (float)buf[row*28 + col] > 230 ? 
'*' : '_'); + digit[row*28 + col] = ((float)buf[row*28 + col]); + } + + fprintf(stderr, "\n"); + } + + fprintf(stderr, "\n"); + } + + const int prediction = mnist_eval(argv[1], 1, digit); + + fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction); + + return 0; +} diff --git a/examples/mnist/main-mtl.h b/examples/mnist/main-mtl.h new file mode 100644 index 00000000..4e661a4d --- /dev/null +++ b/examples/mnist/main-mtl.h @@ -0,0 +1,26 @@ +#pragma once + +struct ggml_context; +struct ggml_cgraph; + +#ifdef __cplusplus +extern "C" { +#endif + +struct ggml_mtl_context; + +struct ggml_mtl_context * mnist_mtl_init( + struct ggml_context * ctx_data, + struct ggml_context * ctx_eval, + struct ggml_context * ctx_work, + struct ggml_cgraph * gf); + +void mnist_mtl_free(struct ggml_mtl_context * ctx); + +int mnist_mtl_eval( + struct ggml_mtl_context * ctx, + struct ggml_cgraph * gf); + +#ifdef __cplusplus +} +#endif diff --git a/examples/mnist/main-mtl.m b/examples/mnist/main-mtl.m new file mode 100644 index 00000000..21bfe7ad --- /dev/null +++ b/examples/mnist/main-mtl.m @@ -0,0 +1,487 @@ +#import "main-mtl.h" + +#import "ggml/ggml.h" + +#import +#import +#import + +// TODO: couldn't get this to work +//#define GGML_MTL_HEAP + +struct ggml_mtl_context { + struct ggml_context * ctx_data; + struct ggml_context * ctx_eval; + struct ggml_context * ctx_work; + + id device; + id queue; + id library; + +#ifdef GGML_MTL_HEAP + id heap_data; + id heap_eval; +#else + id buffer_data; + id buffer_eval; +#endif + + id out; + + // custom kernels + id function_add; + id pipeline_add; + + id function_relu; + id pipeline_relu; + + id function_soft_max; + id pipeline_soft_max; +}; + +// MSL code +NSString * const msl_library_mnist = @"\ +#include \n\ +using namespace metal; \n\ + \n\ +#define MAX(x, y) ((x) > (y) ? 
(x) : (y)) \n\ + \n\ +constant int k_digits [[function_constant(0)]]; \n\ + \n\ +kernel void kernel_add( \n\ + device const float * src0, \n\ + device const float * src1, \n\ + device float * dst, \n\ + uint gid[[thread_position_in_grid]]) { \n\ + dst[gid] = src0[gid] + src1[gid]; \n\ +} \n\ + \n\ +kernel void kernel_relu( \n\ + device const float * src, \n\ + device float * dst, \n\ + uint gid[[thread_position_in_grid]]) { \n\ + dst[gid] = max(0.0f, src[gid]); \n\ +} \n\ + \n\ +kernel void kernel_soft_max( \n\ + device const float * src, \n\ + device float * dst, \n\ + uint gid[[thread_position_in_grid]]) { \n\ + float max = 0.0f; \n\ + for (int i = 0; i < k_digits; i++) { \n\ + max = MAX(max, src[i]); \n\ + } \n\ + float sum = 0.0f; \n\ + for (int i = 0; i < k_digits; i++) { \n\ + dst[i] = exp(src[i] - max); \n\ + sum += dst[i]; \n\ + } \n\ + for (int i = 0; i < k_digits; i++) { \n\ + dst[i] /= sum; \n\ + } \n\ +} \n\ +"; + +struct ggml_mtl_context * mnist_mtl_init( + struct ggml_context * ctx_data, + struct ggml_context * ctx_eval, + struct ggml_context * ctx_work, + struct ggml_cgraph * gf) { + fprintf(stderr, "%s: allocating\n", __func__); + + struct ggml_mtl_context * ctx = malloc(sizeof(struct ggml_mtl_context)); + + ctx->ctx_data = ctx_data; + ctx->ctx_eval = ctx_eval; + ctx->ctx_work = ctx_work; + + ctx->device = MTLCreateSystemDefaultDevice(); + ctx->queue = [ctx->device newCommandQueue]; + + // determine if we can use MPS + if (MPSSupportsMTLDevice(ctx->device)) { + fprintf(stderr, "%s: using MPS\n", __func__); + } else { + fprintf(stderr, "%s: not using MPS\n", __func__); + GGML_ASSERT(false && "MPS not supported"); + } + + // compile from source string and show compile log + { + NSError * error = nil; + ctx->library = [ctx->device newLibraryWithSource:msl_library_mnist options:nil error:&error]; + if (error) { + fprintf(stderr, "%s: error: %s\n", __func__, [[error description] UTF8String]); + exit(1); + } + } + + // load kernels + { + const int k_digits = ggml_graph_get_tensor(gf, "probs")->ne[0]; + + MTLFunctionConstantValues * constants = [MTLFunctionConstantValues new]; + [constants setConstantValue:&k_digits type:MTLDataTypeInt withName:@"k_digits"]; + + ctx->function_add = [ctx->library newFunctionWithName:@"kernel_add"]; + ctx->pipeline_add = [ctx->device newComputePipelineStateWithFunction:ctx->function_add error:nil]; + fprintf(stderr, "%s: loaded kernel_add: %p\n", __func__, ctx->pipeline_add); + + ctx->function_relu = [ctx->library newFunctionWithName:@"kernel_relu"]; + ctx->pipeline_relu = [ctx->device newComputePipelineStateWithFunction:ctx->function_relu error:nil]; + fprintf(stderr, "%s: loaded kernel_relu: %p\n", __func__, ctx->pipeline_relu); + + ctx->function_soft_max = [ctx->library newFunctionWithName:@"kernel_soft_max" constantValues:constants error:nil]; + ctx->pipeline_soft_max = [ctx->device newComputePipelineStateWithFunction:ctx->function_soft_max error:nil]; + fprintf(stderr, "%s: loaded kernel_soft_max: %p\n", __func__, ctx->pipeline_soft_max); + } + +#ifdef GGML_MTL_HEAP + // MTLHeap approach + + // pin ctx_data memory to GPU + // use MTLStorageModeShared to allow us to initialize the weights from the CPU + // TODO: how to use MTLStorageModeManaged? 
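+    // note: ctx_data holds the tensors imported from the graph file (the model
+    //       weights), so its allocation must be CPU-visible for the copy below;
+    //       ctx_eval only holds intermediate results of the evaluation and gets
+    //       a GPU-only (MTLStorageModePrivate) allocation further down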
+ // TODO: see if we can avoid this copy somehow + { + const void * mem_buffer = ggml_get_mem_buffer(ctx_data); + const size_t mem_size = ggml_get_mem_size(ctx_data); + + MTLHeapDescriptor * heap_desc = [MTLHeapDescriptor new]; + heap_desc.storageMode = MTLStorageModeShared; + heap_desc.size = mem_size; + + printf("heap_desc.size = %zu\n", mem_size); + + ctx->heap_data = [ctx->device newHeapWithDescriptor:heap_desc]; + [ctx->heap_data setPurgeableState:MTLPurgeableStateNonVolatile]; // TODO: is this needed? + ctx->heap_data.label = @"heap_data"; + + printf("ctx->heap_data.size = %zu\n", [ctx->heap_data size]); + + id buffer = [ctx->heap_data newBufferWithLength:mem_size options:MTLResourceStorageModeShared]; + if (!buffer) { + fprintf(stderr, "%s: error: failed to allocate buffer\n", __func__); + exit(1); + } + + // copy data from CPU to GPU + memcpy([buffer contents], mem_buffer, mem_size); + + fprintf(stderr, "%s: allocated data heap, size = %zu\n", __func__, mem_size); + } + + // pin ctx_eval memory to GPU + // this heap will be used for the intermediate results of the evaluation + { + const size_t mem_size = ggml_get_mem_size(ctx_eval); + + MTLHeapDescriptor * heap_desc = [MTLHeapDescriptor new]; + heap_desc.storageMode = MTLStorageModePrivate; // GPU only + heap_desc.size = mem_size; + + ctx->heap_eval = [ctx->device newHeapWithDescriptor:heap_desc]; + [ctx->heap_eval setPurgeableState:MTLPurgeableStateNonVolatile]; // TODO: is this needed? + + fprintf(stderr, "%s: allocated eval heap, size = %zu\n", __func__, mem_size); + } +#else + // MTLBuffer approach + + // pin ctx_data memory to GPU + // use MTLStorageModeShared to allow us to initialize the weights from the CPU + // TODO: how to use MTLStorageModeManaged? + // TODO: see if we can avoid this copy somehow + { + const void * mem_buffer = ggml_get_mem_buffer(ctx_data); + const size_t mem_size = ggml_get_mem_size(ctx_data); + + ctx->buffer_data = [ctx->device newBufferWithBytes:mem_buffer length:mem_size options:MTLResourceStorageModeShared]; + + fprintf(stderr, "%s: allocated data buffer, size = %zu\n", __func__, mem_size); + } + + // pin ctx_eval memory to GPU + // this buffer will be used for the intermediate results of the evaluation + { + const size_t mem_size = ggml_get_mem_size(ctx_eval); + + ctx->buffer_eval = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModePrivate]; + + fprintf(stderr, "%s: allocated eval buffer, size = %zu\n", __func__, mem_size); + } +#endif + + // allocate buffer for result extraction + { + const size_t mem_size = ggml_nbytes(gf->nodes[gf->n_nodes - 1]); + + ctx->out = [ctx->device newBufferWithLength:mem_size options:MTLResourceStorageModeShared]; + + fprintf(stderr, "%s: allocated out buffer, size = %zu\n", __func__, mem_size); + } + + return ctx; +} + +void mnist_mtl_free(struct ggml_mtl_context * ctx) { + fprintf(stderr, "%s: deallocating\n", __func__); + + free(ctx); +} + +#ifdef GGML_MTL_HEAP + +// make a view of the respective MTL heap +id mnist_mtl_get_buffer_on_heap(struct ggml_mtl_context * ctx, struct ggml_tensor * t) { + const int64_t offs_data = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_data); + const int64_t offs_eval = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_eval); + + const bool is_data = (offs_eval < 0) || (offs_data >= 0 && offs_data < offs_eval); + + const size_t t_size = ggml_nbytes(t); + const size_t t_offs = is_data ? 
offs_data : offs_eval; + + id result; + + if (is_data) { + fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + result = [ctx->heap_data newBufferWithLength:t_size options:MTLResourceStorageModeShared offset:t_offs]; + } else { + fprintf(stderr, "%s: eval tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + result = [ctx->heap_eval newBufferWithLength:t_size options:MTLResourceStorageModePrivate offset:t_offs]; + } + + if (result == nil) { + fprintf(stderr, "%s: error: buffer is nil\n", __func__); + GGML_ASSERT(false); + } + + return result; +} + +#else + +// get data / eval buffer + offset +id mnist_mtl_get_buffer(struct ggml_mtl_context * ctx, struct ggml_tensor * t, size_t * offs) { + const int64_t offs_data = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_data); + const int64_t offs_eval = (int64_t) t->data - (int64_t) ggml_get_mem_buffer(ctx->ctx_eval); + + const bool is_data = (offs_eval < 0) || (offs_data >= 0 && offs_data < offs_eval); + + const size_t t_size = ggml_nbytes(t); + const size_t t_offs = is_data ? offs_data : offs_eval; + + id result; + + if (is_data) { + fprintf(stderr, "%s: data tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + result = ctx->buffer_data; + } else { + fprintf(stderr, "%s: eval tensor '%16s', offs = %8ld, size = %8ld\n", __func__, t->name, t_offs, t_size); + result = ctx->buffer_eval; + } + + if (result == nil) { + fprintf(stderr, "%s: error: buffer is nil\n", __func__); + GGML_ASSERT(false); + } + + if (offs != nil) { + *offs = t_offs; + } + + return result; +} + +#endif + +int mnist_mtl_eval( + struct ggml_mtl_context * ctx, + struct ggml_cgraph * gf) { + fprintf(stderr, "%s: evaluating\n", __func__); + + id command_buffer = [ctx->queue commandBuffer]; + id encoder = nil; + + size_t offs_src0; + size_t offs_src1; + size_t offs_dst; + + // copy the input data to the GPU + { + struct ggml_tensor * inp = ggml_graph_get_tensor(gf, "input"); + + id id_dst = mnist_mtl_get_buffer(ctx, inp, &offs_src0); + + memcpy(id_dst.contents + offs_src0, inp->data, ggml_nbytes(inp)); + } + + for (int i = 0; i < gf->n_nodes; ++i) { + fprintf(stderr, "%s: encoding node %3d, op = %8s\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + + switch (gf->nodes[i]->op) { + case GGML_OP_ADD: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src0 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_src1 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + [encoder setComputePipelineState:ctx->pipeline_add]; + [encoder setBuffer:id_src0 offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_src1 offset:offs_src1 atIndex:1]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:2]; + + const int64_t n = ggml_nelements(gf->nodes[i]); + + [encoder dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_RELU: + { + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + [encoder setComputePipelineState:ctx->pipeline_relu]; + [encoder setBuffer:id_src offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + const int64_t n = ggml_nelements(gf->nodes[i]); + + [encoder 
dispatchThreadgroups:MTLSizeMake(n, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; + } break; + case GGML_OP_SOFT_MAX: + { +#if 0 + // NOTE: MPSMatrixSoftMax is not working properly, probably there is a bug + + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } + + // use MPSMatrixSoftMax + id id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + MPSMatrixDescriptor * desc = [MPSMatrixDescriptor + matrixDescriptorWithRows:1 columns:gf->nodes[i]->ne[0] rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32]; + + MPSMatrix * mat_src = [[MPSMatrix alloc] initWithBuffer:id_src offset:offs_src0 descriptor:desc]; + MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst descriptor:desc]; + + MPSMatrixSoftMax * softmax = [[MPSMatrixSoftMax alloc] initWithDevice:ctx->device]; + + [softmax encodeToCommandBuffer:command_buffer inputMatrix:mat_src resultMatrix:mat_dst]; +#else + if (encoder == nil) { + encoder = [command_buffer computeCommandEncoder]; + } + + id id_src = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + [encoder setComputePipelineState:ctx->pipeline_soft_max]; + [encoder setBuffer:id_src offset:offs_src0 atIndex:0]; + [encoder setBuffer:id_dst offset:offs_dst atIndex:1]; + + [encoder dispatchThreadgroups:MTLSizeMake(1, 1, 1) threadsPerThreadgroup:MTLSizeMake(1, 1, 1)]; +#endif + } break; + case GGML_OP_MUL_MAT: + { + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } + + // use MPSMatrixMultiplication + id id_src0 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src0, &offs_src0); + id id_src1 = mnist_mtl_get_buffer(ctx, gf->nodes[i]->src1, &offs_src1); + id id_dst = mnist_mtl_get_buffer(ctx, gf->nodes[i], &offs_dst); + + const int64_t ncols0 = gf->nodes[i]->src0->ne[0]; + const int64_t nrows0 = gf->nodes[i]->src0->ne[1]; + + const int64_t ncols1 = gf->nodes[i]->src1->ne[0]; + const int64_t nrows1 = gf->nodes[i]->src1->ne[1]; + + const int64_t ncols2 = gf->nodes[i]->ne[0]; + const int64_t nrows2 = gf->nodes[i]->ne[1]; + + GGML_ASSERT(ncols0 == ncols1); + + MPSMatrixDescriptor * desc0 = [MPSMatrixDescriptor + matrixDescriptorWithRows:nrows0 columns:ncols0 rowBytes:gf->nodes[i]->src0->nb[1] dataType:MPSDataTypeFloat32]; + MPSMatrixDescriptor * desc1 = [MPSMatrixDescriptor + matrixDescriptorWithRows:nrows1 columns:ncols1 rowBytes:gf->nodes[i]->src1->nb[1] dataType:MPSDataTypeFloat32]; + MPSMatrixDescriptor * desc2 = [MPSMatrixDescriptor + matrixDescriptorWithRows:nrows2 columns:ncols2 rowBytes:gf->nodes[i]->nb[1] dataType:MPSDataTypeFloat32]; + + MPSMatrix * mat_src0 = [[MPSMatrix alloc] initWithBuffer:id_src0 offset:offs_src0 descriptor:desc0]; + MPSMatrix * mat_src1 = [[MPSMatrix alloc] initWithBuffer:id_src1 offset:offs_src1 descriptor:desc1]; + MPSMatrix * mat_dst = [[MPSMatrix alloc] initWithBuffer:id_dst offset:offs_dst descriptor:desc2]; + + MPSMatrixMultiplication * mul = [[MPSMatrixMultiplication alloc] initWithDevice:ctx->device + transposeLeft:false transposeRight:true resultRows:nrows1 resultColumns:nrows0 interiorColumns:ncols0 alpha:1.0 beta:0.0]; + + [mul encodeToCommandBuffer:command_buffer leftMatrix:mat_src1 rightMatrix:mat_src0 resultMatrix:mat_dst]; + } break; + default: + fprintf(stderr, "%s: node %3d, op = %8s not implemented\n", __func__, i, ggml_op_name(gf->nodes[i]->op)); + GGML_ASSERT(false); + return -1; + } + } + + // extract results from the 
GPU + { + if (encoder != nil) { + [encoder endEncoding]; + encoder = nil; + } + + struct ggml_tensor * out = gf->nodes[gf->n_nodes - 1]; + + id id_src = mnist_mtl_get_buffer(ctx, out, &offs_src0); + id id_dst = ctx->out; + + id encoder_blit = [command_buffer blitCommandEncoder]; + [encoder_blit copyFromBuffer:id_src sourceOffset:offs_src0 toBuffer:id_dst destinationOffset:0 size:ggml_nbytes(out)]; + [encoder_blit endEncoding]; + } + + [command_buffer commit]; + [command_buffer waitUntilCompleted]; + + { + const double time_elapsed = [command_buffer GPUEndTime] - [command_buffer GPUStartTime]; + fprintf(stderr, "%s: time elapsed = %f\n", __func__, time_elapsed); + } + + // select the most probable digit + int result = -1; + { + const float * probs = ctx->out.contents; + + float prob = probs[0]; + + for (int i = 0; i < 10; ++i) { + fprintf(stderr, "%s: probs[%2d] = %f\n", __func__, i, probs[i]); + + if (probs[i] > prob) { + result = i; + prob = probs[i]; + } + } + } + + return result; +} diff --git a/examples/mnist/main.cpp b/examples/mnist/main.cpp index c84eedd9..51274800 100644 --- a/examples/mnist/main.cpp +++ b/examples/mnist/main.cpp @@ -165,7 +165,8 @@ bool mnist_model_load(const std::string & fname, mnist_model & model) { int mnist_eval( const mnist_model & model, const int n_threads, - std::vector digit + std::vector digit, + const char * fname_cgraph ) { const auto & hparams = model.hparams; @@ -176,6 +177,7 @@ int mnist_eval( struct ggml_init_params params = { .mem_size = buf_size, .mem_buffer = buf, + .no_alloc = false, }; struct ggml_context * ctx0 = ggml_init(params); @@ -192,14 +194,23 @@ int mnist_eval( // soft max ggml_tensor * probs = ggml_soft_max(ctx0, fc2); + ggml_set_name(probs, "probs"); - // run the computation + // build / export / run the computation graph ggml_build_forward_expand(&gf, probs); ggml_graph_compute (ctx0, &gf); //ggml_graph_print (&gf); ggml_graph_dump_dot(&gf, NULL, "mnist.dot"); + if (fname_cgraph) { + // export the compute graph for later use + // see the "mnist-cpu" example + ggml_graph_export(&gf, "mnist.ggml"); + + fprintf(stderr, "%s: exported compute graph to '%s'\n", __func__, fname_cgraph); + } + const float * probs_data = ggml_get_data_f32(probs); const int prediction = std::max_element(probs_data, probs_data + 10) - probs_data; @@ -213,30 +224,31 @@ int mnist_eval( extern "C" { #endif -int wasm_eval(uint8_t *digitPtr) -{ +int wasm_eval(uint8_t * digitPtr) { mnist_model model; if (!mnist_model_load("models/mnist/ggml-model-f32.bin", model)) { fprintf(stderr, "error loading model\n"); return -1; } std::vector digit(digitPtr, digitPtr + 784); - int result = mnist_eval(model, 1, digit); + int result = mnist_eval(model, 1, digit, nullptr); ggml_free(model.ctx); + return result; } -int wasm_random_digit(char *digitPtr) -{ +int wasm_random_digit(char * digitPtr) { auto fin = std::ifstream("models/mnist/t10k-images.idx3-ubyte", std::ios::binary); if (!fin) { fprintf(stderr, "failed to open digits file\n"); return 0; } srand(time(NULL)); + // Seek to a random digit: 16-byte header + 28*28 * (random 0 - 10000) fin.seekg(16 + 784 * (rand() % 10000)); fin.read(digitPtr, 784); + return 1; } @@ -300,7 +312,9 @@ int main(int argc, char ** argv) { fprintf(stderr, "\n"); } - fprintf(stdout, "%s: predicted digit is %d\n", __func__, mnist_eval(model, 1, digit)); + const int prediction = mnist_eval(model, 1, digit, "mnist.ggml"); + + fprintf(stdout, "%s: predicted digit is %d\n", __func__, prediction); ggml_free(model.ctx); diff --git a/include/ggml/ggml.h 
b/include/ggml/ggml.h index 55813828..60c0ad8b 100644 --- a/include/ggml/ggml.h +++ b/include/ggml/ggml.h @@ -451,9 +451,12 @@ extern "C" { GGML_API size_t ggml_used_mem(const struct ggml_context * ctx); - GGML_API size_t ggml_set_scratch(struct ggml_context * ctx, struct ggml_scratch scratch); + GGML_API size_t ggml_set_scratch (struct ggml_context * ctx, struct ggml_scratch scratch); GGML_API void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc); + GGML_API void * ggml_get_mem_buffer(struct ggml_context * ctx); + GGML_API size_t ggml_get_mem_size (struct ggml_context * ctx); + GGML_API struct ggml_tensor * ggml_new_tensor( struct ggml_context * ctx, enum ggml_type type, @@ -492,6 +495,8 @@ extern "C" { GGML_API struct ggml_tensor * ggml_dup_tensor (struct ggml_context * ctx, const struct ggml_tensor * src); GGML_API struct ggml_tensor * ggml_view_tensor(struct ggml_context * ctx, const struct ggml_tensor * src); + GGML_API struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name); + GGML_API struct ggml_tensor * ggml_set_zero(struct ggml_tensor * tensor); GGML_API struct ggml_tensor * ggml_set_i32 (struct ggml_tensor * tensor, int32_t value); GGML_API struct ggml_tensor * ggml_set_f32 (struct ggml_tensor * tensor, float value); @@ -978,7 +983,10 @@ extern "C" { GGML_API void ggml_graph_compute(struct ggml_context * ctx, struct ggml_cgraph * cgraph); GGML_API void ggml_graph_reset (struct ggml_cgraph * cgraph); - GGML_API struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name); + GGML_API struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name); + + GGML_API void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname); + GGML_API struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval); // print info and performance information for the graph GGML_API void ggml_graph_print(const struct ggml_cgraph * cgraph); diff --git a/src/ggml.c b/src/ggml.c index b4570eb5..4cd0d721 100644 --- a/src/ggml.c +++ b/src/ggml.c @@ -4030,6 +4030,14 @@ void ggml_set_no_alloc(struct ggml_context * ctx, bool no_alloc) { ctx->no_alloc = no_alloc; } +void * ggml_get_mem_buffer(struct ggml_context * ctx) { + return ctx->mem_buffer; +} + +size_t ggml_get_mem_size(struct ggml_context * ctx) { + return ctx->mem_size; +} + // IMPORTANT: // when creating "opt" tensors, always save and load the scratch buffer // this is an error prone process, but it is necessary to support inplace @@ -4523,6 +4531,23 @@ struct ggml_tensor * ggml_view_tensor( return result; } +struct ggml_tensor * ggml_get_tensor(struct ggml_context * ctx, const char * name) { + struct ggml_object * obj = ctx->objects_begin; + + char * const mem_buffer = ctx->mem_buffer; + + while (obj != NULL) { + struct ggml_tensor * cur = (struct ggml_tensor *)(mem_buffer + obj->offs); + if (strcmp(cur->name, name) == 0) { + return cur; + } + + obj = obj->next; + } + + return NULL; +} + //////////////////////////////////////////////////////////////////////////////// // ggml_dup @@ -14532,7 +14557,7 @@ void ggml_graph_reset(struct ggml_cgraph * cgraph) { } } -struct ggml_tensor * ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const char * name) { +struct ggml_tensor * ggml_graph_get_tensor(struct ggml_cgraph * cgraph, const char * name) { for (int i = 0; i < cgraph->n_leafs; i++) { struct ggml_tensor * leaf = cgraph->leafs[i]; @@ -14552,6 +14577,461 @@ struct ggml_tensor * 
ggml_get_tensor_by_name(struct ggml_cgraph * cgraph, const return NULL; } +static void ggml_graph_export_leaf(const struct ggml_tensor * tensor, FILE * fout) { + const int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + fprintf(fout, "%-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %16p %16s\n", + ggml_type_name(tensor->type), + ggml_op_name (tensor->op), + tensor->n_dims, + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], + tensor->data, + tensor->name); +} + +static void ggml_graph_export_node(const struct ggml_tensor * tensor, const char * arg, FILE * fout) { + const int64_t * ne = tensor->ne; + const size_t * nb = tensor->nb; + + fprintf(fout, "%-6s %-6s %-12s %8d %8lld %8lld %8lld %8lld %16zu %16zu %16zu %16zu %8d %16p %16s\n", + arg, + ggml_type_name(tensor->type), + ggml_op_name (tensor->op), + tensor->n_dims, + ne[0], ne[1], ne[2], ne[3], + nb[0], nb[1], nb[2], nb[3], + tensor->n_tasks, + tensor->data, + tensor->name); +} + +void ggml_graph_export(const struct ggml_cgraph * cgraph, const char * fname) { + assert(cgraph->work == NULL); + assert(cgraph->work_size == 0); + + uint64_t size_eval = 0; + + // compute size of intermediate results + // TODO: does not take into account scratch buffers !!!! + for (int i = 0; i < cgraph->n_nodes; ++i) { + size_eval += ggml_nbytes(cgraph->nodes[i]); + } + + // print + { + FILE * fout = stdout; + + fprintf(fout, "\n"); + fprintf(fout, "%-16s %8x\n", "magic", GGML_FILE_MAGIC); + fprintf(fout, "%-16s %8d\n", "version", GGML_FILE_VERSION); + fprintf(fout, "%-16s %8d\n", "leafs", cgraph->n_leafs); + fprintf(fout, "%-16s %8d\n", "nodes", cgraph->n_nodes); + fprintf(fout, "%-16s %8llu\n", "eval", size_eval); + + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %16s %16s\n", + "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "DATA", "NAME"); + + for (int i = 0; i < cgraph->n_leafs; ++i) { + ggml_graph_export_leaf(cgraph->leafs[i], fout); + + GGML_ASSERT(cgraph->leafs[i]->op == GGML_OP_NONE); + GGML_ASSERT(cgraph->leafs[i]->src0 == NULL); + GGML_ASSERT(cgraph->leafs[i]->src1 == NULL); + } + + // header + fprintf(fout, "\n"); + fprintf(fout, "%-6s %-6s %-12s %8s %8s %8s %8s %8s %16s %16s %16s %16s %8s %16s %16s\n", + "ARG", "TYPE", "OP", "NDIMS", "NE0", "NE1", "NE2", "NE3", "NB0", "NB1", "NB2", "NB3", "NTASKS", "DATA", "NAME"); + + for (int i = 0; i < cgraph->n_nodes; ++i) { + ggml_graph_export_node(cgraph->nodes[i], "DST", fout); + + if (cgraph->nodes[i]->src0) { + ggml_graph_export_node(cgraph->nodes[i]->src0, "SRC0", fout); + } + + if (cgraph->nodes[i]->src1) { + ggml_graph_export_node(cgraph->nodes[i]->src1, "SRC1", fout); + } + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + if (cgraph->nodes[i]->opt[j]) { + ggml_graph_export_node(cgraph->nodes[i]->opt[j], "OPT", fout); + } + } + + fprintf(fout, "\n"); + } + + fprintf(fout, "\n"); + } + + // write binary data + { + FILE * fout = fopen(fname, "wb"); + + if (!fout) { + fprintf(stderr, "%s: failed to open %s\n", __func__, fname); + return; + } + + // header + { + const uint32_t magic = GGML_FILE_MAGIC; + const uint32_t version = GGML_FILE_VERSION; + const uint32_t n_leafs = cgraph->n_leafs; + const uint32_t nodes = cgraph->n_nodes; + + fwrite(&magic, sizeof(uint32_t), 1, fout); + fwrite(&version, sizeof(uint32_t), 1, fout); + fwrite(&n_leafs, sizeof(uint32_t), 1, fout); + fwrite(&nodes, sizeof(uint32_t), 1, fout); + fwrite(&size_eval, sizeof(uint64_t), 1, fout); + } + + // leafs + { + for 
(int i = 0; i < cgraph->n_leafs; ++i) { + const struct ggml_tensor * tensor = cgraph->leafs[i]; + + const uint32_t type = tensor->type; + const uint32_t op = tensor->op; + const uint32_t n_dims = tensor->n_dims; + + fwrite(&type, sizeof(uint32_t), 1, fout); + fwrite(&op, sizeof(uint32_t), 1, fout); + fwrite(&n_dims, sizeof(uint32_t), 1, fout); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + const uint64_t ne = tensor->ne[j]; + const uint64_t nb = tensor->nb[j]; + + fwrite(&ne, sizeof(uint64_t), 1, fout); + fwrite(&nb, sizeof(uint64_t), 1, fout); + } + + // store the pointer address + { + const uint64_t ptr = (uint64_t) tensor->data; + + fwrite(&ptr, sizeof(uint64_t), 1, fout); + } + + fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); + + // dump the data + // TODO: pad this to 32 byte boundary + { + const size_t size = ggml_nbytes(tensor); + + fwrite(tensor->data, sizeof(char), size, fout); + } + } + } + + // nodes + { + for (int i = 0; i < cgraph->n_nodes; ++i) { + const struct ggml_tensor * tensor = cgraph->nodes[i]; + + const uint32_t type = tensor->type; + const uint32_t op = tensor->op; + const uint32_t n_dims = tensor->n_dims; + + fwrite(&type, sizeof(uint32_t), 1, fout); + fwrite(&op, sizeof(uint32_t), 1, fout); + fwrite(&n_dims, sizeof(uint32_t), 1, fout); + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + const uint64_t ne = tensor->ne[j]; + const uint64_t nb = tensor->nb[j]; + + fwrite(&ne, sizeof(uint64_t), 1, fout); + fwrite(&nb, sizeof(uint64_t), 1, fout); + } + + // store the pointer address + { + const uint64_t ptr = (uint64_t) tensor->data; + + fwrite(&ptr, sizeof(uint64_t), 1, fout); + } + + fwrite(tensor->name, sizeof(char), GGML_MAX_NAME, fout); + + // output the op arguments + { + struct ggml_tensor * args[2 + GGML_MAX_OPT] = { NULL }; + + args[0] = tensor->src0; + args[1] = tensor->src1; + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + args[2 + j] = tensor->opt[j]; + } + + for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + if (args[j]) { + int32_t idx = -1; + + // check if leaf + { + for (int k = 0; k < cgraph->n_leafs; ++k) { + if (args[j] == cgraph->leafs[k]) { + idx = k; + break; + } + } + } + + // check if node + if (idx == -1) { + for (int k = 0; k < cgraph->n_nodes; ++k) { + if (args[j] == cgraph->nodes[k]) { + idx = GGML_MAX_NODES + k; + break; + } + } + } + + if (idx == -1) { + fprintf(stderr, "%s: failed to find tensor, arg = %d, node = %d\n", __func__, j, i); + return; + } + + fwrite(&idx, sizeof(int32_t), 1, fout); + } else { + const int32_t nul = -1; + + fwrite(&nul, sizeof(int32_t), 1, fout); + } + } + } + } + } + + fclose(fout); + } +} + +struct ggml_cgraph ggml_graph_import(const char * fname, struct ggml_context ** ctx_data, struct ggml_context ** ctx_eval) { + assert(*ctx_data == NULL); + assert(*ctx_eval == NULL); + + struct ggml_cgraph result = { 0 }; + + struct ggml_tensor * data = NULL; + + // read file into data + { + FILE * fin = fopen(fname, "rb"); + + if (!fin) { + fprintf(stderr, "%s: failed to open %s\n", __func__, fname); + return result; + } + + size_t fsize = 0; + + fseek(fin, 0, SEEK_END); + fsize = ftell(fin); + fseek(fin, 0, SEEK_SET); + + // create the data context + { + const size_t overhead = 1*ggml_tensor_overhead(); + + struct ggml_init_params params = { + .mem_size = fsize + overhead, + .mem_buffer = NULL, + .no_alloc = false, + }; + + *ctx_data = ggml_init(params); + + if (!*ctx_data) { + fprintf(stderr, "%s: failed to create ggml context\n", __func__); + return result; + } + } + + data = ggml_new_tensor_1d(*ctx_data, GGML_TYPE_I8, 
fsize); + + fread(data->data, sizeof(char), fsize, fin); + + fclose(fin); + } + + // populate result + { + char * ptr = (char *) data->data; + + const uint32_t magic = *(const uint32_t *) ptr; ptr += sizeof(magic); + + if (magic != GGML_FILE_MAGIC) { + fprintf(stderr, "%s: invalid magic number, got %08x\n", __func__, magic); + return result; + } + + const uint32_t version = *(const uint32_t *) ptr; ptr += sizeof(version); + + if (version != GGML_FILE_VERSION) { + fprintf(stderr, "%s: invalid version number\n", __func__); + return result; + } + + const uint32_t n_leafs = *(const uint32_t *) ptr; ptr += sizeof(n_leafs); + const uint32_t n_nodes = *(const uint32_t *) ptr; ptr += sizeof(n_nodes); + const uint64_t size_eval = *(const uint64_t *) ptr; ptr += sizeof(size_eval); + + result.n_leafs = n_leafs; + result.n_nodes = n_nodes; + + // create the data context + { + const size_t overhead = (n_leafs + n_nodes)*ggml_tensor_overhead(); + + struct ggml_init_params params = { + .mem_size = size_eval + overhead, + .mem_buffer = NULL, + .no_alloc = true, + }; + + *ctx_eval = ggml_init(params); + + if (!*ctx_eval) { + fprintf(stderr, "%s: failed to create ggml context\n", __func__); + return result; + } + } + + // leafs + { + uint32_t type; + uint32_t op; + uint32_t n_dims; + + for (uint32_t i = 0; i < n_leafs; ++i) { + type = *(const uint32_t *) ptr; ptr += sizeof(type); + op = *(const uint32_t *) ptr; ptr += sizeof(op); + n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); + + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + uint64_t ne_cur; + uint64_t nb_cur; + + ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); + nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); + + ne[j] = ne_cur; + nb[j] = nb_cur; + } + + struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + + tensor->op = (enum ggml_op) op; + + uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); + + memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; + + tensor->data = (void *) ptr; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + result.leafs[i] = tensor; + + ptr += ggml_nbytes(tensor); + + fprintf(stderr, "%s: loaded leaf %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + } + } + + ggml_set_no_alloc(*ctx_eval, false); + + // nodes + { + uint32_t type; + uint32_t op; + uint32_t n_dims; + + for (uint32_t i = 0; i < n_nodes; ++i) { + type = *(const uint32_t *) ptr; ptr += sizeof(type); + op = *(const uint32_t *) ptr; ptr += sizeof(op); + n_dims = *(const uint32_t *) ptr; ptr += sizeof(n_dims); + + int64_t ne[GGML_MAX_DIMS]; + size_t nb[GGML_MAX_DIMS]; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + uint64_t ne_cur; + uint64_t nb_cur; + + ne_cur = *(const uint64_t *) ptr; ptr += sizeof(ne_cur); + nb_cur = *(const uint64_t *) ptr; ptr += sizeof(nb_cur); + + ne[j] = ne_cur; + nb[j] = nb_cur; + } + + struct ggml_tensor * tensor = ggml_new_tensor(*ctx_eval, (enum ggml_type) type, n_dims, ne); + + tensor->op = (enum ggml_op) op; + + uint64_t ptr_cur = *(const uint64_t *) ptr; ptr += sizeof(ptr_cur); + + memcpy(tensor->name, ptr, GGML_MAX_NAME); ptr += GGML_MAX_NAME; + + for (int j = 0; j < GGML_MAX_DIMS; ++j) { + tensor->nb[j] = nb[j]; + } + + // parse args + { + struct ggml_tensor ** args[2 + GGML_MAX_OPT] = { + &tensor->src0, + &tensor->src1, + }; + + for (int j = 0; j < GGML_MAX_OPT; ++j) { + args[2 + j] = &tensor->opt[j]; + } + + 
for (int j = 0; j < 2 + GGML_MAX_OPT; ++j) { + const int32_t arg_idx = *(const int32_t *) ptr; ptr += sizeof(arg_idx); + + if (arg_idx == -1) { + continue; + } + + if (arg_idx < GGML_MAX_NODES) { + *args[j] = result.leafs[arg_idx]; + } else { + *args[j] = result.nodes[arg_idx - GGML_MAX_NODES]; + } + } + } + + result.nodes[i] = tensor; + + fprintf(stderr, "%s: loaded node %d: '%16s', %3d dims, %9zu bytes\n", __func__, i, tensor->name, n_dims, ggml_nbytes(tensor)); + } + } + } + + return result; +} + void ggml_graph_print(const struct ggml_cgraph * cgraph) { int64_t perf_total_per_op_us[GGML_OP_COUNT] = {0};