build-cov/
build-ci-debug/
build-ci-release/
+build-cublas/
out/
tmp/
models/
CMakeSettings.json
.vs/
.vscode/
+.clangd
.exrc
.cache
*.sw?
-__pycache__/
\ No newline at end of file
+__pycache__/
set(TEST_TARGET gpt-2-quantize)
add_executable(${TEST_TARGET} quantize.cpp)
target_link_libraries(${TEST_TARGET} PRIVATE ggml common common-ggml)
+
+#
+# For GPU offloading
+
+if (GGML_CUBLAS)
+ add_compile_definitions(GGML_USE_CUBLAS)
+endif()
+
+if (GGML_CLBLAST)
+ add_compile_definitions(GGML_USE_CLBLAST)
+endif()
+
+if (GGML_METAL)
+ add_compile_definitions(GGML_USE_METAL)
+endif()
#include "ggml/ggml.h"
#include "ggml/ggml-alloc.h"
+#include "ggml/ggml-backend.h"
+
+#ifdef GGML_USE_CUBLAS
+#include "ggml-cuda.h"
+#endif
+
+#ifdef GGML_USE_METAL
+#include "ggml-metal.h"
+#endif
#include "common.h"
#include "common-ggml.h"
#pragma warning(disable: 4244 4267) // possible loss of data
#endif
+static void ggml_log_callback_default(ggml_log_level level, const char * text, void * user_data) {
+ (void) level;
+ (void) user_data;
+ fputs(text, stderr);
+ fflush(stderr);
+}
+
// default hparams (GPT-2 117M)
struct gpt2_hparams {
int32_t n_vocab = 50257;
//
struct ggml_context * ctx;
+
+ ggml_backend_t backend = NULL;
+
+ ggml_backend_buffer_t buffer_w;
+ ggml_backend_buffer_t buffer_kv;
+
std::map<std::string, struct ggml_tensor *> tensors;
};
// load the model's weights from a file
-bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab) {
+bool gpt2_model_load(const std::string & fname, gpt2_model & model, gpt_vocab & vocab, int n_gpu_layers) {
printf("%s: loading model from '%s'\n", __func__, fname.c_str());
auto fin = std::ifstream(fname, std::ios::binary);
auto & ctx = model.ctx;
- size_t ctx_size = 0;
+ size_t buffer_size = 0;
{
const auto & hparams = model.hparams;
const int n_ctx = hparams.n_ctx;
const int n_vocab = hparams.n_vocab;
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
- ctx_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
+ buffer_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_g
+ buffer_size += n_embd*ggml_type_sizef(GGML_TYPE_F32); // ln_f_b
- ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
- ctx_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
- ctx_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
+ buffer_size += n_vocab*n_embd*ggml_type_sizef(wtype); // wte
+ buffer_size += n_ctx*n_embd*ggml_type_sizef(GGML_TYPE_F32); // wpe
+ buffer_size += n_vocab*n_embd*ggml_type_sizef(wtype); // lm_head
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
+ buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_g
+ buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_1_b
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
- ctx_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
+ buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_g
+ buffer_size += n_layer*(n_embd*ggml_type_sizef(GGML_TYPE_F32)); // ln_2_b
- ctx_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
- ctx_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
+ buffer_size += n_layer*(3*n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_attn_w
+ buffer_size += n_layer*( 3*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_attn_b
- ctx_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
- ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
+ buffer_size += n_layer*(n_embd*n_embd*ggml_type_sizef(wtype)); // c_attn_proj_w
+ buffer_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_attn_proj_b
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
- ctx_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
+ buffer_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_fc_w
+ buffer_size += n_layer*( 4*n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_fc_b
- ctx_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
- ctx_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
+ buffer_size += n_layer*(4*n_embd*n_embd*ggml_type_sizef(wtype)); // c_mlp_proj_w
+ buffer_size += n_layer*( n_embd*ggml_type_sizef(GGML_TYPE_F32)); // c_mlp_proj_b
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_k
- ctx_size += n_ctx*n_layer*n_embd*ggml_type_sizef(GGML_TYPE_F32); // memory_v
+ buffer_size += (6 + 12*n_layer)*128; // alignment overhead
- ctx_size += (6 + 12*n_layer)*512; // object overhead
-
- printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
- printf("%s: ggml ctx size = %6.2f MB\n", __func__, ctx_size/(1024.0*1024.0));
+ printf("%s: ggml tensor size = %d bytes\n", __func__, (int) sizeof(ggml_tensor));
+ printf("%s: backend buffer size = %6.2f MB\n", __func__, buffer_size/(1024.0*1024.0));
}
// create the ggml context
{
+ size_t n_tensors = 2 + 6 + 12*model.hparams.n_layer;
struct ggml_init_params params = {
- /*.mem_size =*/ ctx_size,
+ /*.mem_size =*/ ggml_tensor_overhead() * n_tensors,
/*.mem_buffer =*/ NULL,
- /*.no_alloc =*/ false,
+ /*.no_alloc =*/ true,
};
model.ctx = ggml_init(params);
}
}
+ // initialize the backend
+#ifdef GGML_USE_CUBLAS
+ if (n_gpu_layers > 0) {
+ fprintf(stderr, "%s: using CUDA backend\n", __func__);
+ model.backend = ggml_backend_cuda_init();
+ if (!model.backend) {
+ fprintf(stderr, "%s: ggml_backend_cuda_init() failed\n", __func__);
+ }
+ }
+#endif
+
+#ifdef GGML_USE_METAL
+ if (n_gpu_layers > 0) {
+ fprintf(stderr, "%s: using Metal backend\n", __func__);
+ ggml_metal_log_set_callback(ggml_log_callback_default, nullptr);
+ model.backend = ggml_backend_metal_init();
+ if (!model.backend) {
+ fprintf(stderr, "%s: ggml_backend_metal_init() failed\n", __func__);
+ }
+ }
+#endif
+
+ if (!model.backend) {
+ // fallback to CPU backend
+ fprintf(stderr, "%s: using CPU backend\n", __func__);
+ model.backend = ggml_backend_cpu_init();
+ }
+
+ if (!model.backend) {
+ fprintf(stderr, "%s: ggml_backend_cpu_init() failed\n", __func__);
+ return false;
+ }
+
+ // allocate weights buffer
+ model.buffer_w = ggml_backend_alloc_buffer(model.backend, buffer_size);
+
// prepare memory for the weights
{
const auto & hparams = model.hparams;
const size_t memory_size = ggml_nbytes(model.memory_k) + ggml_nbytes(model.memory_v);
printf("%s: memory size = %8.2f MB, n_mem = %d\n", __func__, memory_size/1024.0/1024.0, n_mem);
+
+ // create a backend buffer (can be in host or device memory)
+ model.buffer_kv = ggml_backend_alloc_buffer(model.backend, memory_size + 256);
+
+ // allocate the tensors into the backend buffer
+ {
+ ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_kv);
+
+ // this updates the pointers in the tensors to point to the correct location in the buffer
+ // this is necessary since the ggml_context was created with no_alloc == true
+ // note that the buffer can actually be a device buffer, depending on the backend
+ ggml_allocr_alloc(alloc, model.memory_k);
+ ggml_allocr_alloc(alloc, model.memory_v);
+
+ ggml_allocr_free(alloc);
+ }
}
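
The KV tensors above follow the general allocation pattern of this change: the ggml_context only holds tensor metadata (no_alloc == true) and the actual data is placed in a backend buffer by the allocator. A minimal sketch of that pattern, assuming an already-initialized ggml_backend_t backend (model.backend above); the tensor name and size are arbitrary:

    // metadata-only context: with no_alloc == true, ggml_new_tensor_* does not allocate data
    struct ggml_init_params params = {
        /*.mem_size   =*/ ggml_tensor_overhead(),
        /*.mem_buffer =*/ NULL,
        /*.no_alloc   =*/ true,
    };
    struct ggml_context * ctx = ggml_init(params);
    struct ggml_tensor  * t   = ggml_new_tensor_1d(ctx, GGML_TYPE_F32, 1024); // t->data == NULL here

    // back the tensor with a backend buffer (may be host or device memory)
    ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, ggml_nbytes(t) + 256);
    struct ggml_allocr * alloc = ggml_allocr_new_from_buffer(buffer);
    ggml_allocr_alloc(alloc, t); // sets t->data to a location inside the buffer
    ggml_allocr_free(alloc);
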
// load weights
{
+ ggml_allocr * alloc = ggml_allocr_new_from_buffer(model.buffer_w);
+
size_t total_size = 0;
bool has_lm_head = false;
+ std::vector<char> read_buf;
+
while (true) {
int32_t n_dims;
int32_t length;
}
auto tensor = model.tensors[name];
+ ggml_set_name(tensor, name.c_str());
if (ggml_nelements(tensor) != nelements) {
fprintf(stderr, "%s: tensor '%s' has wrong size in model file\n", __func__, name.c_str());
return false;
return false;
}
- fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+ ggml_allocr_alloc(alloc, tensor);
+
+ if (ggml_backend_is_cpu (model.backend)
+#ifdef GGML_USE_METAL
+ || ggml_backend_is_metal(model.backend)
+#endif
+ ) {
+ // for the CPU and Metal backend, we can read directly into the tensor
+ fin.read(reinterpret_cast<char *>(tensor->data), ggml_nbytes(tensor));
+ } else {
+ // read into a temporary buffer first, then copy to device memory
+ read_buf.resize(ggml_nbytes(tensor));
+ fin.read(read_buf.data(), ggml_nbytes(tensor));
+ ggml_backend_tensor_set(tensor, read_buf.data(), 0, ggml_nbytes(tensor));
+ }
// GPT-2 models share the WTE tensor as the LM head
if (name == "model/wte" && has_lm_head == false) {
- memcpy(model.lm_head->data, tensor->data, ggml_nbytes(tensor));
+ //ggml_allocr_alloc(alloc, model.lm_head);
+ //ggml_backend_tensor_copy(tensor, model.lm_head);
+ model.lm_head = tensor;
}
if (name == "model/lm_head") {
total_size += ggml_nbytes(tensor);
}
+ ggml_allocr_free(alloc);
printf("%s: model size = %8.2f MB\n", __func__, total_size/1024.0/1024.0);
}
// avoid writing to tensors if we are only measuring the memory usage
if (!ggml_allocr_is_measure(allocr)) {
- memcpy(embd->data, embd_inp.data(), N*ggml_element_size(embd));
+ ggml_backend_tensor_set(embd, embd_inp.data(), 0, N*ggml_element_size(embd));
}
struct ggml_tensor * position = ggml_new_tensor_1d(ctx0, GGML_TYPE_I32, N);
ggml_allocr_alloc(allocr, position);
if (!ggml_allocr_is_measure(allocr)) {
for (int i = 0; i < N; ++i) {
- ((int32_t *) position->data)[i] = n_past + i;
+ int32_t v = n_past + i;
+ ggml_backend_tensor_set(position, &v, i*sizeof(int32_t), sizeof(v));
}
}
struct ggml_tensor * KQ_scale = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, 1);
ggml_allocr_alloc(allocr, KQ_scale);
if (!ggml_allocr_is_measure(allocr)) {
- ggml_set_f32(KQ_scale, 1.0f/sqrtf(float(n_embd)/n_head));
+ float s = 1.0f/sqrtf(float(n_embd)/n_head);
+ ggml_backend_tensor_set(KQ_scale, &s, 0, sizeof(s));
}
// wte + wpe
// [ 768, N]
cur = ggml_add(ctx0,
ggml_mul(ctx0,
- ggml_repeat(ctx0, model.layers[il].ln_1_g, cur),
- cur),
- ggml_repeat(ctx0, model.layers[il].ln_1_b, cur));
+ cur,
+ model.layers[il].ln_1_g),
+ model.layers[il].ln_1_b);
}
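
The ggml_repeat calls in this and the following hunks are no longer needed because ggml_add and ggml_mul broadcast a smaller second operand across the rows of the first, so the 1-D norm weights and biases can be passed directly. A condensed sketch of the affine step above, with the shapes spelled out (same tensor names as in the code):

    // cur: [n_embd, N], ln_1_g / ln_1_b: [n_embd]
    // the 1-D operand is broadcast over the N columns, which ggml_repeat used to materialize explicitly
    cur = ggml_add(ctx0, ggml_mul(ctx0, cur, model.layers[il].ln_1_g), model.layers[il].ln_1_b);
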
// attn
cur);
cur = ggml_add(ctx0,
- ggml_repeat(ctx0, model.layers[il].c_attn_attn_b, cur),
- cur);
+ cur,
+ model.layers[il].c_attn_attn_b);
}
// self-attention
cur);
cur = ggml_add(ctx0,
- ggml_repeat(ctx0, model.layers[il].c_attn_proj_b, cur),
- cur);
+ cur,
+ model.layers[il].c_attn_proj_b);
}
// add the input
// [ 768, N]
cur = ggml_add(ctx0,
ggml_mul(ctx0,
- ggml_repeat(ctx0, model.layers[il].ln_2_g, cur),
- cur),
- ggml_repeat(ctx0, model.layers[il].ln_2_b, cur));
+ cur,
+ model.layers[il].ln_2_g),
+ model.layers[il].ln_2_b);
}
// fully connected
cur);
cur = ggml_add(ctx0,
- ggml_repeat(ctx0, model.layers[il].c_mlp_fc_b, cur),
- cur);
+ cur,
+ model.layers[il].c_mlp_fc_b);
// GELU activation
// [3072, N]
cur);
cur = ggml_add(ctx0,
- ggml_repeat(ctx0, model.layers[il].c_mlp_proj_b, cur),
- cur);
+ cur,
+ model.layers[il].c_mlp_proj_b);
}
// input for next layer
// [ 768, N]
inpL = ggml_add(ctx0,
ggml_mul(ctx0,
- ggml_repeat(ctx0, model.ln_f_g, inpL),
- inpL),
- ggml_repeat(ctx0, model.ln_f_b, inpL));
+ inpL,
+ model.ln_f_g),
+ model.ln_f_b);
}
// inpL = WTE * inpL
ggml_allocr_alloc_graph(allocr, gf);
// run the computation
- struct ggml_cplan plan = ggml_graph_plan(gf, n_threads);
- static std::vector<uint8_t> work_buffer;
- work_buffer.resize(plan.work_size);
- plan.work_data = work_buffer.data();
- ggml_graph_compute(gf, &plan);
+ if (ggml_backend_is_cpu(model.backend)) {
+ ggml_backend_cpu_set_n_threads(model.backend, n_threads);
+ }
+#ifdef GGML_USE_METAL
+ if (ggml_backend_is_metal(model.backend)) {
+ ggml_backend_metal_set_n_cb(model.backend, n_threads);
+ }
+#endif
+ ggml_backend_graph_compute(model.backend, gf);
//if (n_past%100 == 0) {
// ggml_graph_print (&gf);
struct ggml_tensor * inpL = gf->nodes[gf->n_nodes - 1];
//embd_w.resize(n_vocab*N);
- //memcpy(embd_w.data(), ggml_get_data(inpL), sizeof(float)*n_vocab*N);
+ //ggml_backend_tensor_get(inpL, embd_w.data(), 0, sizeof(float)*n_vocab*N);
// return result just for the last token
embd_w.resize(n_vocab);
- memcpy(embd_w.data(), (float *) ggml_get_data(inpL) + (n_vocab*(N-1)), sizeof(float)*n_vocab);
+ ggml_backend_tensor_get(inpL, embd_w.data(), (n_vocab*(N-1))*sizeof(float), sizeof(float)*n_vocab);
return true;
}
{
const int64_t t_start_us = ggml_time_us();
- if (!gpt2_model_load(params.model, model, vocab)) {
+ if (!gpt2_model_load(params.model, model, vocab, params.n_gpu_layers)) {
fprintf(stderr, "%s: failed to load model from '%s'\n", __func__, params.model.c_str());
return 1;
}
}
// keep this buffer alive while evaluating the model
- std::vector<uint8_t> compute_buffer;
+ ggml_backend_buffer_t buf_compute;
struct ggml_allocr * allocr = NULL;
// allocate the compute buffer
{
- allocr = ggml_allocr_new_measure(GGML_MEM_ALIGN);
+ // alignment required by the backend
+ size_t align = ggml_backend_get_alignment(model.backend);
+ allocr = ggml_allocr_new_measure(align);
// create the worst case graph for memory usage estimation
int n_tokens = std::min(model.hparams.n_ctx, params.n_batch);
struct ggml_cgraph * gf = gpt2_graph(model, allocr, n_past, std::vector<gpt_vocab::id>(n_tokens, 0));
// compute the required memory
- size_t mem_size = ggml_allocr_alloc_graph(allocr, gf) + GGML_MEM_ALIGN;
+ size_t mem_size = ggml_allocr_alloc_graph(allocr, gf);
// recreate the allocator with the required memory
ggml_allocr_free(allocr);
- compute_buffer.resize(mem_size);
- allocr = ggml_allocr_new(compute_buffer.data(), mem_size, GGML_MEM_ALIGN);
+ buf_compute = ggml_backend_alloc_buffer(model.backend, mem_size);
+ allocr = ggml_allocr_new_from_buffer(buf_compute);
fprintf(stderr, "%s: compute buffer size: %.2f MB\n", __func__, mem_size/1024.0/1024.0);
}
ggml_free(model.ctx);
+ ggml_backend_buffer_free(model.buffer_w);
+ ggml_backend_buffer_free(model.buffer_kv);
+ ggml_backend_buffer_free(buf_compute);
+ ggml_backend_free(model.backend);
+
return 0;
}
extern "C" {
#endif
+struct ggml_backend_buffer;
GGML_API struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment);
GGML_API struct ggml_allocr * ggml_allocr_new_measure(size_t alignment);
+GGML_API struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer);
// tell the allocator to parse nodes following the order described in the list
// you should call this if your graph is optimized to execute out-of-order
GGML_API void ggml_allocr_set_parse_seq(struct ggml_allocr * alloc, const int * list, int n);
-GGML_API void ggml_allocr_free(struct ggml_allocr * alloc);
-GGML_API bool ggml_allocr_is_measure(struct ggml_allocr * alloc);
-GGML_API void ggml_allocr_reset(struct ggml_allocr * alloc);
-GGML_API void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor);
+GGML_API void ggml_allocr_free (struct ggml_allocr * alloc);
+GGML_API bool ggml_allocr_is_measure (struct ggml_allocr * alloc);
+GGML_API void ggml_allocr_reset (struct ggml_allocr * alloc);
+GGML_API void ggml_allocr_alloc (struct ggml_allocr * alloc, struct ggml_tensor * tensor);
GGML_API size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph);
-GGML_API size_t ggml_allocr_max_size(struct ggml_allocr * alloc);
+GGML_API size_t ggml_allocr_max_size (struct ggml_allocr * alloc);
+GGML_API size_t ggml_allocr_alloc_graph_n(
+ struct ggml_allocr * alloc,
+ struct ggml_cgraph ** graphs, int n_graphs,
+ struct ggml_tensor *** inputs, struct ggml_tensor *** outputs);
#ifdef __cplusplus
}
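
ggml_allocr_new_from_buffer drives the existing allocator from a backend buffer instead of raw host memory, and combines with the measure allocator to size compute buffers. A condensed sketch of the measure-then-allocate flow used in the gpt-2 example; backend is assumed to be initialized and build_graph() is a hypothetical helper that builds the worst-case graph:

    size_t align = ggml_backend_get_alignment(backend);
    struct ggml_allocr * allocr = ggml_allocr_new_measure(align);

    struct ggml_cgraph * gf = build_graph(allocr);          // worst-case graph
    size_t mem_size = ggml_allocr_alloc_graph(allocr, gf);  // measure only, no data is written
    ggml_allocr_free(allocr);

    ggml_backend_buffer_t buf_compute = ggml_backend_alloc_buffer(backend, mem_size);
    allocr = ggml_allocr_new_from_buffer(buf_compute);      // real allocations land in the backend buffer
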
--- /dev/null
+#pragma once
+
+#include "ggml.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+ struct ggml_backend;
+ struct ggml_backend_buffer;
+
+ // type-erased backend-specific types / wrappers
+ typedef void * ggml_backend_context_t;
+ typedef void * ggml_backend_graph_plan_t;
+ typedef void * ggml_backend_buffer_context_t;
+
+ // avoid accessing internals of these types
+ typedef struct ggml_backend * ggml_backend_t;
+ typedef struct ggml_backend_buffer * ggml_backend_buffer_t;
+
+ //
+ // backend buffer
+ //
+
+ struct ggml_backend_buffer_i {
+ void (*free_buffer) (ggml_backend_buffer_t buffer);
+ void * (*get_base) (ggml_backend_buffer_t buffer); // get base pointer
+ size_t (*get_alloc_size)(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-allocation callback
+ void (*init_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // post-allocation callback
+ void (*free_tensor) (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor); // pre-free callback
+ };
+
+ // TODO: hide behind API
+ struct ggml_backend_buffer {
+ struct ggml_backend_buffer_i interface;
+
+ ggml_backend_t backend;
+ ggml_backend_buffer_context_t context;
+
+ size_t size;
+ };
+
+ // backend buffer functions
+ GGML_API ggml_backend_buffer_t ggml_backend_buffer_init(
+ struct ggml_backend * backend,
+ struct ggml_backend_buffer_i interface,
+ ggml_backend_buffer_context_t context,
+ size_t size);
+
+ GGML_API void ggml_backend_buffer_free (ggml_backend_buffer_t buffer);
+ GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+ GGML_API void * ggml_backend_buffer_get_base (ggml_backend_buffer_t buffer);
+ GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
+ GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+ GGML_API void ggml_backend_buffer_free_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
+
+ //
+ // backend
+ //
+
+ struct ggml_backend_i {
+ const char * (*get_name)(ggml_backend_t backend);
+
+ void (*free)(ggml_backend_t backend);
+
+ // buffer allocation
+ ggml_backend_buffer_t (*alloc_buffer)(ggml_backend_t backend, size_t size);
+
+ // get buffer alignment
+ size_t (*get_alignment)(ggml_backend_t backend);
+
+ // tensor data access
+ // these functions can be asynchronous; helper functions are provided for synchronous access that automatically call synchronize
+ void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+ void (*synchronize) (ggml_backend_t backend);
+
+ // (optional) copy tensor between different backends, allowing for single-copy transfers
+ void (*cpy_tensor_from)(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+ void (*cpy_tensor_to) (ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst);
+
+ // compute graph with a plan
+ ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+ void (*graph_plan_compute)(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+
+ // compute graph without a plan
+ void (*graph_compute)(ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+ // check if the backend supports an operation
+ bool (*supports_op)(ggml_backend_t backend, const struct ggml_tensor * op);
+ };
+
+ // TODO: hide behind API
+ struct ggml_backend {
+ struct ggml_backend_i interface;
+
+ ggml_backend_context_t context;
+ };
+
+ // backend helper functions
+ GGML_API ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor);
+
+ GGML_API const char * ggml_backend_name(ggml_backend_t backend);
+ GGML_API void ggml_backend_free(ggml_backend_t backend);
+
+ GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
+
+ GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
+
+ GGML_API void ggml_backend_tensor_set_async( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ GGML_API void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+
+ GGML_API void ggml_backend_tensor_set( struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
+ GGML_API void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
+
+ GGML_API void ggml_backend_synchronize(ggml_backend_t backend);
+
+ GGML_API ggml_backend_graph_plan_t ggml_backend_graph_plan_create (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+
+ GGML_API void ggml_backend_graph_plan_free (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+ GGML_API void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan);
+ GGML_API void ggml_backend_graph_compute (ggml_backend_t backend, struct ggml_cgraph * cgraph);
+ GGML_API bool ggml_backend_supports_op (ggml_backend_t backend, const struct ggml_tensor * op);
+
+ // tensor copy between different backends
+ GGML_API void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst);
+
+ //
+ // CPU backend
+ //
+
+ GGML_API ggml_backend_t ggml_backend_cpu_init(void);
+
+ GGML_API bool ggml_backend_is_cpu(ggml_backend_t backend);
+
+ GGML_API void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads);
+
+ GGML_API ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size);
+
+#ifdef __cplusplus
+}
+#endif
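
A minimal usage sketch of this interface with the CPU backend; t and out stand for tensors that were allocated in a backend buffer and gf for a graph built over them (placeholder names, see the gpt-2 example for the full flow):

    ggml_backend_t backend = ggml_backend_cpu_init();
    ggml_backend_cpu_set_n_threads(backend, 4);

    // upload inputs, run the graph, download the result - the same calls work for device buffers
    float inp[8] = {0};
    ggml_backend_tensor_set(t, inp, 0, sizeof(inp));
    ggml_backend_graph_compute(backend, gf);
    float res[8];
    ggml_backend_tensor_get(out, res, 0, sizeof(res));

    ggml_backend_free(backend);
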
GGML_TYPE_COUNT,
};
- enum ggml_backend {
+ enum ggml_backend_type {
GGML_BACKEND_CPU = 0,
GGML_BACKEND_GPU = 10,
GGML_BACKEND_GPU_SPLIT = 20,
// n-dimensional tensor
struct ggml_tensor {
- enum ggml_type type;
- enum ggml_backend backend;
+ enum ggml_type type;
+ enum ggml_backend_type backend;
+
+ struct ggml_backend_buffer * buffer;
int n_dims;
int64_t ne[GGML_MAX_DIMS]; // number of elements
void * extra; // extra things e.g. for ggml-cuda.cu
- char padding[4];
+ char padding[12];
};
static const size_t GGML_TENSOR_SIZE = sizeof(struct ggml_tensor);
set(GGML_EXTRA_LIBS ${GGML_EXTRA_LIBS} CUDA::cudart CUDA::cublas CUDA::cublasLt)
endif()
+ if (CMAKE_BUILD_TYPE MATCHES Debug)
+ set(CMAKE_CUDA_FLAGS "${CMAKE_CUDA_FLAGS} -lineinfo")
+ endif()
else()
message(WARNING "cuBLAS not found")
endif()
set(GGML_METAL_SOURCES ggml-metal.m ggml-metal.h)
add_compile_definitions(GGML_USE_METAL)
- add_compile_definitions(GGML_METAL_NDEBUG)
+ #add_compile_definitions(GGML_METAL_NDEBUG)
# get full path to the file
#add_compile_definitions(GGML_METAL_DIR_KERNELS="${CMAKE_CURRENT_SOURCE_DIR}/")
add_library(${TARGET}
ggml.c
ggml-alloc.c
+ ggml-backend.c
../include/ggml/ggml.h
../include/ggml/ggml-alloc.h
+ ../include/ggml/ggml-backend.h
${GGML_CUDA_SOURCES}
${GGML_OPENCL_SOURCES}
${GGML_METAL_SOURCES}
if (GGML_CUDA_SOURCES)
message(STATUS "GGML CUDA sources found, configuring CUDA architecture")
- set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "52;61")
+ set_property(TARGET ggml PROPERTY CUDA_ARCHITECTURES "52;61;70")
set_property(TARGET ggml PROPERTY CUDA_SELECT_NVCC_ARCH_FLAGS "Auto")
if (NOT MSVC)
target_link_libraries(ggml PUBLIC stdc++)
#include "ggml-alloc.h"
+#include "ggml-backend.h"
#include "ggml.h"
#include <assert.h>
#include <stdarg.h>
#include <stdlib.h>
#include <string.h>
-#ifdef __has_include
- #if __has_include(<unistd.h>)
- #include <unistd.h>
- #if defined(_POSIX_MAPPED_FILES)
- #include <sys/types.h>
- #include <sys/mman.h>
- #endif
- #endif
-#endif
-
-#if defined(_WIN32)
- #define WIN32_LEAN_AND_MEAN
- #ifndef NOMINMAX
- #define NOMINMAX
- #endif
- #include <windows.h>
- #include <memoryapi.h>
-#endif
-
#define UNUSED(x) (void)(x)
#define MAX(a, b) ((a) > (b) ? (a) : (b))
#define MAX_FREE_BLOCKS 256
struct ggml_allocr {
+ struct ggml_backend_buffer * buffer;
+ bool buffer_owned;
void * data;
- size_t size;
size_t alignment;
int n_free_blocks;
struct free_block free_blocks[MAX_FREE_BLOCKS];
}
#endif
-static size_t ggml_allocr_get_alloc_size(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
- return ggml_nbytes(tensor);
-
- UNUSED(alloc);
-}
-
// check if a tensor is allocated by this buffer
static bool ggml_allocr_is_own(struct ggml_allocr * alloc, const struct ggml_tensor * tensor) {
- void * ptr = tensor->data;
- return ptr >= alloc->data && (char *)ptr < (char *)alloc->data + alloc->max_size;
+ return tensor->buffer == alloc->buffer;
}
static bool ggml_is_view(struct ggml_tensor * t) {
}
void ggml_allocr_alloc(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
-#ifdef GGML_ALLOCATOR_DEBUG
GGML_ASSERT(!ggml_is_view(tensor)); // views generally get data pointer from one of their sources
GGML_ASSERT(tensor->data == NULL); // avoid allocating tensor which already has memory allocated
-#endif
- size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+
+ size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
size = aligned_offset(NULL, size, alloc->alignment);
AT_PRINTF("%s: allocating %s (%zu bytes) - ", __func__, tensor->name, size);
tensor->data = addr;
AT_PRINTF("%s: allocated data at %p\n", __func__, tensor->data);
+ tensor->buffer = alloc->buffer;
+ ggml_backend_buffer_init_tensor(alloc->buffer, tensor);
#ifdef GGML_ALLOCATOR_DEBUG
add_allocated_tensor(alloc, tensor);
// this is a very naive implementation, but for our case the number of free blocks should be very small
static void ggml_allocr_free_tensor(struct ggml_allocr * alloc, struct ggml_tensor * tensor) {
- void * ptr = tensor->data;
-
if (ggml_allocr_is_own(alloc, tensor) == false) {
// the tensor was not allocated in this buffer
// this can happen because the graph allocator will try to free weights and other tensors from different buffers
// the easiest way to deal with this is just to ignore it
+ AT_PRINTF("ignoring %s (their buffer: %p, our buffer: %p)\n", tensor->name, tensor->buffer, alloc->buffer);
return;
}
- size_t size = ggml_allocr_get_alloc_size(alloc, tensor);
+ size_t size = ggml_backend_buffer_get_alloc_size(alloc->buffer, tensor);
size = aligned_offset(NULL, size, alloc->alignment);
- AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, ptr, size, alloc->n_free_blocks);
- AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->size = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->size, (char*)alloc->data + alloc->max_size);
+ AT_PRINTF("%s: freeing %s at %p (%zu bytes) - n_free_blocks = %d\n", __func__, tensor->name, tensor->data, size, alloc->n_free_blocks);
+ AT_PRINTF("%s: alloc->data = %p alloc->data+alloc->max_size = %p\n", __func__, alloc->data, (char*)alloc->data + alloc->max_size);
+ ggml_backend_buffer_free_tensor(alloc->buffer, tensor);
+
#ifdef GGML_ALLOCATOR_DEBUG
remove_allocated_tensor(alloc, tensor);
#endif
+ void * ptr = tensor->data;
+
// see if we can merge with an existing block
for (int i = 0; i < alloc->n_free_blocks; i++) {
struct free_block * block = &alloc->free_blocks[i];
alloc->n_free_blocks = 1;
size_t align_offset = aligned_offset(alloc->data, 0, alloc->alignment);
alloc->free_blocks[0].addr = (char *)alloc->data + align_offset;
- alloc->free_blocks[0].size = alloc->size - align_offset;
+ alloc->free_blocks[0].size = ggml_backend_buffer_get_size(alloc->buffer) - align_offset;
}
struct ggml_allocr * ggml_allocr_new(void * data, size_t size, size_t alignment) {
+ struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, data, size);
+
+ struct ggml_allocr * alloc = ggml_allocr_new_from_buffer(buffer);
+ alloc->alignment = alignment;
+ alloc->buffer_owned = true;
+
+ return alloc;
+}
+
+struct ggml_allocr * ggml_allocr_new_from_buffer(struct ggml_backend_buffer * buffer) {
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
*alloc = (struct ggml_allocr){
- /*.data = */ data,
- /*.size = */ size,
- /*.alignment = */ alignment,
+ /*.buffer = */ buffer,
+ /*.buffer_owned = */ false,
+ /*.data = */ ggml_backend_buffer_get_base(buffer),
+ /*.alignment = */ ggml_backend_buffer_get_alignment(buffer),
/*.n_free_blocks = */ 0,
/*.free_blocks = */ {{0}},
/*.hash_table = */ {{0}},
return alloc;
}
-// OS specific functions to allocate and free uncommitted virtual memory
-static void * alloc_vmem(size_t size) {
-#if defined(_WIN32)
- return VirtualAlloc(NULL, size, MEM_RESERVE, PAGE_NOACCESS);
-#elif defined(_POSIX_MAPPED_FILES)
- void * ptr = mmap(NULL, size, PROT_NONE, MAP_PRIVATE | MAP_ANON, -1, 0);
- if (ptr == MAP_FAILED) {
- return NULL;
- }
- return ptr;
-#else
- // use a fixed address for other platforms
- uintptr_t base_addr = (uintptr_t)-size - 0x100;
- return (void *)base_addr;
-#endif
-}
-
-static void free_vmem(void * base_addr, size_t size) {
-#if defined(_WIN32)
- VirtualFree(base_addr, 0, MEM_RELEASE);
- UNUSED(size);
-#elif defined(_POSIX_MAPPED_FILES)
- munmap(base_addr, size);
-#else
- // nothing to do
- UNUSED(base_addr);
- UNUSED(size);
-#endif
-}
-
-// allocate uncommitted virtual memory to measure the size of the graph
-static void alloc_measure_vmem(void ** base_addr, size_t * size) {
- // 128GB for 64-bit, 1GB for 32-bit
- *size = sizeof(void *) == 4 ? 1ULL<<30 : 1ULL<<37;
- do {
- *base_addr = alloc_vmem(*size);
- if (*base_addr != NULL) {
- AT_PRINTF("allocated %.2f GB of virtual memory for measure buffer at %p\n", *size / 1024.0 / 1024.0 / 1024.0, *base_addr);
- return;
- }
- // try again with half the size
- *size /= 2;
- } while (*size > 0);
-
- GGML_ASSERT(!"failed to allocate virtual memory for measure buffer");
-}
-
-static void free_measure_vmem(void * base_addr, size_t size) {
- free_vmem(base_addr, size);
-}
-
struct ggml_allocr * ggml_allocr_new_measure(size_t alignment) {
struct ggml_allocr * alloc = (struct ggml_allocr *)malloc(sizeof(struct ggml_allocr) /* + n_free_blocks * sizeof(struct free_block) */);
- void * base_addr;
- size_t size;
-
- alloc_measure_vmem(&base_addr, &size);
+ struct ggml_backend_buffer * buffer = ggml_backend_cpu_buffer_from_ptr(NULL, (void *)0x1000, (size_t)-0x1001);
*alloc = (struct ggml_allocr){
- /*.data = */ base_addr,
- /*.size = */ size,
+ /*.buffer = */ buffer,
+ /*.buffer_owned = */ true,
+ /*.data = */ ggml_backend_buffer_get_base(buffer),
/*.alignment = */ alignment,
/*.n_free_blocks = */ 0,
/*.free_blocks = */ {{0}},
}
void ggml_allocr_free(struct ggml_allocr * alloc) {
- if (alloc->measure) {
- free_measure_vmem(alloc->data, alloc->size);
+ if (alloc->buffer_owned) {
+ ggml_backend_buffer_free(alloc->buffer);
}
free(alloc);
}
case GGML_OP_ROPE:
case GGML_OP_RMS_NORM:
case GGML_OP_SOFT_MAX:
- case GGML_OP_CONT:
return true;
default:
}
}
+static void init_view(struct ggml_allocr * alloc, struct ggml_tensor * view) {
+ assert(view->view_src != NULL && view->view_src->data != NULL);
+ view->backend = view->view_src->backend;
+ view->buffer = view->view_src->buffer;
+ view->data = (char *)view->view_src->data + view->view_offs;
+
+ // FIXME: the view should be initialized by the owning buffer, but currently this breaks the CUDA backend
+ // due to the ggml_tensor_extra_gpu ring buffer overwriting the KV cache extras
+ assert(ggml_allocr_is_measure(alloc) || view->buffer->backend == alloc->buffer->backend);
+ ggml_backend_buffer_init_tensor(alloc->buffer, view);
+}
+
static void allocate_node(struct ggml_allocr * alloc, struct ggml_tensor * node) {
struct hash_node * ht = alloc->hash_table;
if (node->data == NULL) {
if (ggml_is_view(node)) {
- assert(node->view_src->data != NULL);
- node->data = (char *)node->view_src->data + node->view_offs;
+ init_view(alloc, node);
} else {
// see if we can reuse a parent's buffer (inplace)
if (ggml_op_can_inplace(node->op)) {
// adding a view_src pointer to the tensor would solve this and simplify the code dealing with views
// for now, we only reuse the parent's data if the offset is zero (view_src->data == parent->data)
AT_PRINTF("reusing view parent %s (%s) for %s\n", parent->name, view_src->name, node->name);
- node->data = parent->data;
+ node->view_src = parent;
+ init_view(alloc, node);
return;
}
}
else {
AT_PRINTF("reusing parent %s for %s\n", parent->name, node->name);
- node->data = parent->data;
+ node->view_src = parent;
+ init_view(alloc, node);
return;
}
}
}
}
-static size_t ggml_allocr_alloc_graph_tensors_n(
+size_t ggml_allocr_alloc_graph_n(
struct ggml_allocr * alloc,
struct ggml_cgraph ** graphs, int n_graphs,
struct ggml_tensor *** inputs, struct ggml_tensor *** outputs) {
if (ggml_is_view(node)) {
struct ggml_tensor * view_src = node->view_src;
hash_get(ht, view_src)->n_views += 1;
+ if (node->buffer == NULL && node->data != NULL) {
+ // view of a pre-allocated tensor, didn't call init_view() yet
+ init_view(alloc, node);
+ }
}
for (int j = 0; j < GGML_MAX_SRC; j++) {
break;
}
hash_get(ht, parent)->n_children += 1;
+ if (ggml_is_view(parent) && parent->buffer == NULL && parent->data != NULL) {
+ init_view(alloc, parent);
+ }
}
}
}
}
size_t ggml_allocr_alloc_graph(struct ggml_allocr * alloc, struct ggml_cgraph * graph) {
- return ggml_allocr_alloc_graph_tensors_n(alloc, &graph, 1, NULL, NULL);
+ return ggml_allocr_alloc_graph_n(alloc, &graph, 1, NULL, NULL);
}
size_t ggml_allocr_max_size(struct ggml_allocr * alloc) {
--- /dev/null
+#include "ggml-backend.h"
+#include "ggml-alloc.h"
+
+#include <assert.h>
+#include <stdarg.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+
+#define UNUSED GGML_UNUSED
+
+#define MAX(a, b) ((a) > (b) ? (a) : (b))
+
+// backend buffer
+
+ggml_backend_buffer_t ggml_backend_buffer_init(
+ struct ggml_backend * backend,
+ struct ggml_backend_buffer_i interface,
+ ggml_backend_buffer_context_t context,
+ size_t size) {
+ ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
+
+ GGML_ASSERT(interface.get_base != NULL);
+
+ (*buffer) = (struct ggml_backend_buffer) {
+ /* .interface = */ interface,
+ /* .backend = */ backend,
+ /* .context = */ context,
+ /* .size = */ size,
+ };
+
+ return buffer;
+}
+
+void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
+ if (buffer->interface.free_buffer != NULL) {
+ buffer->interface.free_buffer(buffer);
+ }
+ free(buffer);
+}
+
+size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
+ return ggml_backend_get_alignment(buffer->backend);
+}
+
+void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
+ return buffer->interface.get_base(buffer);
+}
+
+size_t ggml_backend_buffer_get_size(ggml_backend_buffer_t buffer) {
+ return buffer->size;
+}
+
+size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+ if (buffer->interface.get_alloc_size) {
+ return buffer->interface.get_alloc_size(buffer, tensor);
+ }
+ return ggml_nbytes(tensor);
+}
+
+void ggml_backend_buffer_init_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+ if (buffer->interface.init_tensor) {
+ buffer->interface.init_tensor(buffer, tensor);
+ }
+}
+
+void ggml_backend_buffer_free_tensor(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
+ if (buffer->interface.free_tensor) {
+ buffer->interface.free_tensor(buffer, tensor);
+ }
+}
+
+// backend
+
+ggml_backend_t ggml_get_backend(const struct ggml_tensor * tensor) {
+ return tensor->buffer->backend;
+}
+
+const char * ggml_backend_name(ggml_backend_t backend) {
+ return backend->interface.get_name(backend);
+}
+
+void ggml_backend_free(ggml_backend_t backend) {
+ backend->interface.free(backend);
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
+ return backend->interface.alloc_buffer(backend, size);
+}
+
+size_t ggml_backend_get_alignment(ggml_backend_t backend) {
+ return backend->interface.get_alignment(backend);
+}
+
+void ggml_backend_tensor_set_async(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ ggml_get_backend(tensor)->interface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
+}
+
+void ggml_backend_tensor_get_async(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ ggml_get_backend(tensor)->interface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
+}
+
+void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ ggml_get_backend(tensor)->interface.set_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
+ ggml_get_backend(tensor)->interface.synchronize(ggml_get_backend(tensor));
+}
+
+void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ ggml_get_backend(tensor)->interface.get_tensor_async(ggml_get_backend(tensor), tensor, data, offset, size);
+ ggml_get_backend(tensor)->interface.synchronize(ggml_get_backend(tensor));
+}
+
+void ggml_backend_synchronize(ggml_backend_t backend) {
+ backend->interface.synchronize(backend);
+}
+
+ggml_backend_graph_plan_t ggml_backend_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+ return backend->interface.graph_plan_create(backend, cgraph);
+}
+
+void ggml_backend_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ backend->interface.graph_plan_free(backend, plan);
+}
+
+void ggml_backend_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ backend->interface.graph_plan_compute(backend, plan);
+}
+
+void ggml_backend_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+ backend->interface.graph_compute(backend, cgraph);
+}
+
+bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+ return backend->interface.supports_op(backend, op);
+}
+
+// backend copy
+
+static bool ggml_are_same_layout(const struct ggml_tensor * a, const struct ggml_tensor * b) {
+ if (a->type != b->type) {
+ return false;
+ }
+ for (int i = 0; i < GGML_MAX_DIMS; i++) {
+ if (a->ne[i] != b->ne[i]) {
+ return false;
+ }
+ if (a->nb[i] != b->nb[i]) {
+ return false;
+ }
+ }
+ return true;
+}
+
+void ggml_backend_tensor_copy(struct ggml_tensor * src, struct ggml_tensor * dst) {
+ //printf("src: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", src->name, (int)src->ne[0], (int)src->ne[1], (int)src->ne[2], (int)src->ne[3], (int)src->nb[0], (int)src->nb[1], (int)src->nb[2], (int)src->nb[3]);
+ //printf("dst: %s ne: [%d %d %d %d] nb: [%d %d %d %d]\n", dst->name, (int)dst->ne[0], (int)dst->ne[1], (int)dst->ne[2], (int)dst->ne[3], (int)dst->nb[0], (int)dst->nb[1], (int)dst->nb[2], (int)dst->nb[3]);
+ GGML_ASSERT(ggml_are_same_layout(src, dst) && "cannot copy tensors with different layouts");
+
+ // printf("cpy tensor %s from %s to %s (%lu bytes)\n", src->name, ggml_backend_name(src->backend), ggml_backend_name(dst->backend), ggml_nbytes(src));
+
+ if (src == dst) {
+ return;
+ }
+
+ // TODO: allow backends to support copy to/from same backend
+
+ if (ggml_get_backend(dst)->interface.cpy_tensor_from != NULL) {
+ ggml_get_backend(dst)->interface.cpy_tensor_from(ggml_get_backend(dst), src, dst);
+ } else if (ggml_get_backend(src)->interface.cpy_tensor_to != NULL) {
+ ggml_get_backend(src)->interface.cpy_tensor_to(ggml_get_backend(src), src, dst);
+ } else {
+ // shouldn't be hit when copying from/to CPU
+ #ifndef NDEBUG
+ fprintf(stderr, "ggml_backend_tensor_copy: neither cpy_tensor_from nor cpy_tensor_to are implemented for backends %s and %s, falling back to get/set\n", ggml_backend_name(src->buffer->backend), ggml_backend_name(dst->buffer->backend));
+ #endif
+ size_t nbytes = ggml_nbytes(src);
+ void * data = malloc(nbytes);
+ ggml_backend_tensor_get(src, data, 0, nbytes);
+ ggml_backend_tensor_set(dst, data, 0, nbytes);
+ free(data);
+ }
+}
+
+// backend CPU
+
+struct ggml_backend_cpu_context {
+ int n_threads;
+ void * work_data;
+ size_t work_size;
+};
+
+static const char * ggml_backend_cpu_name(ggml_backend_t backend) {
+ return "CPU";
+
+ UNUSED(backend);
+}
+
+static void ggml_backend_cpu_free(ggml_backend_t backend) {
+ struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+ free(cpu_ctx->work_data);
+ free(cpu_ctx);
+ free(backend);
+}
+
+static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
+ return (void *)buffer->context;
+}
+
+static void ggml_backend_cpu_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ free(buffer->context);
+ UNUSED(buffer);
+}
+
+static struct ggml_backend_buffer_i cpu_backend_buffer_i = {
+ /* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
+ /* .get_base = */ ggml_backend_cpu_buffer_get_base,
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+ /* .init_tensor = */ NULL, // no initialization required
+ /* .free_tensor = */ NULL, // no cleanup required
+};
+
+// for buffers from ptr, free is not called
+static struct ggml_backend_buffer_i cpu_backend_buffer_i_from_ptr = {
+ /* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
+ /* .get_base = */ ggml_backend_cpu_buffer_get_base,
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+ /* .init_tensor = */ NULL,
+ /* .free_tensor = */ NULL,
+};
+
+static const size_t TENSOR_ALIGNMENT = 64; // should be enough for AVX 512
+
+static ggml_backend_buffer_t ggml_backend_cpu_alloc_buffer(ggml_backend_t backend, size_t size) {
+ size += TENSOR_ALIGNMENT; // malloc may return an address that is not aligned
+ void * data = malloc(size); // TODO: maybe use GGML_ALIGNED_MALLOC?
+
+ return ggml_backend_buffer_init(backend, cpu_backend_buffer_i, data, size);
+}
+
+static size_t ggml_backend_cpu_get_alignment(ggml_backend_t backend) {
+ return TENSOR_ALIGNMENT;
+ UNUSED(backend);
+}
+
+static void ggml_backend_cpu_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+ memcpy((char *)tensor->data + offset, data, size);
+
+ UNUSED(backend);
+}
+
+static void ggml_backend_cpu_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+ memcpy(data, (const char *)tensor->data + offset, size);
+
+ UNUSED(backend);
+}
+
+static void ggml_backend_cpu_synchronize(ggml_backend_t backend) {
+ UNUSED(backend);
+}
+
+static void ggml_backend_cpu_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+ ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
+
+ UNUSED(backend);
+}
+
+static void ggml_backend_cpu_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+ // for a backend such as CUDA that can queue async calls, it is ok to do this asynchronously, but it may not be the case for other backends
+ ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
+
+ UNUSED(backend);
+}
+
+struct ggml_backend_plan_cpu {
+ struct ggml_cplan cplan;
+ struct ggml_cgraph cgraph;
+};
+
+static ggml_backend_graph_plan_t ggml_backend_cpu_graph_plan_create(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+ struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+
+ struct ggml_backend_plan_cpu * cpu_plan = malloc(sizeof(struct ggml_backend_plan_cpu));
+
+ cpu_plan->cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+ cpu_plan->cgraph = *cgraph;
+
+ if (cpu_plan->cplan.work_size > 0) {
+ cpu_plan->cplan.work_data = malloc(cpu_plan->cplan.work_size);
+ }
+
+ return cpu_plan;
+}
+
+static void ggml_backend_cpu_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
+
+ free(cpu_plan->cplan.work_data);
+ free(cpu_plan);
+
+ UNUSED(backend);
+}
+
+static void ggml_backend_cpu_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ struct ggml_backend_plan_cpu * cpu_plan = (struct ggml_backend_plan_cpu *)plan;
+
+ ggml_graph_compute(&cpu_plan->cgraph, &cpu_plan->cplan);
+
+ UNUSED(backend);
+}
+
+static void ggml_backend_cpu_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+ struct ggml_backend_cpu_context * cpu_ctx = (struct ggml_backend_cpu_context *)backend->context;
+
+ struct ggml_cplan cplan = ggml_graph_plan(cgraph, cpu_ctx->n_threads);
+
+ if (cpu_ctx->work_size < cplan.work_size) {
+ // TODO: may be faster to free and use malloc to avoid the copy
+ cpu_ctx->work_data = realloc(cpu_ctx->work_data, cplan.work_size);
+ cpu_ctx->work_size = cplan.work_size;
+ }
+
+ cplan.work_data = cpu_ctx->work_data;
+
+ ggml_graph_compute(cgraph, &cplan);
+}
+
+static bool ggml_backend_cpu_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+ return true;
+ UNUSED(backend);
+ UNUSED(op);
+}
+
+static struct ggml_backend_i cpu_backend_i = {
+ /* .get_name = */ ggml_backend_cpu_name,
+ /* .free = */ ggml_backend_cpu_free,
+ /* .alloc_buffer = */ ggml_backend_cpu_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_get_alignment,
+ /* .set_tensor_async = */ ggml_backend_cpu_set_tensor_async,
+ /* .get_tensor_async = */ ggml_backend_cpu_get_tensor_async,
+ /* .synchronize = */ ggml_backend_cpu_synchronize,
+ /* .cpy_tensor_from = */ ggml_backend_cpu_cpy_tensor_from,
+ /* .cpy_tensor_to = */ ggml_backend_cpu_cpy_tensor_to,
+ /* .graph_plan_create = */ ggml_backend_cpu_graph_plan_create,
+ /* .graph_plan_free = */ ggml_backend_cpu_graph_plan_free,
+ /* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
+ /* .graph_compute = */ ggml_backend_cpu_graph_compute,
+ /* .supports_op = */ ggml_backend_cpu_supports_op,
+};
+
+ggml_backend_t ggml_backend_cpu_init(void) {
+ struct ggml_backend_cpu_context * ctx = malloc(sizeof(struct ggml_backend_cpu_context));
+
+ ctx->n_threads = GGML_DEFAULT_N_THREADS;
+ ctx->work_data = NULL;
+ ctx->work_size = 0;
+
+ ggml_backend_t cpu_backend = malloc(sizeof(struct ggml_backend));
+
+ *cpu_backend = (struct ggml_backend) {
+ /* .interface = */ cpu_backend_i,
+ /* .context = */ ctx
+ };
+ return cpu_backend;
+}
+
+bool ggml_backend_is_cpu(ggml_backend_t backend) {
+ return backend->interface.get_name == ggml_backend_cpu_name;
+}
+
+void ggml_backend_cpu_set_n_threads(ggml_backend_t backend_cpu, int n_threads) {
+ GGML_ASSERT(ggml_backend_is_cpu(backend_cpu));
+
+ struct ggml_backend_cpu_context * ctx = (struct ggml_backend_cpu_context *)backend_cpu->context;
+ ctx->n_threads = n_threads;
+}
+
+ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(ggml_backend_t backend_cpu, void * ptr, size_t size) {
+ return ggml_backend_buffer_init(backend_cpu, cpu_backend_buffer_i_from_ptr, ptr, size);
+}
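
The plan API is not exercised by the gpt-2 example; a minimal sketch of its intended use, assuming an initialized backend and a graph gf. For the CPU backend the plan owns its work buffer, so the same graph can be recomputed without re-planning:

    ggml_backend_graph_plan_t plan = ggml_backend_graph_plan_create(backend, gf);
    ggml_backend_graph_plan_compute(backend, plan);
    ggml_backend_graph_plan_compute(backend, plan); // re-run the same graph
    ggml_backend_graph_plan_free(backend, plan);
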
#define cudaMemcpyHostToDevice hipMemcpyHostToDevice
#define cudaMemcpyKind hipMemcpyKind
#define cudaMemset hipMemset
+#define cudaMemsetAsync hipMemsetAsync
#define cudaOccupancyMaxPotentialBlockSize hipOccupancyMaxPotentialBlockSize
#define cudaSetDevice hipSetDevice
#define cudaStreamCreateWithFlags hipStreamCreateWithFlags
#define CUDA_DIAG_MASK_INF_BLOCK_SIZE 32
#define CUDA_QUANTIZE_BLOCK_SIZE 256
#define CUDA_DEQUANTIZE_BLOCK_SIZE 256
+#define CUDA_GET_ROWS_BLOCK_SIZE 256
// dmmv = dequantize_mul_mat_vec
#ifndef GGML_CUDA_DMMV_X
reinterpret_cast<half&>(y[ib].ds.y) = sum;
}
+template<int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
+static __global__ void k_get_rows(const void * x, const int32_t * y, dst_t * dst, const int ncols) {
+ const int col = (blockIdx.x*blockDim.x + threadIdx.x)*2;
+ const int row = blockDim.y*blockIdx.y + threadIdx.y;
+
+ if (col >= ncols) {
+ return;
+ }
+
+ const int r = y[row];
+
+ // copy x[r*ncols + col] to dst[row*ncols + col]
+ const int xi = r*ncols + col;
+ const int di = row*ncols + col;
+
+ const int ib = xi/qk; // block index
+ const int iqs = (xi%qk)/qr; // quant index
+ const int iybs = di - di%qk; // y block start index
+ const int y_offset = qr == 1 ? 1 : qk/2;
+
+ // dequantize
+ dfloat2 v;
+ dequantize_kernel(x, ib, iqs, v);
+
+ dst[iybs + iqs + 0] = v.x;
+ dst[iybs + iqs + y_offset] = v.y;
+}
+
template <int qk, int qr, dequantize_kernel_t dequantize_kernel, typename dst_t>
static __global__ void dequantize_block(const void * __restrict__ vx, dst_t * __restrict__ y, const int k) {
const int i = blockDim.x*blockIdx.x + 2*threadIdx.x;
dst[i] = scale * x[i];
}
+
+template<int qk, int qr, dequantize_kernel_t dq>
+static void get_rows_cuda(const void * x, const int32_t * y, float * dst, const int nrows, const int ncols, cudaStream_t stream) {
+ const dim3 block_dims(CUDA_GET_ROWS_BLOCK_SIZE, 1, 1);
+ const int block_num_x = (ncols + 2*CUDA_GET_ROWS_BLOCK_SIZE - 1) / (2*CUDA_GET_ROWS_BLOCK_SIZE);
+ const dim3 block_nums(block_num_x, nrows, 1);
+ k_get_rows<qk, qr, dq><<<block_nums, block_dims, 0, stream>>>(x, y, dst, ncols);
+}
+
static void add_f32_cuda(const float * x, const float * y, float * dst, const int kx, const int ky, cudaStream_t stream) {
const int num_blocks = (kx + CUDA_ADD_BLOCK_SIZE - 1) / CUDA_ADD_BLOCK_SIZE;
add_f32<<<num_blocks, CUDA_ADD_BLOCK_SIZE, 0, stream>>>(x, y, dst, kx, ky);
} else if (src->backend == GGML_BACKEND_GPU || src->backend == GGML_BACKEND_GPU_SPLIT) {
GGML_ASSERT(src->backend != GGML_BACKEND_GPU_SPLIT || (i1_low == 0 && i1_high == src->ne[1]));
kind = cudaMemcpyDeviceToDevice;
- struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) src->extra;
int id;
CUDA_CHECK(cudaGetDevice(&id));
src_ptr = (char *) extra->data_device[id];
}
}
+static void ggml_cuda_op_repeat(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+ // guaranteed to be an integer due to the check in ggml_can_repeat
+ const int64_t ne0 = dst->ne[0];
+ const int64_t ne1 = dst->ne[1];
+ const int64_t ne2 = dst->ne[2];
+ const int64_t ne3 = dst->ne[3];
+
+ const int64_t ne00 = src0->ne[0];
+ const int64_t ne01 = src0->ne[1];
+ const int64_t ne02 = src0->ne[2];
+ const int64_t ne03 = src0->ne[3];
+
+ const size_t nb0 = dst->nb[0];
+ const size_t nb1 = dst->nb[1];
+ const size_t nb2 = dst->nb[2];
+ const size_t nb3 = dst->nb[3];
+
+ const size_t nb00 = src0->nb[0];
+ const size_t nb01 = src0->nb[1];
+ const size_t nb02 = src0->nb[2];
+ const size_t nb03 = src0->nb[3];
+
+ const int nr0 = (int)(ne0/ne00);
+ const int nr1 = (int)(ne1/ne01);
+ const int nr2 = (int)(ne2/ne02);
+ const int nr3 = (int)(ne3/ne03);
+
+ // TODO: support for transposed / permuted tensors
+ GGML_ASSERT(nb0 == sizeof(float));
+ GGML_ASSERT(nb00 == sizeof(float));
+
+ // TODO: very inefficient; implement in a kernel, or at least use fewer cudaMemcpyAsync calls for contiguous tensors
+ for (int i3 = 0; i3 < nr3; i3++) {
+ for (int k3 = 0; k3 < ne03; k3++) {
+ for (int i2 = 0; i2 < nr2; i2++) {
+ for (int k2 = 0; k2 < ne02; k2++) {
+ for (int i1 = 0; i1 < nr1; i1++) {
+ for (int k1 = 0; k1 < ne01; k1++) {
+ for (int i0 = 0; i0 < nr0; i0++) {
+ CUDA_CHECK(cudaMemcpyAsync(
+ (char *) dst_d + (i3*ne03 + k3)*nb3 + (i2*ne02 + k2)*nb2 + (i1*ne01 + k1)*nb1 + (i0*ne00)*nb0,
+ (const char *) src0_d + ( k3)*nb03 + ( k2)*nb02 + ( k1)*nb01,
+ ne00*nb0, cudaMemcpyDeviceToDevice, stream));
+ }
+ }
+ }
+ }
+ }
+ }
+ }
+
+ (void) src1;
+ (void) src1_d;
+}
+
+static void ggml_cuda_op_get_rows(
+ const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
+ const float * src0_d, const float * src1_d, float * dst_d, const cudaStream_t & stream) {
+
+ GGML_ASSERT(src1->type == GGML_TYPE_I32);
+ GGML_ASSERT(dst->type == GGML_TYPE_F32);
+ GGML_ASSERT(ggml_is_contiguous(src0));
+ GGML_ASSERT(ggml_is_contiguous(src1));
+ GGML_ASSERT(ggml_is_contiguous(dst));
+
+ const int ncols = src0->ne[0];
+ const int nrows = ggml_nelements(src1);
+
+ const int32_t * src1_i32 = (const int32_t *) src1_d;
+
+ switch (src0->type) {
+ case GGML_TYPE_F16:
+ get_rows_cuda<1, 1, convert_f16>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+ break;
+ case GGML_TYPE_F32:
+ get_rows_cuda<1, 1, convert_f32>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+ break;
+ case GGML_TYPE_Q4_0:
+ get_rows_cuda<QK4_0, QR4_0, dequantize_q4_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+ break;
+ case GGML_TYPE_Q4_1:
+ get_rows_cuda<QK4_1, QR4_1, dequantize_q4_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+ break;
+ case GGML_TYPE_Q5_0:
+ get_rows_cuda<QK5_0, QR5_0, dequantize_q5_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+ break;
+ case GGML_TYPE_Q5_1:
+ get_rows_cuda<QK5_1, QR5_1, dequantize_q5_1>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+ break;
+ case GGML_TYPE_Q8_0:
+ get_rows_cuda<QK8_0, QR8_0, dequantize_q8_0>(src0_d, src1_i32, dst_d, nrows, ncols, stream);
+ break;
+ default:
+ // TODO: k-quants
+ GGML_ASSERT(false);
+ break;
+ }
+}
+
inline void ggml_cuda_op_add(
const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst,
const float * src0_dd, const float * src1_dd, float * dst_dd, const cudaStream_t & main_stream) {
GGML_ASSERT(src1->type == GGML_TYPE_F32);
GGML_ASSERT( dst->type == GGML_TYPE_F32);
- const float scale = ((float *) src1->data)[0];
+ float scale;
+ // HACK: support for ggml backend interface
+ if (src1->backend == GGML_BACKEND_CPU) {
+ scale = ((float *) src1->data)[0];
+ } else {
+ // TODO: pass pointer to kernel instead of copying to host
+ CUDA_CHECK(cudaMemcpy(&scale, src1->data, sizeof(float), cudaMemcpyDeviceToHost));
+ }
scale_f32_cuda(src0_dd, dst_dd, scale, ggml_nelements(src0), main_stream);
CUDA_CHECK(cudaGetLastError());
GGML_ASSERT(!use_src1 || src1->backend != GGML_BACKEND_GPU_SPLIT);
GGML_ASSERT( dst->backend != GGML_BACKEND_GPU_SPLIT);
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
- struct ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+ ggml_tensor_extra_gpu * src1_extra = use_src1 ? (ggml_tensor_extra_gpu *) src1->extra : nullptr;
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
const bool src1_on_device = use_src1 && src1->backend == GGML_BACKEND_GPU;
const size_t q8_1_ts = sizeof(block_q8_1);
const size_t q8_1_bs = QK8_1;
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
const bool src0_on_device = src0->backend == GGML_BACKEND_GPU || src0->backend == GGML_BACKEND_GPU_SPLIT;
const bool src0_is_contiguous = ggml_is_contiguous(src0);
if (convert_src1_to_q8_1) {
src1_ddq[id] = (char *) ggml_cuda_pool_malloc(nrows1*src1_padded_col_size*q8_1_ts/q8_1_bs, &src1_asq[id]);
- if (split && src1_on_device && src1_is_contiguous) {
+ if (src1_on_device && src1_is_contiguous) {
quantize_row_q8_1_cuda(src1_ddf[id], src1_ddq[id], ne10, nrows1, src1_padded_col_size, stream);
CUDA_CHECK(cudaGetLastError());
}
GGML_ASSERT(false);
}
- if (convert_src1_to_q8_1 && src1->backend == GGML_BACKEND_CPU) {
+ if (convert_src1_to_q8_1 && (src1->backend == GGML_BACKEND_CPU || !src1_is_contiguous)) {
quantize_row_q8_1_cuda(src1_ddf_i, src1_ddq_i, ne10, src1_ncols, src1_padded_col_size, stream);
CUDA_CHECK(cudaGetLastError());
}
}
}
+static void ggml_cuda_repeat(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_repeat);
+}
+
+static void ggml_cuda_get_rows(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
+ ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_get_rows);
+}
+
static void ggml_cuda_add(const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
ggml_cuda_op_flatten(src0, src1, dst, ggml_cuda_op_add);
}
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
void * src0_ddq = src0_extra->data_device[g_main_device];
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
ggml_mul_mat_p021_f16_f32_cuda(src0_ddq, src1_ddf, dst_ddf, ne00, ne01, ne02, ne12, main_stream);
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
void * src0_ddq = src0_extra->data_device[g_main_device];
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
float * src1_ddf = (float *) src1_extra->data_device[g_main_device];
- struct ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
+ ggml_tensor_extra_gpu * dst_extra = (ggml_tensor_extra_gpu *) dst->extra;
float * dst_ddf = (float *) dst_extra->data_device[g_main_device];
const int64_t row_stride_x = nb01 / sizeof(half);
}
}
- if (all_on_device && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
+ if (all_on_device && src0->type == GGML_TYPE_F16 && ggml_is_permuted(src0) && ggml_is_permuted(src1) && src1->ne[1] == 1) {
ggml_cuda_mul_mat_vec_p021(src0, src1, dst);
} else if (all_on_device && !ggml_is_contiguous(src0) && ggml_is_contiguous(src1) && src1->ne[1] == 1) {
ggml_cuda_mul_mat_vec_nc(src0, src1, dst);
- }else if (src0->type == GGML_TYPE_F32) {
+ } else if (src0->type == GGML_TYPE_F32) {
ggml_cuda_op_mul_mat(src0, src1, dst, ggml_cuda_op_mul_mat_cublas, false);
} else if (ggml_is_quantized(src0->type) || src0->type == GGML_TYPE_F16) {
if (src1->ne[1] == 1 && src0->ne[0] % GGML_CUDA_DMMV_X == 0) {
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
cudaStream_t main_stream = g_cudaStreams[g_main_device][0];
- const struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
- const struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
+ const ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu *) src0->extra;
+ const ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu *) src1->extra;
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
char * src1_ddc = (char *) src1_extra->data_device[g_main_device];
const size_t nb1 = tensor->nb[1];
- ggml_backend backend = tensor->backend;
- struct ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
+ ggml_backend_type backend = tensor->backend;
+ ggml_tensor_extra_gpu * extra = new struct ggml_tensor_extra_gpu;
memset(extra, 0, sizeof(*extra));
for (int64_t id = 0; id < g_device_count; ++id) {
CUDA_CHECK(cudaMemset(buf + original_size, 0, size - original_size));
}
-
CUDA_CHECK(cudaMemcpy(buf, buf_host, original_size, cudaMemcpyHostToDevice));
extra->data_device[id] = buf;
delete extra;
}
-static struct ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
+static ggml_tensor_extra_gpu * g_temp_tensor_extras = nullptr;
static size_t g_temp_tensor_extra_index = 0;
-static struct ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+static ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
if (g_temp_tensor_extras == nullptr) {
g_temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
}
size_t alloc_index = g_temp_tensor_extra_index;
g_temp_tensor_extra_index = (g_temp_tensor_extra_index + 1) % GGML_MAX_NODES;
- struct ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
+ ggml_tensor_extra_gpu * extra = &g_temp_tensor_extras[alloc_index];
memset(extra, 0, sizeof(*extra));
return extra;
return;
}
- struct ggml_tensor_extra_gpu * extra;
+ ggml_tensor_extra_gpu * extra;
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
tensor->op == GGML_OP_VIEW ||
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
size_t offset = 0;
if (tensor->op == GGML_OP_VIEW) {
extra = ggml_cuda_alloc_temp_tensor_extra();
extra->data_device[g_main_device] = src0_ddc + offset;
} else if (tensor->op == GGML_OP_CPY) {
- struct ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
+ ggml_tensor_extra_gpu * src1_extra = (ggml_tensor_extra_gpu * ) tensor->src[1]->extra;
void * src1_ddv = src1_extra->data_device[g_main_device];
extra = ggml_cuda_alloc_temp_tensor_extra();
extra->data_device[g_main_device] = src1_ddv;
CUDA_CHECK(cudaMalloc(&g_scratch_buffer, g_scratch_size));
}
- struct ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
+ ggml_tensor_extra_gpu * extra = ggml_cuda_alloc_temp_tensor_extra();
const bool inplace = (tensor->src[0] != nullptr && tensor->src[0]->data == tensor->data) ||
tensor->op == GGML_OP_VIEW;
if (inplace && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT)) {
- struct ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
+ ggml_tensor_extra_gpu * src0_extra = (ggml_tensor_extra_gpu * ) tensor->src[0]->extra;
char * src0_ddc = (char *) src0_extra->data_device[g_main_device];
size_t view_offset = 0;
if (tensor->op == GGML_OP_VIEW) {
GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
GGML_ASSERT(ggml_is_contiguous(tensor));
- struct ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
+ ggml_tensor_extra_gpu * extra = (ggml_tensor_extra_gpu *) tensor->extra;
CUDA_CHECK(ggml_cuda_set_device(g_main_device));
CUDA_CHECK(cudaMemcpy(extra->data_device[g_main_device], tensor->data, ggml_nbytes(tensor), cudaMemcpyHostToDevice));
}
g_scratch_buffer = nullptr;
}
-bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor){
+bool ggml_cuda_compute_forward(struct ggml_compute_params * params, struct ggml_tensor * tensor) {
ggml_cuda_func_t func;
const bool any_on_device = tensor->backend == GGML_BACKEND_GPU
|| (tensor->src[0] != nullptr && (tensor->src[0]->backend == GGML_BACKEND_GPU || tensor->src[0]->backend == GGML_BACKEND_GPU_SPLIT))
|| (tensor->src[1] != nullptr && tensor->src[1]->backend == GGML_BACKEND_GPU);
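+ // ops with no data on the GPU are left to the CPU backend; matrix multiplication is the
+ // exception, since it may still be offloaded to the GPU even when its inputs live in host memory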
+ if (!any_on_device && tensor->op != GGML_OP_MUL_MAT) {
+ return false;
+ }
+
switch (tensor->op) {
+ case GGML_OP_REPEAT:
+ func = ggml_cuda_repeat;
+ break;
+ case GGML_OP_GET_ROWS:
+ func = ggml_cuda_get_rows;
+ break;
case GGML_OP_DUP:
- if (!any_on_device) {
- return false;
- }
func = ggml_cuda_dup;
break;
case GGML_OP_ADD:
- if (!any_on_device) {
- return false;
- }
func = ggml_cuda_add;
break;
case GGML_OP_MUL:
- if (!any_on_device) {
- return false;
- }
func = ggml_cuda_mul;
break;
case GGML_OP_UNARY:
switch (ggml_get_unary_op(tensor)) {
case GGML_UNARY_OP_GELU:
- if (!any_on_device) {
- return false;
- }
func = ggml_cuda_gelu;
break;
case GGML_UNARY_OP_SILU:
- if (!any_on_device) {
- return false;
- }
func = ggml_cuda_silu;
break;
default:
return false;
} break;
case GGML_OP_NORM:
- if (!any_on_device) {
- return false;
- }
func = ggml_cuda_norm;
break;
case GGML_OP_RMS_NORM:
- if (!any_on_device) {
- return false;
- }
func = ggml_cuda_rms_norm;
break;
case GGML_OP_MUL_MAT:
func = ggml_cuda_mul_mat;
break;
case GGML_OP_SCALE:
- if (!any_on_device) {
- return false;
- }
func = ggml_cuda_scale;
break;
case GGML_OP_CPY:
- if (!any_on_device) {
- return false;
- }
func = ggml_cuda_cpy;
break;
case GGML_OP_CONT:
- if (!any_on_device) {
- return false;
- }
func = ggml_cuda_dup;
break;
case GGML_OP_RESHAPE:
case GGML_OP_VIEW:
case GGML_OP_PERMUTE:
case GGML_OP_TRANSPOSE:
- if (!any_on_device) {
- return false;
- }
func = ggml_cuda_nop;
break;
case GGML_OP_DIAG_MASK_INF:
- if (!any_on_device) {
- return false;
- }
func = ggml_cuda_diag_mask_inf;
break;
case GGML_OP_SOFT_MAX:
- if (!any_on_device) {
- return false;
- }
func = ggml_cuda_soft_max;
break;
case GGML_OP_ROPE:
- if (!any_on_device) {
- return false;
- }
func = ggml_cuda_rope;
break;
case GGML_OP_ALIBI:
- if (!any_on_device) {
- return false;
- }
func = ggml_cuda_alibi;
break;
default:
CUDA_CHECK(cudaGetDeviceProperties(&prop, device));
snprintf(description, description_size, "%s", prop.name);
}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend interface
+
+#define UNUSED GGML_UNUSED
+
+struct ggml_backend_context_cuda {
+};
+
+static const char * ggml_backend_cuda_name(ggml_backend_t backend) {
+ return GGML_CUDA_NAME;
+
+ UNUSED(backend);
+}
+
+static void ggml_backend_cuda_free(ggml_backend_t backend) {
+ ggml_backend_context_cuda * cuda_ctx = (ggml_backend_context_cuda *)backend->context;
+ delete cuda_ctx;
+ delete backend;
+}
+
+struct ggml_backend_buffer_context_cuda {
+ void * device;
+
+ ggml_tensor_extra_gpu * temp_tensor_extras = nullptr;
+ size_t temp_tensor_extra_index = 0;
+
+ ~ggml_backend_buffer_context_cuda() {
+ delete[] temp_tensor_extras;
+ }
+
+ ggml_tensor_extra_gpu * ggml_cuda_alloc_temp_tensor_extra() {
+ if (temp_tensor_extras == nullptr) {
+ temp_tensor_extras = new ggml_tensor_extra_gpu[GGML_MAX_NODES];
+ }
+
+ size_t alloc_index = temp_tensor_extra_index;
+ temp_tensor_extra_index = (temp_tensor_extra_index + 1) % GGML_MAX_NODES;
+ ggml_tensor_extra_gpu * extra = &temp_tensor_extras[alloc_index];
+ memset(extra, 0, sizeof(*extra));
+
+ return extra;
+ }
+};
+
+static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+ CUDA_CHECK(cudaFree(ctx->device));
+ delete ctx;
+}
+
+static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+ return ctx->device;
+}
+
+static size_t ggml_backend_cuda_buffer_get_alloc_size(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+ int64_t row_low = 0;
+ int64_t row_high = ggml_nrows(tensor);
+ int64_t nrows_split = row_high - row_low;
+
+ size_t size = ggml_nbytes_split(tensor, nrows_split);
+
+ int64_t ne0 = tensor->ne[0];
+
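+ // quantized rows are padded to a multiple of MATRIX_ROW_PADDING so the CUDA kernels
+ // can read whole blocks without bounds checks; reserve space for that padding here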
+ if (ggml_is_quantized(tensor->type)) {
+ if (ne0 % MATRIX_ROW_PADDING != 0) {
+ size += (MATRIX_ROW_PADDING - ne0 % MATRIX_ROW_PADDING)
+ * ggml_type_size(tensor->type)/ggml_blck_size(tensor->type);
+ }
+ }
+
+ return size;
+
+ UNUSED(buffer);
+}
+
+static void ggml_backend_cuda_buffer_init_tensor(ggml_backend_buffer_t buffer, ggml_tensor * tensor) {
+ ggml_backend_buffer_context_cuda * ctx = (ggml_backend_buffer_context_cuda *)buffer->context;
+
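+ // a view that starts at offset 0 can reuse the device pointer of its source tensor directly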
+ if (tensor->view_src != NULL && tensor->view_offs == 0) {
+ assert(tensor->view_src->buffer->backend == buffer->backend);
+ tensor->backend = tensor->view_src->backend;
+ tensor->extra = tensor->view_src->extra;
+ return;
+ }
+
+ ggml_tensor_extra_gpu * extra = ctx->ggml_cuda_alloc_temp_tensor_extra();
+
+ extra->data_device[g_main_device] = tensor->data;
+
+ tensor->backend = GGML_BACKEND_GPU;
+ tensor->extra = extra;
+
+ if (ggml_is_quantized(tensor->type)) {
+ // initialize padding to 0 to avoid possible NaN values
+ int64_t row_low = 0;
+ int64_t row_high = ggml_nrows(tensor);
+ int64_t nrows_split = row_high - row_low;
+
+ size_t original_size = ggml_nbytes_split(tensor, nrows_split);
+ size_t padded_size = ggml_backend_cuda_buffer_get_alloc_size(tensor->buffer, tensor);
+
+ if (padded_size > original_size && tensor->view_src == nullptr) {
+ CUDA_CHECK(cudaMemsetAsync((char *)tensor->data + original_size, 0, padded_size - original_size, g_cudaStreams[g_main_device][0]));
+ }
+ }
+
+ UNUSED(buffer);
+}
+
+static struct ggml_backend_buffer_i cuda_backend_buffer_interface = {
+ /* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
+ /* .get_base = */ ggml_backend_cuda_buffer_get_base,
+ /* .get_alloc_size = */ ggml_backend_cuda_buffer_get_alloc_size,
+ /* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
+ /* .free_tensor = */ NULL,
+};
+
+static ggml_backend_buffer_t ggml_backend_cuda_alloc_buffer(ggml_backend_t backend, size_t size) {
+ ggml_backend_buffer_context_cuda * ctx = new ggml_backend_buffer_context_cuda;
+ CUDA_CHECK(cudaMalloc(&ctx->device, size));
+ return ggml_backend_buffer_init(backend, cuda_backend_buffer_interface, ctx, size);
+}
+
+static size_t ggml_backend_cuda_get_alignment(ggml_backend_t backend) {
+ return 128;
+ UNUSED(backend);
+}
+
+static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+ CUDA_CHECK(cudaMemcpyAsync((char *)tensor->data + offset, data, size, cudaMemcpyHostToDevice, g_cudaStreams[g_main_device][0]));
+
+ UNUSED(backend);
+}
+
+static void ggml_backend_cuda_get_tensor_async(ggml_backend_t backend, const ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+ GGML_ASSERT(tensor->backend == GGML_BACKEND_GPU);
+
+ CUDA_CHECK(cudaMemcpyAsync(data, (const char *)tensor->data + offset, size, cudaMemcpyDeviceToHost, g_cudaStreams[g_main_device][0]));
+
+ UNUSED(backend);
+}
+
+static void ggml_backend_cuda_synchronize(ggml_backend_t backend) {
+ CUDA_CHECK(cudaStreamSynchronize(g_cudaStreams[g_main_device][0]));
+
+ UNUSED(backend);
+}
+
+static ggml_backend_graph_plan_t ggml_backend_cuda_graph_plan_create(ggml_backend_t backend, ggml_cgraph * cgraph) {
+ GGML_ASSERT(!"not implemented");
+
+ return nullptr;
+
+ UNUSED(backend);
+ UNUSED(cgraph);
+}
+
+static void ggml_backend_cuda_graph_plan_free(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ GGML_ASSERT(!"not implemented");
+
+ UNUSED(backend);
+ UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_plan_compute(ggml_backend_t backend, ggml_backend_graph_plan_t plan) {
+ GGML_ASSERT(!"not implemented");
+
+ UNUSED(backend);
+ UNUSED(plan);
+}
+
+static void ggml_backend_cuda_graph_compute(ggml_backend_t backend, ggml_cgraph * cgraph) {
+ ggml_compute_params params = {};
+ params.type = GGML_TASK_COMPUTE;
+ params.ith = 0;
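+ // minimal compute params: the CUDA forward dispatch shares its signature with the CPU
+ // path, but here only the task type and the thread index matter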
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ ggml_tensor * node = cgraph->nodes[i];
+
+ assert(node->backend == GGML_BACKEND_GPU);
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ if (node->src[j] != nullptr) {
+ assert(node->src[j]->backend == GGML_BACKEND_GPU);
+ }
+ }
+
+ bool ok = ggml_cuda_compute_forward(&params, node);
+ if (!ok) {
+ fprintf(stderr, "%s: error: op not supported %s (%s)\n", __func__, node->name, ggml_op_name(node->op));
+ }
+ GGML_ASSERT(ok);
+
+#if 0
+ if (node->type == GGML_TYPE_F32) {
+ cudaDeviceSynchronize();
+ std::vector<float> tmp(ggml_nelements(node), 0.0f);
+ cudaMemcpy(tmp.data(), node->data, ggml_nelements(node)*sizeof(float), cudaMemcpyDeviceToHost);
+ printf("\n%s (%s) (%s %s) (%s %s): ", node->name, ggml_op_name(node->op),
+ ggml_type_name(node->src[0]->type),
+ node->src[1] ? ggml_type_name(node->src[1]->type) : "none",
+ node->src[0]->name,
+ node->src[1] ? node->src[1]->name : "none");
+ double sum = 0.0;
+ double sq_sum = 0.0;
+ for (int i = 0; i < ggml_nelements(node); i++) {
+ printf("%f ", tmp[i]);
+ sum += tmp[i];
+ sq_sum += tmp[i]*tmp[i];
+ }
+ printf("\n");
+ printf("sum: %f, ", sum);
+ printf("sq_sum: %f\n", sq_sum);
+ }
+#endif
+ }
+
+ UNUSED(backend);
+}
+
+static ggml_backend_i cuda_backend_i = {
+ /* .get_name = */ ggml_backend_cuda_name,
+ /* .free = */ ggml_backend_cuda_free,
+ /* .alloc_buffer = */ ggml_backend_cuda_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cuda_get_alignment,
+ /* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
+ /* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
+ /* .synchronize = */ ggml_backend_cuda_synchronize,
+ /* .cpy_tensor_from = */ nullptr,
+ /* .cpy_tensor_to = */ nullptr,
+ /* .graph_plan_create = */ ggml_backend_cuda_graph_plan_create,
+ /* .graph_plan_free = */ ggml_backend_cuda_graph_plan_free,
+ /* .graph_plan_compute = */ ggml_backend_cuda_graph_plan_compute,
+ /* .graph_compute = */ ggml_backend_cuda_graph_compute,
+ /* .supports_op = */ nullptr,
+};
+
+ggml_backend_t ggml_backend_cuda_init() {
+ ggml_init_cublas(); // TODO: remove from ggml.c
+
+ ggml_backend_context_cuda * ctx = new ggml_backend_context_cuda;
+
+ ggml_backend_t cuda_backend = new ggml_backend;
+ *cuda_backend = (ggml_backend){
+ /* .interface = */ cuda_backend_i,
+ /* .context = */ ctx
+ };
+
+ return cuda_backend;
+}
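+
+// example usage (a minimal sketch, assuming the generic helpers declared in ggml-backend.h,
+// such as ggml_backend_alloc_buffer, ggml_backend_tensor_set and ggml_backend_graph_compute):
+//
+//   ggml_backend_t backend = ggml_backend_cuda_init();
+//   ggml_backend_buffer_t buffer = ggml_backend_alloc_buffer(backend, buffer_size);
+//   ggml_backend_tensor_set(tensor, data, 0, ggml_nbytes(tensor)); // upload weights
+//   ggml_backend_graph_compute(backend, gf);                       // run the graph on the GPU
+//   ggml_backend_free(backend);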
#pragma once
#include "ggml.h"
+#include "ggml-backend.h"
#ifdef GGML_USE_HIPBLAS
#define GGML_CUDA_NAME "ROCm"
GGML_API int ggml_cuda_get_device_count(void);
GGML_API void ggml_cuda_get_device_description(int device, char * description, size_t description_size);
+// backend API
+GGML_API ggml_backend_t ggml_backend_cuda_init(void); // TODO: take a list of devices to use
+
#ifdef __cplusplus
}
#endif
#pragma once
#include "ggml.h"
+#include "ggml-backend.h"
#include <stddef.h>
#include <stdbool.h>
extern "C" {
#endif
-void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+//
+// internal API
+// temporarily exposed to user-code
+//
struct ggml_metal_context;
+void ggml_metal_log_set_callback(ggml_log_callback log_callback, void * user_data);
+
// number of command buffers to use
struct ggml_metal_context * ggml_metal_init(int n_cb);
void ggml_metal_free(struct ggml_metal_context * ctx);
// creates gf->n_threads command buffers in parallel
void ggml_metal_graph_compute(struct ggml_metal_context * ctx, struct ggml_cgraph * gf);
+//
+// backend API
+// user-code should use only these functions
+//
+
+GGML_API ggml_backend_t ggml_backend_metal_init(void);
+
+GGML_API bool ggml_backend_is_metal(ggml_backend_t backend);
+
+GGML_API void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb);
+
#ifdef __cplusplus
}
#endif
}
}
-
-
struct ggml_metal_context * ggml_metal_init(int n_cb) {
GGML_METAL_LOG_INFO("%s: allocating\n", __func__);
}
}
+
+////////////////////////////////////////////////////////////////////////////////
+
+// backend interface
+
+static const char * ggml_backend_metal_name(ggml_backend_t backend) {
+ return "Metal";
+
+ UNUSED(backend);
+}
+
+static void ggml_backend_metal_free(ggml_backend_t backend) {
+ struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+ ggml_metal_free(ctx);
+ free(backend);
+}
+
+static void * ggml_backend_metal_buffer_get_base(ggml_backend_buffer_t buffer) {
+ return (void *)buffer->context;
+}
+
+static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ free(buffer->context);
+}
+
+static struct ggml_backend_buffer_i metal_backend_buffer_i = {
+ /* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
+ /* .get_base = */ ggml_backend_metal_buffer_get_base,
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+ /* .init_tensor = */ NULL, // no initialization required
+ /* .free_tensor = */ NULL, // no cleanup required
+};
+
+static ggml_backend_buffer_t ggml_backend_metal_alloc_buffer(ggml_backend_t backend, size_t size) {
+ struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+
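+ // host memory from ggml_metal_host_malloc is page-aligned, which allows ggml_metal_add_buffer
+ // to map it as a no-copy Metal buffer shared between CPU and GPU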
+ void * data = ggml_metal_host_malloc(size);
+
+ // TODO: set proper name of the buffers
+ ggml_metal_add_buffer(ctx, "backend", data, size, 0);
+
+ return ggml_backend_buffer_init(backend, metal_backend_buffer_i, data, size);
+}
+
+static size_t ggml_backend_metal_get_alignment(ggml_backend_t backend) {
+ return 32;
+ UNUSED(backend);
+}
+
+static void ggml_backend_metal_set_tensor_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+ memcpy((char *)tensor->data + offset, data, size);
+
+ UNUSED(backend);
+}
+
+static void ggml_backend_metal_get_tensor_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+
+ memcpy(data, (const char *)tensor->data + offset, size);
+
+ UNUSED(backend);
+}
+
+static void ggml_backend_metal_synchronize(ggml_backend_t backend) {
+ UNUSED(backend);
+}
+
+static void ggml_backend_metal_cpy_tensor_from(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+ ggml_backend_tensor_get(src, dst->data, 0, ggml_nbytes(src));
+
+ UNUSED(backend);
+}
+
+static void ggml_backend_metal_cpy_tensor_to(ggml_backend_t backend, struct ggml_tensor * src, struct ggml_tensor * dst) {
+ ggml_backend_tensor_set_async(dst, src->data, 0, ggml_nbytes(src));
+
+ UNUSED(backend);
+}
+
+static void ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
+ struct ggml_metal_context * metal_ctx = (struct ggml_metal_context *)backend->context;
+
+ ggml_metal_graph_compute(metal_ctx, cgraph);
+}
+
+static bool ggml_backend_metal_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
+ return true;
+ UNUSED(backend);
+ UNUSED(op);
+}
+
+static struct ggml_backend_i metal_backend_i = {
+ /* .get_name = */ ggml_backend_metal_name,
+ /* .free = */ ggml_backend_metal_free,
+ /* .alloc_buffer = */ ggml_backend_metal_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_metal_get_alignment,
+ /* .set_tensor_async = */ ggml_backend_metal_set_tensor_async,
+ /* .get_tensor_async = */ ggml_backend_metal_get_tensor_async,
+ /* .synchronize = */ ggml_backend_metal_synchronize,
+ /* .cpy_tensor_from = */ ggml_backend_metal_cpy_tensor_from,
+ /* .cpy_tensor_to = */ ggml_backend_metal_cpy_tensor_to,
+ /* .graph_plan_create = */ NULL, // the metal implementation does not require creating graph plans atm
+ /* .graph_plan_free = */ NULL,
+ /* .graph_plan_compute = */ NULL,
+ /* .graph_compute = */ ggml_backend_metal_graph_compute,
+ /* .supports_op = */ ggml_backend_metal_supports_op,
+};
+
+ggml_backend_t ggml_backend_metal_init(void) {
+ struct ggml_metal_context * ctx = ggml_metal_init(GGML_DEFAULT_N_THREADS);
+
+ ggml_backend_t metal_backend = malloc(sizeof(struct ggml_backend));
+
+ *metal_backend = (struct ggml_backend) {
+ /* .interface = */ metal_backend_i,
+ /* .context = */ ctx,
+ };
+
+ return metal_backend;
+}
+
+bool ggml_backend_is_metal(ggml_backend_t backend) {
+ return backend->interface.get_name == ggml_backend_metal_name;
+}
+
+void ggml_backend_metal_set_n_cb(ggml_backend_t backend, int n_cb) {
+ struct ggml_metal_context * ctx = (struct ggml_metal_context *)backend->context;
+
+ ggml_metal_set_n_cb(ctx, n_cb);
+}
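+
+// example usage (a minimal sketch, assuming the generic helpers declared in ggml-backend.h):
+//
+//   ggml_backend_t backend = ggml_backend_metal_init();
+//   if (ggml_backend_is_metal(backend)) {
+//       ggml_backend_metal_set_n_cb(backend, 4); // 4 command buffers is an arbitrary example value
+//   }
+//   ggml_backend_graph_compute(backend, gf);
+//   ggml_backend_free(backend);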
*result = (struct ggml_tensor) {
/*.type =*/ type,
/*.backend =*/ GGML_BACKEND_CPU,
+ /*.buffer =*/ NULL,
/*.n_dims =*/ n_dims,
/*.ne =*/ { 1, 1, 1, 1 },
/*.nb =*/ { 0, 0, 0, 0 },