//
enum ggml_backend_dev_type {
+ // CPU device using system memory
GGML_BACKEND_DEVICE_TYPE_CPU,
+ // GPU device using dedicated memory
GGML_BACKEND_DEVICE_TYPE_GPU,
- // devices with full capabilities (excludes backends such as BLAS that only support matrix multiplication)
- GGML_BACKEND_DEVICE_TYPE_CPU_FULL,
- GGML_BACKEND_DEVICE_TYPE_GPU_FULL
+ // accelerator devices intended to be used together with the CPU backend (e.g. BLAS or AMX)
+ GGML_BACKEND_DEVICE_TYPE_ACCEL
};
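For illustration only (not part of the patch), a minimal sketch of how the device types above are typically consumed; it assumes the registry enumeration functions that accompany this API (ggml_backend_dev_count, ggml_backend_dev_get, ggml_backend_dev_name, ggml_backend_dev_type):

    // list all registered devices and report their type
    for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
        ggml_backend_dev_t dev = ggml_backend_dev_get(i);
        enum ggml_backend_dev_type type = ggml_backend_dev_type(dev);
        const char * type_str = type == GGML_BACKEND_DEVICE_TYPE_GPU ? "GPU" :
                                type == GGML_BACKEND_DEVICE_TYPE_CPU ? "CPU" : "ACCEL";
        printf("device %zu: %s (%s)\n", i, ggml_backend_dev_name(dev), type_str); // needs <stdio.h>
    }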
// functionality supported by the device
GGML_API ggml_backend_dev_t ggml_backend_reg_dev_get(ggml_backend_reg_t reg, size_t index);
GGML_API void * ggml_backend_reg_get_proc_address(ggml_backend_reg_t reg, const char * name);
- // Functions that may be obtained using ggml_backend_reg_get_proc_address
- typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(const float *);
- typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t, int);
+ // Common functions that may be obtained using ggml_backend_reg_get_proc_address
+ // Split buffer type for tensor parallelism
+ typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
+ // Set the number of threads for the backend
+ typedef void (*ggml_backend_set_n_threads_t)(ggml_backend_t backend, int n_threads);
+ // Get additional buffer types provided by the device (returns a NULL-terminated array)
+ typedef ggml_backend_buffer_type_t * (*ggml_backend_dev_get_extra_bufts_t)(ggml_backend_dev_t device);
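An illustrative sketch (not from the patch) of how these optional hooks are meant to be used: the caller resolves the function by name at runtime, so nothing has to link against a specific backend. The backend handle is assumed to have been created elsewhere; ggml_backend_get_device is part of this API.

    ggml_backend_reg_t reg = ggml_backend_dev_backend_reg(ggml_backend_get_device(backend));
    ggml_backend_set_n_threads_t set_n_threads =
        (ggml_backend_set_n_threads_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_set_n_threads");
    if (set_n_threads != NULL) {
        set_n_threads(backend, 8); // backends that do not expose the hook are simply skipped
    }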
//
// Backend registry
GGML_API ggml_backend_t ggml_backend_init_by_name(const char * name, const char * params);
// = ggml_backend_dev_init(ggml_backend_dev_by_type(type), params)
GGML_API ggml_backend_t ggml_backend_init_by_type(enum ggml_backend_dev_type type, const char * params);
- // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU_FULL) OR ggml_backend_dev_by_type(CPU_FULL), NULL)
+ // = ggml_backend_dev_init(ggml_backend_dev_by_type(GPU) OR ggml_backend_dev_by_type(CPU), NULL)
GGML_API ggml_backend_t ggml_backend_init_best(void);
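Spelled out, the comment above amounts to the following sketch (GPU first, CPU as the fallback):

    ggml_backend_t backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_GPU, NULL);
    if (backend == NULL) {
        backend = ggml_backend_init_by_type(GGML_BACKEND_DEVICE_TYPE_CPU, NULL);
    }
    // roughly what ggml_backend_init_best() does internally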
//
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_buffer_type(int device);
// split tensor buffer that splits matrices by rows across multiple devices
-GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split);
+GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split);
// pinned host buffer for use with the CPU backend for faster copies between CPU and GPU
GGML_API ggml_backend_buffer_type_t ggml_backend_cuda_host_buffer_type(void);
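A hedged usage sketch for the new split-buffer signature (the fractions below are made-up example values; tensor_split is assumed, as before, to hold the per-device proportions of the matrix rows):

    float tensor_split[GGML_CUDA_MAX_DEVICES] = { 0.6f, 0.4f }; // 60% of rows on device 0, 40% on device 1
    ggml_backend_buffer_type_t split_buft =
        ggml_backend_cuda_split_buffer_type(/*main_device=*/0, tensor_split);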
#if defined(__AMX_INT8__)
// AMX buffer interface
-static const char * ggml_backend_amx_buffer_get_name(ggml_backend_buffer_t buffer) {
- return "AMX";
-
- GGML_UNUSED(buffer);
-}
-
static void ggml_backend_amx_buffer_free_buffer(ggml_backend_buffer_t buffer) {
free(buffer->context);
}
}
static ggml_backend_buffer_i ggml_backend_amx_buffer_interface = {
- /* .get_name = */ ggml_backend_amx_buffer_get_name,
/* .free_buffer = */ ggml_backend_amx_buffer_free_buffer,
/* .get_base = */ ggml_backend_amx_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
ggml_backend_buffer_type_t ggml_backend_amx_buffer_type() {
static struct ggml_backend_buffer_type ggml_backend_buffer_type_amx = {
/* .iface = */ {
- /* .get_name = */ ggml_backend_amx_buffer_type_get_name,
- /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
- /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
- /* .get_max_size = */ NULL, // defaults to SIZE_MAX
- /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
- /* .is_host = */ ggml_backend_amx_buffer_type_is_host,
+ /* .get_name = */ ggml_backend_amx_buffer_type_get_name,
+ /* .alloc_buffer = */ ggml_backend_amx_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_amx_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+ /* .get_alloc_size = */ ggml_backend_amx_buffer_type_get_alloc_size,
+ /* .is_host = */ ggml_backend_amx_buffer_type_is_host,
},
- /* .device = */ NULL,
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_amx_reg(), 0),
/* .context = */ NULL,
};
delete backend;
}
-static ggml_backend_buffer_type_t ggml_backend_amx_get_default_buffer_type(ggml_backend_t backend) {
- return ggml_backend_amx_buffer_type();
-
- GGML_UNUSED(backend);
-}
-
static enum ggml_status ggml_backend_amx_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
ggml_backend_amx_context * ctx = (ggml_backend_amx_context *)backend->context;
static struct ggml_backend_i ggml_backend_amx_i = {
/* .get_name = */ ggml_backend_amx_name,
/* .free = */ ggml_backend_amx_free,
- /* .get_default_buffer_type = */ ggml_backend_amx_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_amx_graph_compute,
- /* .supports_op = */ NULL,
- /* .supports_buft = */ NULL,
- /* .offload_op = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
}
static enum ggml_backend_dev_type ggml_backend_amx_device_get_type(ggml_backend_dev_t dev) {
- return GGML_BACKEND_DEVICE_TYPE_CPU;
+ return GGML_BACKEND_DEVICE_TYPE_ACCEL;
GGML_UNUSED(dev);
}
size_t (*get_max_size) (ggml_backend_buffer_type_t buft);
// (optional) data size needed to allocate the tensor, including padding (defaults to ggml_nbytes)
size_t (*get_alloc_size)(ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor);
- // (optional) check if tensor data is in host memory (defaults to false)
+ // (optional) check if tensor data is in host memory and uses standard ggml tensor layout (defaults to false)
bool (*is_host) (ggml_backend_buffer_type_t buft);
};
//
struct ggml_backend_buffer_i {
- const char * (*get_name) (ggml_backend_buffer_t buffer);
// (optional) free the buffer
void (*free_buffer) (ggml_backend_buffer_t buffer);
// base address of the buffer
void (*free)(ggml_backend_t backend);
- // Will be moved to the device interface
- // buffer allocation
- ggml_backend_buffer_type_t (*get_default_buffer_type)(ggml_backend_t backend);
-
// (optional) asynchronous tensor data access
void (*set_tensor_async)(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
void (*get_tensor_async)(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
bool (*cpy_tensor_async)(ggml_backend_t backend_src, ggml_backend_t backend_dst, const struct ggml_tensor * src, struct ggml_tensor * dst);
- // (optional) complete all pending operations
+ // (optional) complete all pending operations (required if the backend supports async operations)
void (*synchronize)(ggml_backend_t backend);
- // (optional) compute graph with a plan (not used currently)
+ // (optional) graph plans (not used currently)
+ // compute graph with a plan
ggml_backend_graph_plan_t (*graph_plan_create) (ggml_backend_t backend, const struct ggml_cgraph * cgraph);
void (*graph_plan_free) (ggml_backend_t backend, ggml_backend_graph_plan_t plan);
// update the plan with a new graph - this should be faster than creating a new plan when the graph has the same topology
// compute graph (always async if supported by the backend)
enum ggml_status (*graph_compute) (ggml_backend_t backend, struct ggml_cgraph * cgraph);
- // IMPORTANT: these functions have been moved to the device interface and will be removed from the backend interface
- // new backends should implement the device interface instead
- // These functions are being moved to the device interface
- bool (*supports_op) (ggml_backend_t backend, const struct ggml_tensor * op);
- bool (*supports_buft)(ggml_backend_t backend, ggml_backend_buffer_type_t buft);
- bool (*offload_op) (ggml_backend_t backend, const struct ggml_tensor * op);
-
// (optional) event synchronization
// record an event on this stream
void (*event_record)(ggml_backend_t backend, ggml_backend_event_t event);
}
ggml_backend_buffer_t ggml_backend_buft_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
+ if (size == 0) {
+ // return a dummy buffer for zero-sized allocations
+ return ggml_backend_buffer_init(buft, {}, NULL, 0);
+ }
+
return buft->iface.alloc_buffer(buft, size);
}
}
const char * ggml_backend_buffer_name(ggml_backend_buffer_t buffer) {
- return buffer->iface.get_name(buffer);
+ return ggml_backend_buft_name(ggml_backend_buffer_get_type(buffer));
}
void ggml_backend_buffer_free(ggml_backend_buffer_t buffer) {
}
void * ggml_backend_buffer_get_base(ggml_backend_buffer_t buffer) {
+ // get_base is optional if the buffer is zero-sized
+ if (buffer->size == 0) {
+ return NULL;
+ }
+
void * base = buffer->iface.get_base(buffer);
GGML_ASSERT(base != NULL && "backend buffer base cannot be NULL");
return base;
}
}
+void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ // clear is optional if the buffer is zero-sized
+ if (buffer->size == 0) {
+ return;
+ }
+
+ buffer->iface.clear(buffer, value);
+}
+
size_t ggml_backend_buffer_get_alignment(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
}
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
}
-void ggml_backend_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
- buffer->iface.clear(buffer, value);
-}
-
bool ggml_backend_buffer_is_host(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_is_host(ggml_backend_buffer_get_type(buffer));
}
}
ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend) {
- return backend->iface.get_default_buffer_type(backend);
+ return ggml_backend_dev_buffer_type(backend->device);
}
ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size) {
void ggml_backend_tensor_set(struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+ if (size == 0) {
+ return;
+ }
+
GGML_ASSERT(buf != NULL && "tensor buffer not set");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
- if (!size) {
- return;
- }
-
buf->iface.set_tensor(buf, tensor, data, offset, size);
}
void ggml_backend_tensor_get(const struct ggml_tensor * tensor, void * data, size_t offset, size_t size) {
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
+ if (size == 0) {
+ return;
+ }
+
GGML_ASSERT(buf != NULL && "tensor buffer not set");
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor read out of bounds");
- if (!size) {
- return;
- }
-
buf->iface.get_tensor(buf, tensor, data, offset, size);
}
GGML_API void ggml_backend_tensor_memset(struct ggml_tensor * tensor, uint8_t value, size_t offset, size_t size) {
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
- GGML_ASSERT(buf != NULL && "tensor buffer not set");
- GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
- GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
-
- if (!size) {
+ if (size == 0) {
return;
}
- GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not supported by backend buffer");
+ GGML_ASSERT(buf != NULL && "tensor buffer not set");
+ GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
+ GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
+ GGML_ASSERT(buf->iface.memset_tensor != NULL && "memset not implemented by backend buffer");
buf->iface.memset_tensor(buf, tensor, value, offset, size);
}
}
bool ggml_backend_supports_op(ggml_backend_t backend, const struct ggml_tensor * op) {
- // helper to ease transition to device interface
- if (backend->device) {
- return ggml_backend_dev_supports_op(backend->device, op);
- }
-
- return backend->iface.supports_op(backend, op);
+ return ggml_backend_dev_supports_op(backend->device, op);
}
bool ggml_backend_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
- // helper to ease transition to device interface
- if (backend->device) {
- return ggml_backend_dev_supports_buft(backend->device, buft);
- }
- return backend->iface.supports_buft(backend, buft);
+ return ggml_backend_dev_supports_buft(backend->device, buft);
}
bool ggml_backend_offload_op(ggml_backend_t backend, const struct ggml_tensor * op) {
- // helper to ease transition to device interface
- if (backend->device) {
- return ggml_backend_dev_offload_op(backend->device, op);
- }
-
- if (backend->iface.offload_op != NULL) {
- return backend->iface.offload_op(backend, op);
- }
- return false;
+ return ggml_backend_dev_offload_op(backend->device, op);
}
ggml_backend_dev_t ggml_backend_get_device(ggml_backend_t backend) {
#ifdef GGML_USE_VULKAN
register_backend(ggml_backend_vk_reg());
#endif
+#ifdef GGML_USE_CANN
+ register_backend(ggml_backend_cann_reg());
+#endif
#ifdef GGML_USE_BLAS
register_backend(ggml_backend_blas_reg());
#endif
#ifdef GGML_USE_AMX
register_backend(ggml_backend_amx_reg());
#endif
-#ifdef GGML_USE_CANN
- register_backend(ggml_backend_cann_reg());
-#endif
// TODO: kompute
}
ggml_backend_t ggml_backend_init_best(void) {
- ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL);
+ ggml_backend_dev_t dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU);
if (!dev) {
- dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU_FULL);
+ dev = ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_CPU);
}
if (!dev) {
return NULL;
}
return ggml_backend_dev_init(dev, NULL);
}
-// backend CPU
-
-static const char * ggml_backend_cpu_buffer_get_name(ggml_backend_buffer_t buffer) {
- return "CPU";
-
- GGML_UNUSED(buffer);
-}
+// CPU backend - buffer
static void * ggml_backend_cpu_buffer_get_base(ggml_backend_buffer_t buffer) {
uintptr_t data = (uintptr_t)buffer->context;
}
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_i = {
- /* .get_name = */ ggml_backend_cpu_buffer_get_name,
/* .free_buffer = */ ggml_backend_cpu_buffer_free_buffer,
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
};
static const struct ggml_backend_buffer_i ggml_backend_cpu_buffer_from_ptr_i = {
- /* .get_name = */ ggml_backend_cpu_buffer_get_name,
/* .free_buffer = */ NULL, // ptr is not owned by the buffer, so it does not need to be freed
/* .get_base = */ ggml_backend_cpu_buffer_get_base,
/* .init_tensor = */ NULL, // no initialization required
/* .reset = */ NULL,
};
+// CPU backend - buffer type
+
static const char * ggml_backend_cpu_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
return "CPU";
}
static ggml_backend_buffer_t ggml_backend_cpu_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
- auto alloc_size = size;
- if (alloc_size == 0) {
- alloc_size = 1;
- }
-
- void * data = ggml_aligned_malloc(alloc_size);
+ void * data = ggml_aligned_malloc(size);
if (data == NULL) {
- GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, alloc_size);
+ GGML_LOG_ERROR("%s: failed to allocate buffer of size %zu\n", __func__, size);
return NULL;
}
- return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, alloc_size);
+ return ggml_backend_buffer_init(buft, ggml_backend_cpu_buffer_i, data, size);
}
static size_t ggml_backend_cpu_buffer_type_get_alignment(ggml_backend_buffer_type_t buft) {
return &ggml_backend_cpu_buffer_type;
}
+static const char * ggml_backend_cpu_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
+ return "CPU_Mapped";
+
+ GGML_UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_cpu_buffer_from_ptr_type(void) {
+ static struct ggml_backend_buffer_type ggml_backend_cpu_buffer_type = {
+ /* .iface = */ {
+ /* .get_name = */ ggml_backend_cpu_buffer_from_ptr_type_get_name,
+ /* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+ /* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
+ },
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cpu_reg(), 0),
+ /* .context = */ NULL,
+ };
+
+ return &ggml_backend_cpu_buffer_type;
+}
+
#ifdef GGML_USE_CPU_HBM
// buffer type HBM
GGML_UNUSED(buft);
}
-static const char * ggml_backend_cpu_hbm_buffer_get_name(ggml_backend_buffer_t buf) {
- return "CPU_HBM";
-
- GGML_UNUSED(buf);
-}
-
static void ggml_backend_cpu_hbm_buffer_free_buffer(ggml_backend_buffer_t buffer) {
hbw_free(buffer->context);
}
static ggml_backend_buffer_t ggml_backend_cpu_hbm_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
- //void * ptr = hbw_malloc(size);
void * ptr;
int result = hbw_posix_memalign(&ptr, ggml_backend_cpu_buffer_type_get_alignment(buft), size);
if (result != 0) {
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft;
- buffer->iface.get_name = ggml_backend_cpu_hbm_buffer_get_name;
buffer->iface.free_buffer = ggml_backend_cpu_hbm_buffer_free_buffer;
return buffer;
}
#endif
+static ggml_backend_buffer_type_t * ggml_backend_cpu_get_extra_bufts(ggml_backend_dev_t device) {
+ static ggml_backend_buffer_type_t bufts[] = {
+#ifdef GGML_USE_CPU_HBM
+ ggml_backend_cpu_hbm_buffer_type(),
+#endif
+ NULL
+ };
+
+ return bufts;
+
+ GGML_UNUSED(device);
+}
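A possible caller-side sketch (assumed usage, not from the patch): the hook is resolved by name through the registry, and the returned array is walked up to the NULL terminator documented above.

    ggml_backend_reg_t reg = ggml_backend_cpu_reg();
    ggml_backend_dev_get_extra_bufts_t get_extra_bufts =
        (ggml_backend_dev_get_extra_bufts_t) ggml_backend_reg_get_proc_address(reg, "ggml_backend_dev_get_extra_bufts");
    if (get_extra_bufts != NULL) {
        for (ggml_backend_buffer_type_t * ext = get_extra_bufts(ggml_backend_reg_dev_get(reg, 0)); *ext != NULL; ext++) {
            printf("extra buffer type: %s\n", ggml_backend_buft_name(*ext)); // needs <stdio.h>
        }
    }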
+
+// CPU backend - backend (stream)
+
struct ggml_backend_cpu_context {
int n_threads;
ggml_threadpool_t threadpool;
delete backend;
}
-static ggml_backend_buffer_type_t ggml_backend_cpu_get_default_buffer_type(ggml_backend_t backend) {
- return ggml_backend_cpu_buffer_type();
-
- GGML_UNUSED(backend);
-}
-
struct ggml_backend_plan_cpu {
struct ggml_cplan cplan;
struct ggml_cgraph cgraph;
static const struct ggml_backend_i ggml_backend_cpu_i = {
/* .get_name = */ ggml_backend_cpu_get_name,
/* .free = */ ggml_backend_cpu_free,
- /* .get_default_buffer_type = */ ggml_backend_cpu_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ ggml_backend_cpu_graph_plan_compute,
/* .graph_compute = */ ggml_backend_cpu_graph_compute,
- /* .supports_op = */ NULL,
- /* .supports_buft = */ NULL,
- /* .offload_op = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
ggml_backend_buffer_t ggml_backend_cpu_buffer_from_ptr(void * ptr, size_t size) {
GGML_ASSERT((uintptr_t)ptr % TENSOR_ALIGNMENT == 0 && "buffer pointer must be aligned");
- return ggml_backend_buffer_init(ggml_backend_cpu_buffer_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
+ return ggml_backend_buffer_init(ggml_backend_cpu_buffer_from_ptr_type(), ggml_backend_cpu_buffer_from_ptr_i, ptr, size);
}
-////////////////////////
+// CPU backend - device
struct ggml_backend_cpu_device_context {
std::string description = "CPU";
}
static enum ggml_backend_dev_type ggml_backend_cpu_device_get_type(ggml_backend_dev_t dev) {
- return GGML_BACKEND_DEVICE_TYPE_CPU_FULL;
+ return GGML_BACKEND_DEVICE_TYPE_CPU;
GGML_UNUSED(dev);
}
};
}
-static ggml_backend_t ggml_backend_cpu_device_init(ggml_backend_dev_t dev, const char * params) {
+static ggml_backend_t ggml_backend_cpu_device_init_backend(ggml_backend_dev_t dev, const char * params) {
return ggml_backend_cpu_init();
GGML_UNUSED(dev);
GGML_UNUSED(dev);
}
-static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+static ggml_backend_buffer_t ggml_backend_cpu_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
GGML_UNUSED(dev);
/* .get_memory = */ ggml_backend_cpu_device_get_memory,
/* .get_type = */ ggml_backend_cpu_device_get_type,
/* .get_props = */ ggml_backend_cpu_device_get_props,
- /* .init_backend = */ ggml_backend_cpu_device_init,
+ /* .init_backend = */ ggml_backend_cpu_device_init_backend,
/* .get_buffer_type = */ ggml_backend_cpu_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL,
- /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_ptr,
+ /* .buffer_from_host_ptr = */ ggml_backend_cpu_device_buffer_from_host_ptr,
/* .supports_op = */ ggml_backend_cpu_device_supports_op,
/* .supports_buft = */ ggml_backend_cpu_device_supports_buft,
/* .offload_op = */ NULL,
/* .event_synchronize = */ NULL,
};
-////////////////////////
+// CPU backend - backend (reg)
static const char * ggml_backend_cpu_reg_get_name(ggml_backend_reg_t reg) {
return "CPU";
if (strcmp(name, "ggml_backend_set_n_threads") == 0) {
return (void *)ggml_backend_cpu_set_n_threads;
}
+ if (strcmp(name, "ggml_backend_dev_get_extra_bufts") == 0) {
+ return (void *)ggml_backend_cpu_get_extra_bufts;
+ }
+
return NULL;
GGML_UNUSED(reg);
size_t n_buffers;
};
-static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
- ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
-
- return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
-}
-
static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_multi_buffer_context * ctx = (ggml_backend_multi_buffer_context *) buffer->context;
for (size_t i = 0; i < ctx->n_buffers; i++) {
}
static const struct ggml_backend_buffer_i ggml_backend_multi_buffer_i = {
- /* .get_name = */ ggml_backend_multi_buffer_get_name,
/* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
/* .get_base = */ NULL,
/* .init_tensor = */ NULL,
}
bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
- return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
+ return buffer->iface.free_buffer == ggml_backend_multi_buffer_free_buffer;
}
void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
char * context_buffer;
size_t context_buffer_size;
- bool debug;
+ int debug;
};
#define hash_id(tensor) ggml_hash_find_or_insert(&sched->hash_set, tensor)
return -1;
}
-#if 0
+#if 1
#define GGML_SCHED_MAX_SPLITS_DEBUG 4096
static char causes[GGML_DEFAULT_GRAPH_SIZE*16 + GGML_SCHED_MAX_SPLITS_DEBUG*GGML_SCHED_MAX_SPLIT_INPUTS][128]; // debug only
#define SET_CAUSE(node, ...) sprintf(causes[hash_id(node)], __VA_ARGS__)
if (src == NULL) {
continue;
}
- if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
+ // skip ROPE since the rope freqs tensor is too small to choose a backend based on it
+ // not an ideal solution
+ if (tensor->op != GGML_OP_ROPE && src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = ggml_backend_sched_backend_from_buffer(sched, src, tensor);
// check if a backend with higher prio wants to offload the op
if (src_backend_id == sched->n_backends - 1) {
if (ggml_is_view_op(node->op)) {
continue;
}
- ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
- GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
- fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
- for (int j = 0; j < GGML_MAX_SRC; j++) {
- struct ggml_tensor * src = node->src[j];
- if (src == NULL) {
- continue;
+ if (sched->debug > 1) {
+ ggml_backend_t tensor_backend = ggml_backend_sched_get_tensor_backend(sched, node);
+ GGML_LOG_DEBUG("node #%3d (%10.10s): %20.20s (%5.5s) [%5.5s %8.8s]:", i, ggml_op_name(node->op), node->name,
+ fmt_size(ggml_nbytes(node)), tensor_backend ? ggml_backend_name(tensor_backend) : "NULL", GET_CAUSE(node));
+ for (int j = 0; j < GGML_MAX_SRC; j++) {
+ struct ggml_tensor * src = node->src[j];
+ if (src == NULL) {
+ continue;
+ }
+ ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
+ GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
+ fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
}
- ggml_backend_t src_backend = ggml_backend_sched_get_tensor_backend(sched, src);
- GGML_LOG_DEBUG(" %20.20s (%5.5s) [%5.5s %8.8s]", src->name,
- fmt_size(ggml_nbytes(src)), src_backend ? ggml_backend_name(src_backend) : "NULL", GET_CAUSE(src));
+ GGML_LOG_DEBUG("\n");
}
- GGML_LOG_DEBUG("\n");
}
}
if (src == NULL) {
continue;
}
- // check if a weight is on a different backend
+ // check if a weight is on a different and incompatible backend
// by starting a new split, the memory of the previously offloaded weights can be reused
if (src->buffer != NULL && src->buffer->usage == GGML_BACKEND_BUFFER_USAGE_WEIGHTS) {
int src_backend_id = tensor_backend_id(src);
- if (src_backend_id != cur_backend_id) {
+ if (src_backend_id != cur_backend_id && !ggml_backend_sched_buffer_supported(sched, src, cur_backend_id)) {
need_new_split = true;
break;
}
int src_backend_id = sched->hv_tensor_backend_ids[id];
bool supported = ggml_backend_sched_buffer_supported(sched, src, cur_backend_id);
if (src_backend_id != cur_backend_id && tensor_id_copy(id, cur_backend_id, 0) == NULL && !supported) {
- //printf("starting new split because of too many inputs: node %s, input %s\n", node->name, src->name);
need_new_split = true;
break;
}
struct ggml_backend_sched * sched = (ggml_backend_sched *) calloc(1, sizeof(struct ggml_backend_sched));
- sched->debug = getenv("GGML_SCHED_DEBUG") != NULL;
+ const char * GGML_SCHED_DEBUG = getenv("GGML_SCHED_DEBUG");
+ sched->debug = GGML_SCHED_DEBUG ? atoi(GGML_SCHED_DEBUG) : 0;
sched->n_backends = n_backends;
sched->n_copies = parallel ? GGML_SCHED_MAX_COPIES : 1;
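Since the debug flag is now an integer level, it can be raised through the environment. A hypothetical invocation (backends, bufts, n_backends and graph_size are assumed to be set up elsewhere); values above 1 also enable the per-node dump gated by sched->debug > 1 earlier in this patch:

    setenv("GGML_SCHED_DEBUG", "2", 1); // POSIX; use _putenv_s on Windows
    ggml_backend_sched_t sched = ggml_backend_sched_new(backends, bufts, n_backends, graph_size, false);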
delete backend;
}
-static ggml_backend_buffer_type_t ggml_backend_blas_get_default_buffer_type(ggml_backend_t backend) {
- return ggml_backend_cpu_buffer_type();
-
- GGML_UNUSED(backend);
-}
-
static enum ggml_status ggml_backend_blas_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
ggml_backend_blas_context * ctx = (ggml_backend_blas_context *)backend->context;
static struct ggml_backend_i blas_backend_i = {
/* .get_name = */ ggml_backend_blas_get_name,
/* .free = */ ggml_backend_blas_free,
- /* .get_default_buffer_type = */ ggml_backend_blas_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_blas_graph_compute,
- /* .supports_op = */ NULL,
- /* .supports_buft = */ NULL,
- /* .offload_op = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
}
static enum ggml_backend_dev_type ggml_backend_blas_device_get_type(ggml_backend_dev_t dev) {
- return GGML_BACKEND_DEVICE_TYPE_CPU;
+ return GGML_BACKEND_DEVICE_TYPE_ACCEL;
GGML_UNUSED(dev);
}
};
}
-static ggml_backend_t ggml_backend_blas_device_init(ggml_backend_dev_t dev, const char * params) {
+static ggml_backend_t ggml_backend_blas_device_init_backend(ggml_backend_dev_t dev, const char * params) {
return ggml_backend_blas_init();
GGML_UNUSED(dev);
GGML_UNUSED(dev);
}
-static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
+static ggml_backend_buffer_t ggml_backend_blas_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
return ggml_backend_cpu_buffer_from_ptr(ptr, size);
GGML_UNUSED(dev);
/* .get_memory = */ ggml_backend_blas_device_get_memory,
/* .get_type = */ ggml_backend_blas_device_get_type,
/* .get_props = */ ggml_backend_blas_device_get_props,
- /* .init_backend = */ ggml_backend_blas_device_init,
+ /* .init_backend = */ ggml_backend_blas_device_init_backend,
/* .get_buffer_type = */ ggml_backend_blas_device_get_buffer_type,
/* .get_host_buffer_type = */ NULL,
- /* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_ptr,
+ /* .buffer_from_host_ptr = */ ggml_backend_blas_device_buffer_from_host_ptr,
/* .supports_op = */ ggml_backend_blas_device_supports_op,
/* .supports_buft = */ ggml_backend_blas_device_supports_buft,
/* .offload_op = */ NULL,
~ggml_backend_cann_buffer_context() { ACL_CHECK(aclrtFree(dev_ptr)); }
};
-/**
- * @brief Retrieve the name associated with a CANN buffer.
- *
- * This function returns the name of a CANN buffer, which is stored in the
- * context of the buffer.
- *
- * @param buffer The CANN buffer whose name is to be retrieved.
- * @return A pointer to a C-string containing the name of the buffer.
- */
-
-static const char* ggml_backend_cann_buffer_get_name(
- ggml_backend_buffer_t buffer) {
- return "CANN";
-
- GGML_UNUSED(buffer);
-}
-
/**
* @brief Check if a buffer is a CANN buffer.
*
* @param buffer The buffer to check.
* @return true if the buffer is a CANN buffer, false otherwise.
*/
+static bool ggml_backend_buft_is_cann(ggml_backend_buffer_type_t buft);
static bool ggml_backend_buffer_is_cann(
ggml_backend_buffer_t buffer) {
- return buffer->iface.get_name == ggml_backend_cann_buffer_get_name;
+ return ggml_backend_buft_is_cann(buffer->buft);
}
/**
* on a CANN buffer within the backend.
*/
static const ggml_backend_buffer_i ggml_backend_cann_buffer_interface = {
- /* .get_name = */ ggml_backend_cann_buffer_get_name,
/* .free_buffer = */ ggml_backend_cann_buffer_free_buffer,
/* .get_base = */ ggml_backend_cann_buffer_get_base,
/* .init_tensor = */ ggml_backend_cann_buffer_init_tensor,
*/
static const char* ggml_backend_cann_buffer_type_name(
ggml_backend_buffer_type_t buft) {
- return "CANN";
+ ggml_backend_cann_buffer_type_context* buft_ctx =
+ (ggml_backend_cann_buffer_type_context*)buft->context;
- GGML_UNUSED(buft);
+ return buft_ctx->name.c_str();
}
/**
delete backend;
}
-/**
- * @brief Retrieves the default buffer type associated with the CANN backend.
- *
- * This function returns the buffer type specific to the device associated
- * with the CANN backend. It is used to allocate buffers for computations
- * performed by the backend.
- *
- * @param backend Pointer to the CANN backend structure.
- * @return Pointer to the buffer type structure for the CANN backend.
- */
-static ggml_backend_buffer_type_t
-ggml_backend_cann_get_default_buffer_type(ggml_backend_t backend) {
- ggml_backend_cann_context* cann_ctx =
- (ggml_backend_cann_context*)backend->context;
-
- return ggml_backend_cann_buffer_type(cann_ctx->device);
-}
-
/**
* @brief Sets tensor data asynchronously in the CANN backend.
*
static const ggml_backend_i ggml_backend_cann_interface = {
/* .get_name = */ ggml_backend_cann_name,
/* .free = */ ggml_backend_cann_free,
- /* .get_default_buffer_type = */ ggml_backend_cann_get_default_buffer_type,
/* .set_tensor_async = */ ggml_backend_cann_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cann_get_tensor_async,
/* .cpy_tensor_async = */ ggml_backend_cann_cpy_tensor_async,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_cann_graph_compute,
- /* .supports_op = */ NULL, // moved to device
- /* .supports_buft = */ NULL, // moved to device
- /* .offload_op = */ NULL, // moved to device
/* .event_record = */ ggml_backend_cann_event_record,
/* .event_wait = */ ggml_backend_cann_event_wait,
};
static enum ggml_backend_dev_type ggml_backend_cann_device_get_type(ggml_backend_dev_t dev) {
GGML_UNUSED(dev);
- return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
}
static void ggml_backend_cann_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
}
};
-static const char * ggml_backend_cuda_buffer_get_name(ggml_backend_buffer_t buffer) {
+static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
- return ctx->name.c_str();
+ delete ctx;
}
static bool ggml_backend_buffer_is_cuda(ggml_backend_buffer_t buffer) {
- return buffer->iface.get_name == ggml_backend_cuda_buffer_get_name;
-}
-
-static void ggml_backend_cuda_buffer_free_buffer(ggml_backend_buffer_t buffer) {
- ggml_backend_cuda_buffer_context * ctx = (ggml_backend_cuda_buffer_context *)buffer->context;
- delete ctx;
+ return buffer->iface.free_buffer == ggml_backend_cuda_buffer_free_buffer;
}
static void * ggml_backend_cuda_buffer_get_base(ggml_backend_buffer_t buffer) {
}
static const ggml_backend_buffer_i ggml_backend_cuda_buffer_interface = {
- /* .get_name = */ ggml_backend_cuda_buffer_get_name,
/* .free_buffer = */ ggml_backend_cuda_buffer_free_buffer,
/* .get_base = */ ggml_backend_cuda_buffer_get_base,
/* .init_tensor = */ ggml_backend_cuda_buffer_init_tensor,
ggml_cuda_set_device(buft_ctx->device);
- size = std::max(size, (size_t)1); // cudaMalloc returns null for size 0
-
void * dev_ptr;
cudaError_t err = ggml_cuda_device_malloc(&dev_ptr, size, buft_ctx->device);
if (err != cudaSuccess) {
}
struct ggml_backend_cuda_split_buffer_type_context {
+ int main_device;
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split;
+ std::string name;
};
struct ggml_backend_cuda_split_buffer_context {
std::vector<ggml_tensor_extra_gpu *> tensor_extras;
};
-static const char * ggml_backend_cuda_split_buffer_get_name(ggml_backend_buffer_t buffer) {
- return GGML_CUDA_NAME "_Split";
-
- GGML_UNUSED(buffer);
-}
-
-static bool ggml_backend_buffer_is_cuda_split(ggml_backend_buffer_t buffer) {
- return buffer->iface.get_name == ggml_backend_cuda_split_buffer_get_name;
- GGML_UNUSED(ggml_backend_buffer_is_cuda_split); // only used in debug builds currently, avoid unused function warning in release builds
-}
static void ggml_backend_cuda_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_cuda_split_buffer_context * ctx = (ggml_backend_cuda_split_buffer_context *)buffer->context;
}
static const ggml_backend_buffer_i ggml_backend_cuda_split_buffer_interface = {
- /* .get_name = */ ggml_backend_cuda_split_buffer_get_name,
/* .free_buffer = */ ggml_backend_cuda_split_buffer_free_buffer,
/* .get_base = */ ggml_backend_cuda_split_buffer_get_base,
/* .init_tensor = */ ggml_backend_cuda_split_buffer_init_tensor,
// cuda split buffer type
static const char * ggml_backend_cuda_split_buffer_type_get_name(ggml_backend_buffer_type_t buft) {
- return GGML_CUDA_NAME "_Split";
+ ggml_backend_cuda_split_buffer_type_context * ctx = (ggml_backend_cuda_split_buffer_type_context *)buft->context;
- GGML_UNUSED(buft);
+ return ctx->name.c_str();
}
static bool ggml_backend_buft_is_cuda_split(ggml_backend_buffer_type_t buft) {
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
};
-ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(const float * tensor_split) {
+ggml_backend_buffer_type_t ggml_backend_cuda_split_buffer_type(int main_device, const float * tensor_split) {
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
- static std::map<std::array<float, GGML_CUDA_MAX_DEVICES>, struct ggml_backend_buffer_type> buft_map;
+ static std::map<std::pair<int, std::array<float, GGML_CUDA_MAX_DEVICES>>, struct ggml_backend_buffer_type> buft_map;
std::array<float, GGML_CUDA_MAX_DEVICES> tensor_split_arr = {};
}
}
- auto it = buft_map.find(tensor_split_arr);
+ auto it = buft_map.find({main_device, tensor_split_arr});
if (it != buft_map.end()) {
return &it->second;
}
+ auto * ctx = new ggml_backend_cuda_split_buffer_type_context{
+ main_device,
+ tensor_split_arr,
+ GGML_CUDA_NAME + std::to_string(main_device) + "_Split",
+ };
struct ggml_backend_buffer_type buft {
/* .iface = */ ggml_backend_cuda_split_buffer_type_interface,
- /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), 0),
- /* .context = */ new ggml_backend_cuda_split_buffer_type_context{tensor_split_arr},
+ /* .device = */ ggml_backend_reg_dev_get(ggml_backend_cuda_reg(), main_device),
+ /* .context = */ ctx,
};
- auto result = buft_map.emplace(tensor_split_arr, buft);
+ auto result = buft_map.emplace(std::make_pair(main_device, tensor_split_arr), buft);
return &result.first->second;
}
GGML_UNUSED(buft);
}
-static const char * ggml_backend_cuda_host_buffer_name(ggml_backend_buffer_t buffer) {
- return GGML_CUDA_NAME "_Host";
-
- GGML_UNUSED(buffer);
-}
-
static void ggml_backend_cuda_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
CUDA_CHECK(cudaFreeHost(buffer->context));
}
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft;
- buffer->iface.get_name = ggml_backend_cuda_host_buffer_name;
buffer->iface.free_buffer = ggml_backend_cuda_host_buffer_free_buffer;
return buffer;
const int64_t src1_padded_col_size = GGML_PAD(ne10, MATRIX_ROW_PADDING);
- const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
+ const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
GGML_ASSERT(!(split && ne02 > 1));
GGML_ASSERT(!(split && ne03 > 1));
GGML_ASSERT(!(split && ne02 < ne12));
}
static void ggml_cuda_mul_mat(ggml_backend_cuda_context & ctx, const ggml_tensor * src0, const ggml_tensor * src1, ggml_tensor * dst) {
- const bool split = ggml_backend_buffer_is_cuda_split(src0->buffer);
+ const bool split = ggml_backend_buft_is_cuda_split(src0->buffer->buft);
bool use_dequantize_mul_mat_vec = ggml_cuda_dmmv_type_supported(src0->type)
&& src1->type == GGML_TYPE_F32 && dst->type == GGML_TYPE_F32
GGML_TENSOR_BINARY_OP_LOCALS
- GGML_ASSERT(!ggml_backend_buffer_is_cuda_split(src0->buffer) && "mul_mat_id does not support split buffers");
+ GGML_ASSERT(!ggml_backend_buft_is_cuda_split(src0->buffer->buft) && "mul_mat_id does not support split buffers");
cudaStream_t stream = ctx.stream();
static bool ggml_cuda_compute_forward(ggml_backend_cuda_context & ctx, struct ggml_tensor * dst) {
// why is this here instead of mul_mat?
- if (dst->src[0] != nullptr && ggml_backend_buffer_is_cuda_split(dst->src[0]->buffer)) {
+ if (dst->src[0] != nullptr && ggml_backend_buft_is_cuda_split(dst->src[0]->buffer->buft)) {
ggml_cuda_set_peer_access(dst->src[1]->ne[1], ctx.device);
}
delete backend;
}
-static ggml_backend_buffer_type_t ggml_backend_cuda_get_default_buffer_type(ggml_backend_t backend) {
- ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
-
- return ggml_backend_cuda_buffer_type(cuda_ctx->device);
-}
-
static void ggml_backend_cuda_set_tensor_async(ggml_backend_t backend, ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
ggml_backend_cuda_context * cuda_ctx = (ggml_backend_cuda_context *)backend->context;
ggml_backend_buffer_t buf = tensor->view_src ? tensor->view_src->buffer : tensor->buffer;
continue;
}
- if (node->src[0] && node->src[0]->buffer && ggml_backend_buffer_is_cuda_split(node->src[0]->buffer)) {
+ if (node->src[0] && node->src[0]->buffer && ggml_backend_buft_is_cuda_split(node->src[0]->buffer->buft)) {
use_cuda_graph = false; // Split buffers are not supported by CUDA graph capture
#ifndef NDEBUG
GGML_LOG_DEBUG("%s: disabling CUDA graphs due to split buffer\n", __func__);
for (int j = 0; j < GGML_MAX_SRC; j++) {
if (node->src[j] != nullptr) {
assert(node->src[j]->buffer);
- assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) || ggml_backend_buffer_is_cuda_split(node->src[j]->buffer));
+ assert(node->src[j]->buffer->buft == ggml_backend_cuda_buffer_type(cuda_ctx->device) ||
+ ggml_backend_buft_is_cuda_split(node->src[j]->buffer->buft));
}
}
#endif
cudaError_t stat = cudaGraphExecUpdate(cuda_ctx->cuda_graph->instance, cuda_ctx->cuda_graph->graph, &result_info);
if (stat == cudaErrorGraphExecUpdateFailure) {
#ifndef NDEBUG
- GGML_LOG_ERROR("%s: CUDA graph update failed\n", __func__);
+ GGML_LOG_DEBUG("%s: CUDA graph update failed\n", __func__);
#endif
// The pre-existing graph exec cannot be updated due to violated constraints
// so instead clear error and re-instantiate
static const ggml_backend_i ggml_backend_cuda_interface = {
/* .get_name = */ ggml_backend_cuda_get_name,
/* .free = */ ggml_backend_cuda_free,
- /* .get_default_buffer_type = */ ggml_backend_cuda_get_default_buffer_type,
/* .set_tensor_async = */ ggml_backend_cuda_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_cuda_get_tensor_async,
/* .cpy_tensor_async = */ ggml_backend_cuda_cpy_tensor_async,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_cuda_graph_compute,
- /* .supports_op = */ NULL, // moved to device
- /* .supports_buft = */ NULL, // moved to device
- /* .offload_op = */ NULL, // moved to device
/* .event_record = */ ggml_backend_cuda_event_record,
/* .event_wait = */ ggml_backend_cuda_event_wait,
};
static enum ggml_backend_dev_type ggml_backend_cuda_device_get_type(ggml_backend_dev_t dev) {
GGML_UNUSED(dev);
- return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
}
static void ggml_backend_cuda_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
};
}
-static ggml_backend_t ggml_backend_cuda_device_init(ggml_backend_dev_t dev, const char * params) {
+static ggml_backend_t ggml_backend_cuda_device_init_backend(ggml_backend_dev_t dev, const char * params) {
GGML_UNUSED(params);
ggml_backend_cuda_device_context * ctx = (ggml_backend_cuda_device_context *)dev->context;
return ggml_backend_cuda_init(ctx->device);
return ggml_backend_cuda_host_buffer_type();
}
-static ggml_backend_buffer_t ggml_backend_cuda_device_buffer_from_host_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
- GGML_UNUSED(dev);
- GGML_UNUSED(ptr);
- GGML_UNUSED(size);
- GGML_UNUSED(max_tensor_size);
- return nullptr;
-}
-
// TODO: move these functions here
static bool ggml_backend_cuda_device_supports_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *) dev->context;
+ // split buffers can only be used with GGML_OP_MUL_MAT
+ if (op->op != GGML_OP_MUL_MAT) {
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda_split(op->src[i]->buffer->buft)) {
+ return false;
+ }
+ }
+ }
+
+ // check if all the sources are allocated on this device
+ for (int i = 0; i < GGML_MAX_SRC; i++) {
+ if (op->src[i] && op->src[i]->buffer && ggml_backend_buft_is_cuda(op->src[i]->buffer->buft)) {
+ ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)op->src[i]->buffer->buft->context;
+ if (buft_ctx->device != dev_ctx->device) {
+ return false;
+ }
+ }
+ }
+
switch (op->op) {
case GGML_OP_UNARY:
switch (ggml_get_unary_op(op)) {
}
static bool ggml_backend_cuda_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
- if (ggml_backend_buft_is_cuda_split(buft)) {
- return true;
- }
+ return (ggml_backend_buft_is_cuda(buft) || ggml_backend_buft_is_cuda_split(buft)) && buft->device == dev;
+}
- if (ggml_backend_buft_is_cuda(buft)) {
- ggml_backend_cuda_device_context * dev_ctx = (ggml_backend_cuda_device_context *)dev->context;
- ggml_backend_cuda_buffer_type_context * buft_ctx = (ggml_backend_cuda_buffer_type_context *)buft->context;
- return buft_ctx->device == dev_ctx->device;
+static int64_t get_op_batch_size(const ggml_tensor * op) {
+ switch (op->op) {
+ case GGML_OP_GET_ROWS:
+ return 0;
+ case GGML_OP_MUL_MAT:
+ return op->ne[1];
+ case GGML_OP_MUL_MAT_ID:
+ case GGML_OP_ROPE:
+ return op->ne[2];
+ default:
+ return ggml_nrows(op);
}
-
- return false;
}
static bool ggml_backend_cuda_device_offload_op(ggml_backend_dev_t dev, const ggml_tensor * op) {
const int min_batch_size = 32;
- return (op->ne[1] >= min_batch_size && op->op != GGML_OP_GET_ROWS) ||
- (op->ne[2] >= min_batch_size && op->op == GGML_OP_MUL_MAT_ID);
+ return get_op_batch_size(op) >= min_batch_size;
GGML_UNUSED(dev);
}
/* .get_memory = */ ggml_backend_cuda_device_get_memory,
/* .get_type = */ ggml_backend_cuda_device_get_type,
/* .get_props = */ ggml_backend_cuda_device_get_props,
- /* .init_backend = */ ggml_backend_cuda_device_init,
+ /* .init_backend = */ ggml_backend_cuda_device_init_backend,
/* .get_buffer_type = */ ggml_backend_cuda_device_get_buffer_type,
/* .get_host_buffer_type = */ ggml_backend_cuda_device_get_host_buffer_type,
- /* .buffer_from_host_ptr = */ ggml_backend_cuda_device_buffer_from_host_ptr,
+ /* .buffer_from_host_ptr = */ NULL,
/* .supports_op = */ ggml_backend_cuda_device_supports_op,
/* .supports_buft = */ ggml_backend_cuda_device_supports_buft,
/* .offload_op = */ ggml_backend_cuda_device_offload_op,
}
}
-static const char * ggml_backend_kompute_buffer_get_name(ggml_backend_buffer_t buffer) {
- auto * ctx = static_cast<ggml_backend_kompute_buffer_type_context *>(buffer->buft->context);
- return ctx->name.c_str();
-}
-
static void ggml_backend_kompute_buffer_free_buffer(ggml_backend_buffer_t buffer) {
auto * memory = (ggml_vk_memory *)buffer->context;
if (ggml_vk_has_device()) {
}
static ggml_backend_buffer_i ggml_backend_kompute_buffer_i = {
- /* .get_name = */ ggml_backend_kompute_buffer_get_name,
/* .free_buffer = */ ggml_backend_kompute_buffer_free_buffer,
/* .get_base = */ ggml_backend_kompute_buffer_get_base,
/* .init_tensor = */ NULL,
delete backend;
}
-static ggml_backend_buffer_type_t ggml_backend_kompute_get_default_buffer_type(ggml_backend_t backend) {
- auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
- return ggml_backend_kompute_buffer_type(ctx->device);
-}
-
static ggml_status ggml_backend_kompute_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
auto * ctx = static_cast<ggml_kompute_context *>(backend->context);
ggml_vk_graph_compute(ctx, cgraph);
static struct ggml_backend_i kompute_backend_i = {
/* .get_name = */ ggml_backend_kompute_name,
/* .free = */ ggml_backend_kompute_free,
- /* .get_default_buffer_type = */ ggml_backend_kompute_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_kompute_graph_compute,
- /* .supports_op = */ ggml_backend_kompute_supports_op,
- /* .supports_buft = */ ggml_backend_kompute_supports_buft,
- /* .offload_op = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
// backend interface
-static const char * ggml_backend_metal_buffer_get_name(ggml_backend_buffer_t buffer) {
- return "Metal";
-
- UNUSED(buffer);
-}
-
static void ggml_backend_metal_buffer_free_buffer(ggml_backend_buffer_t buffer) {
struct ggml_backend_metal_buffer_context * ctx = (struct ggml_backend_metal_buffer_context *)buffer->context;
}
static struct ggml_backend_buffer_i ggml_backend_metal_buffer_i = {
- /* .get_name = */ ggml_backend_metal_buffer_get_name,
/* .free_buffer = */ ggml_backend_metal_buffer_free_buffer,
/* .get_base = */ ggml_backend_metal_buffer_get_base,
/* .init_tensor = */ NULL,
return &ggml_backend_buffer_type_metal;
}
+static const char * ggml_backend_metal_buffer_from_ptr_type_get_name(ggml_backend_buffer_type_t buft) {
+ return "Metal_Mapped";
+
+ UNUSED(buft);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_metal_buffer_from_ptr_type(void) {
+ static struct ggml_backend_buffer_type ggml_backend_buffer_from_ptr_type_metal = {
+ /* .iface = */ {
+ /* .get_name = */ ggml_backend_metal_buffer_from_ptr_type_get_name,
+ /* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
+ /* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
+ /* .get_max_size = */ ggml_backend_metal_buffer_type_get_max_size,
+ /* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
+ /* .is_host = */ ggml_backend_metal_buffer_type_is_host,
+ },
+ /* .device = */ &g_ggml_backend_metal_device,
+ /* .context = */ NULL,
+ };
+
+ return &ggml_backend_buffer_from_ptr_type_metal;
+}
+
// TODO: obsoleted by ggml_backend_metal_device_buffer_from_ptr
ggml_backend_buffer_t ggml_backend_metal_buffer_from_ptr(void * data, size_t size, size_t max_size) {
struct ggml_backend_metal_buffer_context * ctx = calloc(1, sizeof(struct ggml_backend_metal_buffer_context));
}
}
- return ggml_backend_buffer_init(ggml_backend_metal_buffer_type(), ggml_backend_metal_buffer_i, ctx, size);
+ return ggml_backend_buffer_init(ggml_backend_metal_buffer_from_ptr_type(), ggml_backend_metal_buffer_i, ctx, size);
}
// backend
free(backend);
}
-static ggml_backend_buffer_type_t ggml_backend_metal_get_default_buffer_type(ggml_backend_t backend) {
- return ggml_backend_metal_buffer_type();
-
- UNUSED(backend);
-}
-
static enum ggml_status ggml_backend_metal_graph_compute(ggml_backend_t backend, struct ggml_cgraph * cgraph) {
return ggml_metal_graph_compute(backend, cgraph);
}
static struct ggml_backend_i ggml_backend_metal_i = {
/* .get_name = */ ggml_backend_metal_name,
/* .free = */ ggml_backend_metal_free,
- /* .get_default_buffer_type = */ ggml_backend_metal_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_metal_graph_compute,
- /* .supports_op = */ NULL,
- /* .supports_buft = */ NULL,
- /* .offload_op = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
}
static enum ggml_backend_dev_type ggml_backend_metal_device_get_type(ggml_backend_dev_t dev) {
- return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
GGML_UNUSED(dev);
}
std::shared_ptr<socket_t> sock;
std::unordered_map<ggml_backend_buffer_t, void *> base_cache;
uint64_t remote_ptr;
- std::string name;
};
// RPC helper functions
return sock;
}
-static const char * ggml_backend_rpc_buffer_get_name(ggml_backend_buffer_t buffer) {
- ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
- return ctx->name.c_str();
-}
-
static void ggml_backend_rpc_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_rpc_buffer_context * ctx = (ggml_backend_rpc_buffer_context *)buffer->context;
rpc_msg_free_buffer_req request = {ctx->remote_ptr};
}
static ggml_backend_buffer_i ggml_backend_rpc_buffer_interface = {
- /* .get_name = */ ggml_backend_rpc_buffer_get_name,
/* .free_buffer = */ ggml_backend_rpc_buffer_free_buffer,
/* .get_base = */ ggml_backend_rpc_buffer_get_base,
/* .init_tensor = */ ggml_backend_rpc_buffer_init_tensor,
if (response.remote_ptr != 0) {
ggml_backend_buffer_t buffer = ggml_backend_buffer_init(buft,
ggml_backend_rpc_buffer_interface,
- new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr, "RPC[" + std::string(buft_ctx->endpoint) + "]"},
+ new ggml_backend_rpc_buffer_context{sock, {}, response.remote_ptr},
response.remote_size);
return buffer;
} else {
delete backend;
}
-static ggml_backend_buffer_type_t ggml_backend_rpc_get_default_buffer_type(ggml_backend_t backend) {
- ggml_backend_rpc_context * ctx = (ggml_backend_rpc_context *)backend->context;
- return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
-}
-
static void ggml_backend_rpc_synchronize(ggml_backend_t backend) {
UNUSED(backend);
// this is no-op because we don't have any async operations
static ggml_backend_i ggml_backend_rpc_interface = {
/* .get_name = */ ggml_backend_rpc_name,
/* .free = */ ggml_backend_rpc_free,
- /* .get_default_buffer_type = */ ggml_backend_rpc_get_default_buffer_type,
/* .set_tensor_async = */ NULL,
/* .get_tensor_async = */ NULL,
/* .cpy_tensor_async = */ NULL,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_rpc_graph_compute,
- /* .supports_op = */ NULL,
- /* .supports_buft = */ NULL,
- /* .offload_op = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
// TODO: obtain value from the server
- return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
UNUSED(dev);
}
}
};
-static const char * ggml_backend_sycl_buffer_get_name(ggml_backend_buffer_t buffer) {
- ggml_backend_sycl_buffer_context * ctx = (ggml_backend_sycl_buffer_context *)buffer->context;
- return ctx->name.c_str();
-}
+static const char * ggml_backend_sycl_buffer_type_get_name(ggml_backend_buffer_type_t buft);
static bool ggml_backend_buffer_is_sycl(ggml_backend_buffer_t buffer) {
- return buffer->iface.get_name == ggml_backend_sycl_buffer_get_name;
+ return buffer->buft->iface.get_name == ggml_backend_sycl_buffer_type_get_name;
}
static void
}
static const ggml_backend_buffer_i ggml_backend_sycl_buffer_interface = {
- /* .get_name = */ ggml_backend_sycl_buffer_get_name,
/* .free_buffer = */ ggml_backend_sycl_buffer_free_buffer,
/* .get_base = */ ggml_backend_sycl_buffer_get_base,
/* .init_tensor = */ ggml_backend_sycl_buffer_init_tensor,
std::vector<queue_ptr> streams;
};
-static const char * ggml_backend_sycl_split_buffer_get_name(ggml_backend_buffer_t buffer) {
- return GGML_SYCL_NAME "_Split";
-
- GGML_UNUSED(buffer);
-}
-
-static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
- return buffer->iface.get_name == ggml_backend_sycl_split_buffer_get_name;
-}
-
static void ggml_backend_sycl_split_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_backend_sycl_split_buffer_context * ctx = (ggml_backend_sycl_split_buffer_context *)buffer->context;
delete ctx;
}
static struct ggml_backend_buffer_i ggml_backend_sycl_split_buffer_interface = {
- /* .get_name = */ ggml_backend_sycl_split_buffer_get_name,
/* .free_buffer = */ ggml_backend_sycl_split_buffer_free_buffer,
/* .get_base = */ ggml_backend_sycl_split_buffer_get_base,
/* .init_tensor = */ ggml_backend_sycl_split_buffer_init_tensor,
GGML_UNUSED(buft);
}
+static bool ggml_backend_buffer_is_sycl_split(ggml_backend_buffer_t buffer) {
+ return buffer->buft->iface.get_name == ggml_backend_sycl_split_buffer_type_get_name;
+}
+
static ggml_backend_buffer_t ggml_backend_sycl_split_buffer_type_alloc_buffer(ggml_backend_buffer_type_t buft, size_t size) {
// since we don't know the exact split after rounding, we cannot allocate the device buffers at this point
// instead, we allocate them for each tensor separately in init_tensor
GGML_UNUSED(buft);
}
-static const char * ggml_backend_sycl_host_buffer_name(ggml_backend_buffer_t buffer) {
- return GGML_SYCL_NAME "_Host";
-
- GGML_UNUSED(buffer);
-}
-
static void ggml_backend_sycl_host_buffer_free_buffer(ggml_backend_buffer_t buffer) {
ggml_sycl_host_free(buffer->context);
}
// FIXME: this is a hack to avoid having to implement a new buffer type
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft;
- buffer->iface.get_name = ggml_backend_sycl_host_buffer_name;
buffer->iface.free_buffer = ggml_backend_sycl_host_buffer_free_buffer;
return buffer;
delete backend;
}
-
-static ggml_backend_buffer_type_t ggml_backend_sycl_get_default_buffer_type(ggml_backend_t backend) {
- ggml_backend_sycl_context * sycl_ctx = (ggml_backend_sycl_context *)backend->context;
- return ggml_backend_sycl_buffer_type(sycl_ctx->device);
-}
-
static void ggml_backend_sycl_set_tensor_async(ggml_backend_t backend,
ggml_tensor *tensor,
const void *data, size_t offset,
static ggml_backend_i ggml_backend_sycl_interface = {
/* .get_name = */ ggml_backend_sycl_get_name,
/* .free = */ ggml_backend_sycl_free,
- /* .get_default_buffer_type = */ ggml_backend_sycl_get_default_buffer_type,
/* .set_tensor_async = */ ggml_backend_sycl_set_tensor_async,
/* .get_tensor_async = */ ggml_backend_sycl_get_tensor_async,
/* .cpy_tensor_async = */ NULL, // ggml_backend_sycl_cpy_tensor_async,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_sycl_graph_compute,
- /* .supports_op = */ NULL, // moved to device
- /* .supports_buft = */ NULL, // moved to device
- /* .offload_op = */ NULL, // moved to device
/* .event_record = */ ggml_backend_sycl_event_record,
/* .event_wait = */ ggml_backend_sycl_event_wait,
};
static enum ggml_backend_dev_type ggml_backend_sycl_device_get_type(ggml_backend_dev_t dev) {
GGML_UNUSED(dev);
- return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
}
static void ggml_backend_sycl_device_get_props(ggml_backend_dev_t dev, ggml_backend_dev_props * props) {
return ctx->devices[index];
}
-static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name)
-{
+static void *ggml_backend_sycl_reg_get_proc_address(ggml_backend_reg_t reg, const char *name) {
GGML_UNUSED(reg);
- if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
- return (void *)ggml_backend_sycl_split_buffer_type;
- }
+
+ // TODO: update to the current function signature
+ //if (strcmp(name, "ggml_backend_split_buffer_type") == 0) {
+ // return (void *)ggml_backend_sycl_split_buffer_type;
+ //}
+
// SYCL doesn't support registering host memory, left here for reference
// "ggml_backend_register_host_buffer"
// "ggml_backend_unregister_host_buffer"
// device backend
-static const char * ggml_backend_vk_buffer_get_name(ggml_backend_buffer_t buffer) {
- ggml_backend_vk_buffer_context * ctx = (ggml_backend_vk_buffer_context *)buffer->context;
- return ctx->name.c_str();
-}
-
static bool ggml_backend_buffer_is_vk(ggml_backend_buffer_t buffer) {
- return buffer->iface.get_name == ggml_backend_vk_buffer_get_name;
+ return buffer->buft->iface.get_name == ggml_backend_vk_buffer_type_name;
}
static void ggml_backend_vk_buffer_free_buffer(ggml_backend_buffer_t buffer) {
}
static ggml_backend_buffer_i ggml_backend_vk_buffer_interface = {
- /* .get_name = */ ggml_backend_vk_buffer_get_name,
/* .free_buffer = */ ggml_backend_vk_buffer_free_buffer,
/* .get_base = */ ggml_backend_vk_buffer_get_base,
/* .init_tensor = */ ggml_backend_vk_buffer_init_tensor,
ggml_backend_buffer_t buffer = ggml_backend_cpu_buffer_from_ptr(ptr, size);
buffer->buft = buft;
- buffer->iface.get_name = ggml_backend_vk_host_buffer_name;
buffer->iface.free_buffer = ggml_backend_vk_host_buffer_free_buffer;
return buffer;
static ggml_backend_i ggml_backend_vk_interface = {
/* .get_name = */ ggml_backend_vk_name,
/* .free = */ ggml_backend_vk_free,
- /* .get_default_buffer_type = */ ggml_backend_vk_get_default_buffer_type,
/* .set_tensor_async = */ NULL, // ggml_backend_vk_set_tensor_async,
/* .get_tensor_async = */ NULL, // ggml_backend_vk_get_tensor_async,
/* .cpy_tensor_async = */ NULL, // ggml_backend_vk_cpy_tensor_async,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_vk_graph_compute,
- /* .supports_op = */ NULL,
- /* .supports_buft = */ NULL,
- /* .offload_op = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
};
//////////////////////////
struct ggml_backend_vk_device_context {
- int device;
+ size_t device;
std::string name;
std::string description;
};
static enum ggml_backend_dev_type ggml_backend_vk_device_get_type(ggml_backend_dev_t dev) {
UNUSED(dev);
- return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+ return GGML_BACKEND_DEVICE_TYPE_GPU;
}
static void ggml_backend_vk_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
props->type = ggml_backend_vk_device_get_type(dev);
ggml_backend_vk_device_get_memory(dev, &props->memory_free, &props->memory_total);
props->caps = {
- /* async */ false,
- /* host_buffer */ true,
- /* events */ false,
+ /* .async = */ false,
+ /* .host_buffer = */ true,
+ /* .buffer_from_host_ptr = */ false,
+ /* .events = */ false,
};
}
static std::mutex mutex;
std::lock_guard<std::mutex> lock(mutex);
if (!initialized) {
- for (size_t i = 0; i < ggml_backend_vk_get_device_count(); i++) {
+ for (int i = 0; i < ggml_backend_vk_get_device_count(); i++) {
ggml_backend_vk_device_context * ctx = new ggml_backend_vk_device_context;
char desc[256];
ggml_backend_vk_get_device_description(i, desc, sizeof(desc));
if (cur_end + size_needed + GGML_OBJECT_SIZE > ctx->mem_size) {
GGML_LOG_WARN("%s: not enough space in the context's memory pool (needed %zu, available %zu)\n",
__func__, cur_end + size_needed + GGML_OBJECT_SIZE, ctx->mem_size);
- assert(false);
+#ifndef NDEBUG
+ GGML_ABORT("not enough space in the context's memory pool");
+#endif
return NULL;
}