}
// utils
-ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
- GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
-
- size_t alignment = ggml_backend_buft_get_alignment(buft);
-
- size_t nbytes = 0;
- for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
- if (t->data == NULL && t->view_src == NULL) {
- nbytes += GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
- }
- }
-
- if (nbytes == 0) {
- // all the tensors in the context are already allocated
-#ifndef NDEBUG
- fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
-#endif
- return NULL;
- }
- ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, nbytes);
+static bool alloc_tensor_range(struct ggml_context * ctx,
+ struct ggml_tensor * first, struct ggml_tensor * last,
+ ggml_backend_buffer_type_t buft, size_t size,
+ ggml_backend_buffer_t ** buffers, size_t * n_buffers) {
+ ggml_backend_buffer_t buffer = ggml_backend_buft_alloc_buffer(buft, size);
if (buffer == NULL) {
- // failed to allocate buffer
#ifndef NDEBUG
- fprintf(stderr, "%s: failed to allocate buffer\n", __func__);
+ fprintf(stderr, "%s: failed to allocate %s buffer of size %zu\n", __func__, ggml_backend_buft_name(buft), size);
#endif
- return NULL;
+ for (size_t i = 0; i < *n_buffers; i++) {
+ ggml_backend_buffer_free(*buffers[i]);
+ }
+ free(buffers);
+ return false;
}
ggml_tallocr_t tallocr = ggml_tallocr_new_from_buffer(buffer);
- for (struct ggml_tensor * t = ggml_get_first_tensor(ctx); t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+ for (struct ggml_tensor * t = first; t != last; t = ggml_get_next_tensor(ctx, t)) {
if (t->data == NULL) {
if (t->view_src == NULL) {
ggml_tallocr_alloc(tallocr, t);
ggml_tallocr_free(tallocr);
+ *buffers = realloc(*buffers, sizeof(ggml_backend_buffer_t) * (*n_buffers + 1));
+ (*buffers)[(*n_buffers)++] = buffer;
+
+ return true;
+}
+
+ggml_backend_buffer_t ggml_backend_alloc_ctx_tensors_from_buft(struct ggml_context * ctx, ggml_backend_buffer_type_t buft) {
+ GGML_ASSERT(ggml_get_no_alloc(ctx) == true);
+
+ size_t alignment = ggml_backend_buft_get_alignment(buft);
+ size_t max_size = ggml_backend_buft_get_max_size(buft);
+
+ ggml_backend_buffer_t * buffers = NULL;
+ size_t n_buffers = 0;
+
+ size_t cur_buf_size = 0;
+ struct ggml_tensor * first = ggml_get_first_tensor(ctx);
+ for (struct ggml_tensor * t = first; t != NULL; t = ggml_get_next_tensor(ctx, t)) {
+ size_t this_size = 0;
+ if (t->data == NULL && t->view_src == NULL) {
+ this_size = GGML_PAD(ggml_backend_buft_get_alloc_size(buft, t), alignment);
+ }
+
+ if (this_size > max_size) {
+ // tensor is too large to fit in a single buffer
+ fprintf(stderr, "%s: tensor %s is too large to fit in a %s buffer (tensor size: %zu, max buffer size: %zu)\n",
+ __func__, t->name,
+ ggml_backend_buft_name(buft),
+ this_size, max_size);
+ for (size_t i = 0; i < n_buffers; i++) {
+ ggml_backend_buffer_free(buffers[i]);
+ }
+ free(buffers);
+ return NULL;
+ }
+
+ if ((cur_buf_size + this_size) > max_size) {
+ // allocate tensors in the current buffer
+ if (!alloc_tensor_range(ctx, first, t, buft, cur_buf_size, &buffers, &n_buffers)) {
+ return NULL;
+ }
+ first = t;
+ cur_buf_size = this_size;
+ } else {
+ cur_buf_size += this_size;
+ }
+ }
+
+ // allocate remaining tensors
+ if (cur_buf_size > 0) {
+ if (!alloc_tensor_range(ctx, first, NULL, buft, cur_buf_size, &buffers, &n_buffers)) {
+ return NULL;
+ }
+ }
+
+ if (n_buffers == 0) {
+ // all the tensors in the context are already allocated
+#ifndef NDEBUG
+ fprintf(stderr, "%s: all tensors in the context are already allocated\n", __func__);
+#endif
+ return NULL;
+ }
+
+ ggml_backend_buffer_t buffer;
+ if (n_buffers == 1) {
+ buffer = buffers[0];
+ } else {
+ buffer = ggml_backend_multi_buffer_alloc_buffer(buffers, n_buffers);
+ }
+ free(buffers);
return buffer;
}
const char * (*GGML_CALL get_name) (ggml_backend_buffer_type_t buft);
ggml_backend_buffer_t (*GGML_CALL alloc_buffer) (ggml_backend_buffer_type_t buft, size_t size);
size_t (*GGML_CALL get_alignment) (ggml_backend_buffer_type_t buft); // tensor alignment
+ size_t (*GGML_CALL get_max_size) (ggml_backend_buffer_type_t buft); // allocation max size
size_t (*GGML_CALL get_alloc_size) (ggml_backend_buffer_type_t buft, const struct ggml_tensor * tensor); // data size needed to allocate the tensor, including padding
bool (*GGML_CALL supports_backend)(ggml_backend_buffer_type_t buft, ggml_backend_t backend); // check if the buffer type is usable by the backend
// check if tensor data is in host memory
// do not use directly, use ggml_backend_tensor_copy instead
bool ggml_backend_buffer_copy_tensor(const struct ggml_tensor * src, struct ggml_tensor * dst);
+ // buffer that contains a collection of buffers
+ GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers);
+ GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer);
+ GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage);
+
//
// Backend
//
return buft->iface.get_alignment(buft);
}
+size_t ggml_backend_buft_get_max_size(ggml_backend_buffer_type_t buft) {
+ // get_max_size is optional, defaults to SIZE_MAX
+ if (buft->iface.get_max_size) {
+ return buft->iface.get_max_size(buft);
+ }
+ return SIZE_MAX;
+}
+
GGML_CALL size_t ggml_backend_buft_get_alloc_size(ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor) {
// get_alloc_size is optional, defaults to ggml_nbytes
if (buft->iface.get_alloc_size) {
size_t size) {
ggml_backend_buffer_t buffer = malloc(sizeof(struct ggml_backend_buffer));
- GGML_ASSERT(iface.get_base != NULL);
-
(*buffer) = (struct ggml_backend_buffer) {
/* .interface = */ iface,
/* .buft = */ buft,
return ggml_backend_buft_get_alignment(ggml_backend_buffer_get_type(buffer));
}
+size_t ggml_backend_buffer_get_max_size(ggml_backend_buffer_t buffer) {
+ return ggml_backend_buft_get_max_size(ggml_backend_buffer_get_type(buffer));
+}
+
size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor) {
return ggml_backend_buft_get_alloc_size(ggml_backend_buffer_get_type(buffer), tensor);
}
void ggml_backend_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
buffer->usage = usage;
+
+ // FIXME: add a generic callback to the buffer interface
+ if (ggml_backend_buffer_is_multi_buffer(buffer)) {
+ ggml_backend_multi_buffer_set_usage(buffer, usage);
+ }
}
ggml_backend_buffer_type_t ggml_backend_buffer_get_type(ggml_backend_buffer_t buffer) {
return ggml_backend_buft_get_alignment(ggml_backend_get_default_buffer_type(backend));
}
+size_t ggml_backend_get_max_size(ggml_backend_t backend) {
+ return ggml_backend_buft_get_max_size(ggml_backend_get_default_buffer_type(backend));
+}
+
void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size) {
GGML_ASSERT(tensor->data != NULL && "tensor not allocated");
GGML_ASSERT(offset + size <= ggml_nbytes(tensor) && "tensor write out of bounds");
extern GGML_CALL ggml_backend_buffer_type_t ggml_backend_metal_buffer_type(void);
ggml_backend_register("Metal", ggml_backend_reg_metal_init, ggml_backend_metal_buffer_type(), NULL);
#endif
+
+#ifdef GGML_USE_VULKAN
+ extern GGML_CALL int ggml_backend_vk_reg_devices(void);
+ ggml_backend_vk_reg_devices();
+#endif
}
GGML_CALL void ggml_backend_register(const char * name, ggml_backend_init_fn init_fn, ggml_backend_buffer_type_t default_buffer_type, void * user_data) {
/* .get_name = */ ggml_backend_cpu_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
/* .get_name = */ ggml_backend_cpu_hbm_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_cpu_hbm_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_cpu_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type_is_host,
GGML_UNUSED(user_data);
}
+// multi-buffer buffer
+
+struct ggml_backend_multi_buffer_context {
+ ggml_backend_buffer_t * buffers;
+ size_t n_buffers;
+};
+
+typedef struct ggml_backend_multi_buffer_context * ggml_backend_multi_buffer_context_t;
+
+GGML_CALL static const char * ggml_backend_multi_buffer_get_name(ggml_backend_buffer_t buffer) {
+ ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+
+ return ctx->buffers[0]->iface.get_name(ctx->buffers[0]);
+}
+
+GGML_CALL static void ggml_backend_multi_buffer_free_buffer(ggml_backend_buffer_t buffer) {
+ ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
+ ggml_backend_buffer_free(ctx->buffers[i]);
+ }
+
+ free(ctx->buffers);
+ free(ctx);
+}
+
+GGML_CALL static void ggml_backend_multi_buffer_clear(ggml_backend_buffer_t buffer, uint8_t value) {
+ ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
+ ggml_backend_buffer_clear(ctx->buffers[i], value);
+ }
+}
+
+static struct ggml_backend_buffer_i ggml_backend_multi_buffer_context_interface(void) {
+ static struct ggml_backend_buffer_i multi_backend_buffer_i = {
+ /* .get_name = */ ggml_backend_multi_buffer_get_name,
+ /* .free_buffer = */ ggml_backend_multi_buffer_free_buffer,
+ /* .get_base = */ NULL,
+ /* .init_tensor = */ NULL,
+ /* .set_tensor = */ NULL,
+ /* .get_tensor = */ NULL,
+ /* .cpy_tensor = */ NULL,
+ /* .clear = */ ggml_backend_multi_buffer_clear,
+ /* .reset = */ NULL,
+ };
+
+ return multi_backend_buffer_i;
+}
+
+GGML_CALL ggml_backend_buffer_t ggml_backend_multi_buffer_alloc_buffer(ggml_backend_buffer_t * buffers, size_t n_buffers) {
+ ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) malloc(sizeof(struct ggml_backend_multi_buffer_context));
+ ctx->n_buffers = n_buffers;
+ ctx->buffers = (ggml_backend_buffer_t *) malloc(n_buffers * sizeof(ggml_backend_buffer_t));
+
+ size_t total_size = 0;
+ for (size_t i = 0; i < n_buffers; i++) {
+ ctx->buffers[i] = buffers[i];
+ total_size += ggml_backend_buffer_get_size(buffers[i]);
+ }
+
+ return ggml_backend_buffer_init(buffers[0]->buft, ggml_backend_multi_buffer_context_interface(), ctx, total_size);
+}
+
+GGML_CALL bool ggml_backend_buffer_is_multi_buffer(ggml_backend_buffer_t buffer) {
+ return buffer->iface.get_name == ggml_backend_multi_buffer_get_name;
+}
+
+GGML_CALL void ggml_backend_multi_buffer_set_usage(ggml_backend_buffer_t buffer, enum ggml_backend_buffer_usage usage) {
+ GGML_ASSERT(ggml_backend_buffer_is_multi_buffer(buffer));
+ ggml_backend_multi_buffer_context_t ctx = (ggml_backend_multi_buffer_context_t) buffer->context;
+ for (size_t i = 0; i < ctx->n_buffers; i++) {
+ ggml_backend_buffer_set_usage(ctx->buffers[i], usage);
+ }
+}
+
// scheduler
GGML_API const char * ggml_backend_buft_name (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL ggml_backend_buffer_t ggml_backend_buft_alloc_buffer (ggml_backend_buffer_type_t buft, size_t size);
GGML_API size_t ggml_backend_buft_get_alignment (ggml_backend_buffer_type_t buft);
+ GGML_API size_t ggml_backend_buft_get_max_size (ggml_backend_buffer_type_t buft);
GGML_API GGML_CALL size_t ggml_backend_buft_get_alloc_size (ggml_backend_buffer_type_t buft, struct ggml_tensor * tensor);
GGML_API bool ggml_backend_buft_supports_backend(ggml_backend_buffer_type_t buft, ggml_backend_t backend);
GGML_API bool ggml_backend_buft_is_host (ggml_backend_buffer_type_t buft);
GGML_API size_t ggml_backend_buffer_get_size (ggml_backend_buffer_t buffer);
GGML_API GGML_CALL void ggml_backend_buffer_init_tensor (ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API size_t ggml_backend_buffer_get_alignment (ggml_backend_buffer_t buffer);
+ GGML_API size_t ggml_backend_buffer_get_max_size (ggml_backend_buffer_t buffer);
GGML_API size_t ggml_backend_buffer_get_alloc_size(ggml_backend_buffer_t buffer, struct ggml_tensor * tensor);
GGML_API void ggml_backend_buffer_clear (ggml_backend_buffer_t buffer, uint8_t value);
GGML_API bool ggml_backend_buffer_is_host (ggml_backend_buffer_t buffer);
GGML_API ggml_backend_buffer_type_t ggml_backend_get_default_buffer_type(ggml_backend_t backend);
GGML_API ggml_backend_buffer_t ggml_backend_alloc_buffer(ggml_backend_t backend, size_t size);
GGML_API size_t ggml_backend_get_alignment(ggml_backend_t backend);
+ GGML_API size_t ggml_backend_get_max_size(ggml_backend_t backend);
GGML_API void ggml_backend_tensor_set_async(ggml_backend_t backend, struct ggml_tensor * tensor, const void * data, size_t offset, size_t size);
GGML_API void ggml_backend_tensor_get_async(ggml_backend_t backend, const struct ggml_tensor * tensor, void * data, size_t offset, size_t size);
/* .get_name = */ ggml_backend_cuda_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cuda_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cuda_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cuda_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_cuda_buffer_type_supports_backend,
/* .is_host = */ NULL,
/* .get_name = */ ggml_backend_cuda_split_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cuda_split_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cuda_split_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cuda_split_buffer_type_get_alloc_size,
/* .supports_backend = */ ggml_backend_cuda_split_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_cuda_split_buffer_type_is_host,
/* .get_name = */ ggml_backend_cuda_host_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_cuda_host_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
/* .get_name = */ ggml_backend_metal_buffer_type_get_name,
/* .alloc_buffer = */ ggml_backend_metal_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_metal_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // TODO: return device.maxBufferLength
/* .get_alloc_size = */ NULL, // defaults to ggml_nbytes
/* .supports_backend = */ ggml_backend_metal_buffer_type_supports_backend,
/* .is_host = */ ggml_backend_metal_buffer_type_is_host,
/* .get_name = */ ggml_backend_opencl_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_opencl_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_opencl_buffer_type_get_alignment,
+ /* .get_max_size = */ NULL, // TODO: return from device info
/* .get_alloc_size = */ NULL,
/* .supports_backend = */ ggml_backend_opencl_buffer_type_supports_backend,
/* .is_host = */ NULL,
/* .get_name = */ ggml_backend_opencl_host_buffer_type_name,
/* .alloc_buffer = */ ggml_backend_opencl_host_buffer_type_alloc_buffer,
/* .get_alignment = */ ggml_backend_cpu_buffer_type()->iface.get_alignment,
+ /* .get_max_size = */ NULL, // defaults to SIZE_MAX
/* .get_alloc_size = */ ggml_backend_cpu_buffer_type()->iface.get_alloc_size,
/* .supports_backend = */ ggml_backend_cpu_buffer_type()->iface.supports_backend,
/* .is_host = */ ggml_backend_cpu_buffer_type()->iface.is_host,
#include "ggml-cuda.h"
#elif defined(GGML_USE_CLBLAST)
#include "ggml-opencl.h"
+#elif defined(GGML_USE_VULKAN)
+#include "ggml-vulkan.h"
#elif defined(GGML_USE_SYCL)
#include "ggml-sycl.h"
#endif
ggml_init_cublas();
#elif defined(GGML_USE_CLBLAST)
ggml_cl_init();
+#elif defined(GGML_USE_VULKAN)
+ ggml_vk_init();
#elif defined(GGML_USE_SYCL)
ggml_init_sycl();
#endif
const int ith = params->ith;
const int nth = params->nth;
-#ifdef GGML_USE_CLBLAST
+#if defined(GGML_USE_CLBLAST)
if (src1->backend == GGML_BACKEND_GPU) {
// TODO: OpenCL kernel support full broadcast
GGML_ASSERT(ggml_can_repeat_rows(src1, src0));
}
GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
+#elif defined(GGML_USE_VULKAN)
+ const bool skip_cpu = ggml_vk_compute_forward(params, tensor);
+#ifdef GGML_VULKAN_CHECK_RESULTS
+ if (skip_cpu) {
+ ggml_vk_check_results_1(params, tensor);
+ }
+#endif
+ if (skip_cpu) {
+ return;
+ }
+ GGML_ASSERT(tensor->src[0] == NULL || tensor->src[0]->backend == GGML_BACKEND_CPU);
+ GGML_ASSERT(tensor->src[1] == NULL || tensor->src[1]->backend == GGML_BACKEND_CPU);
#endif // GGML_USE_CUBLAS
#ifdef GGML_USE_SYCL
}
}
+#ifdef GGML_USE_VULKAN
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ ggml_vk_preallocate_buffers_graph(cgraph->nodes[i]);
+ }
+ ggml_vk_preallocate_buffers();
+
+ for (int i = 0; i < cgraph->n_nodes; i++) {
+ ggml_vk_build_graph(cgraph->nodes[i], i == cgraph->n_nodes - 1);
+ }
+#endif
+
const int n_threads = cplan->n_threads;
struct ggml_compute_state_shared state_shared = {
}
}
+#ifdef GGML_USE_VULKAN
+ ggml_vk_graph_cleanup();
+#endif
+
// performance stats (graph)
{
int64_t perf_cycles_cur = ggml_perf_cycles() - perf_start_cycles;
}
int ggml_cpu_has_blas(void) {
-#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
+#if defined(GGML_USE_ACCELERATE) || defined(GGML_USE_OPENBLAS) || defined(GGML_USE_CUBLAS) || defined(GGML_USE_VULKAN) || defined(GGML_USE_CLBLAST) || defined(GGML_USE_SYCL)
return 1;
#else
return 0;
#endif
}
+int ggml_cpu_has_vulkan(void) {
+#if defined(GGML_USE_VULKAN)
+ return 1;
+#else
+ return 0;
+#endif
+}
+
int ggml_cpu_has_sycl(void) {
#if defined(GGML_USE_SYCL)
return 1;
}
int ggml_cpu_has_gpublas(void) {
- return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_sycl();
+ return ggml_cpu_has_cublas() || ggml_cpu_has_clblast() || ggml_cpu_has_vulkan() || ggml_cpu_has_sycl();
}
int ggml_cpu_has_sse3(void) {
GGML_API int ggml_cpu_has_blas (void);
GGML_API int ggml_cpu_has_cublas (void);
GGML_API int ggml_cpu_has_clblast (void);
+ GGML_API int ggml_cpu_has_vulkan (void);
GGML_API int ggml_cpu_has_gpublas (void);
GGML_API int ggml_cpu_has_sse3 (void);
GGML_API int ggml_cpu_has_ssse3 (void);