printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
-#ifdef GGML_USE_RPC
- printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
-#endif
+ if (llama_supports_rpc()) {
+ printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
+ }
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
}
auto p = string_split<int>(argv[i], split_delim);
params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
-#ifdef GGML_USE_RPC
- } else if (arg == "-rpc" || arg == "--rpc") {
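+    // accept the RPC flags only when the RPC backend is available at runtime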
+ } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) {
if (++i >= argc) {
invalid_param = true;
break;
}
params.rpc_servers.push_back(argv[i]);
-#endif
} else if (arg == "-sm" || arg == "--split-mode") {
if (++i >= argc) {
invalid_param = true;
# include <netdb.h>
# include <unistd.h>
#endif
-#include <string.h>
+#include <cstring>
#define UNUSED GGML_UNUSED
return (enum ggml_status)output[0];
}
-static bool ggml_backend_rpc_supports_op(ggml_backend_t backend, const ggml_tensor * op) {
- UNUSED(backend);
- UNUSED(op);
- //TODO: call the remote backend and cache the results
- return true;
-}
-
-static bool ggml_backend_rpc_supports_buft(ggml_backend_t backend, ggml_backend_buffer_type_t buft) {
- if (!buft || buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
- return false;
- }
- ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
- ggml_backend_rpc_context * rpc_ctx = (ggml_backend_rpc_context *)backend->context;
- return buft_ctx->endpoint == rpc_ctx->endpoint;
-}
-
static ggml_backend_i ggml_backend_rpc_interface = {
/* .get_name = */ ggml_backend_rpc_name,
/* .free = */ ggml_backend_rpc_free,
/* .graph_plan_update = */ NULL,
/* .graph_plan_compute = */ NULL,
/* .graph_compute = */ ggml_backend_rpc_graph_compute,
- /* .supports_op = */ ggml_backend_rpc_supports_op,
- /* .supports_buft = */ ggml_backend_rpc_supports_buft,
+ /* .supports_op = */ NULL,
+ /* .supports_buft = */ NULL,
/* .offload_op = */ NULL,
/* .event_record = */ NULL,
/* .event_wait = */ NULL,
ggml_backend_buffer_type_t buft = new ggml_backend_buffer_type {
/* .iface = */ ggml_backend_rpc_buffer_type_interface,
- /* .device = */ nullptr,
+ /* .device = */ ggml_backend_rpc_add_device(endpoint),
/* .context = */ buft_ctx
};
buft_map[endpoint] = buft;
ggml_backend_t backend = new ggml_backend {
/* .guid = */ ggml_backend_rpc_guid(),
/* .interface = */ ggml_backend_rpc_interface,
- /* .device = */ nullptr,
+ /* .device = */ ggml_backend_rpc_add_device(endpoint),
/* .context = */ ctx
};
return backend;
}
}
-void start_rpc_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) {
+void ggml_backend_rpc_start_server(ggml_backend_t backend, const char * endpoint, size_t free_mem, size_t total_mem) {
std::string host;
int port;
if (!parse_endpoint(endpoint, host, port)) {
WSACleanup();
#endif
}
+
+// device interface
+
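+// each RPC device wraps a single remote server endpoint ("host:port")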
+struct ggml_backend_rpc_device_context {
+ std::string endpoint;
+ std::string name;
+};
+
+static const char * ggml_backend_rpc_device_get_name(ggml_backend_dev_t dev) {
+ ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
+
+ return ctx->name.c_str();
+}
+
+static const char * ggml_backend_rpc_device_get_description(ggml_backend_dev_t dev) {
+ ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
+
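+    // no separate description is available from the server yet, so reuse the device name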
+ return ctx->name.c_str();
+}
+
+static void ggml_backend_rpc_device_get_memory(ggml_backend_dev_t dev, size_t * free, size_t * total) {
+ ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
+
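+    // query free/total memory from the remote server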
+ ggml_backend_rpc_get_device_memory(ctx->endpoint.c_str(), free, total);
+}
+
+static enum ggml_backend_dev_type ggml_backend_rpc_device_get_type(ggml_backend_dev_t dev) {
+ // TODO: obtain value from the server
+ return GGML_BACKEND_DEVICE_TYPE_GPU_FULL;
+
+ UNUSED(dev);
+}
+
+static void ggml_backend_rpc_device_get_props(ggml_backend_dev_t dev, struct ggml_backend_dev_props * props) {
+ props->name = ggml_backend_rpc_device_get_name(dev);
+ props->description = ggml_backend_rpc_device_get_description(dev);
+ props->type = ggml_backend_rpc_device_get_type(dev);
+ ggml_backend_rpc_device_get_memory(dev, &props->memory_free, &props->memory_total);
+ props->caps = {
+ /* .async = */ false,
+ /* .host_buffer = */ false,
+ /* .buffer_from_host_ptr = */ false,
+ /* .events = */ false,
+ };
+}
+
+static ggml_backend_t ggml_backend_rpc_device_init(ggml_backend_dev_t dev, const char * params) {
+ ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
+
+ return ggml_backend_rpc_init(ctx->endpoint.c_str());
+
+ UNUSED(params);
+}
+
+static ggml_backend_buffer_type_t ggml_backend_rpc_device_get_buffer_type(ggml_backend_dev_t dev) {
+ ggml_backend_rpc_device_context * ctx = (ggml_backend_rpc_device_context *)dev->context;
+
+ return ggml_backend_rpc_buffer_type(ctx->endpoint.c_str());
+}
+
+static ggml_backend_buffer_t ggml_backend_rpc_device_buffer_from_ptr(ggml_backend_dev_t dev, void * ptr, size_t size, size_t max_tensor_size) {
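+    // a local host pointer cannot be wrapped by the remote server, so fall back to a plain CPU buffer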
+ return ggml_backend_cpu_buffer_from_ptr(ptr, size);
+
+ UNUSED(dev);
+ UNUSED(max_tensor_size);
+}
+
+static bool ggml_backend_rpc_device_supports_op(ggml_backend_dev_t dev, const struct ggml_tensor * op) {
+ UNUSED(dev);
+ UNUSED(op);
+    // TODO: call the remote backend and cache the results
+ return true;
+}
+
+static bool ggml_backend_rpc_device_supports_buft(ggml_backend_dev_t dev, ggml_backend_buffer_type_t buft) {
+ if (!buft || buft->iface.get_name != ggml_backend_rpc_buffer_type_name) {
+ return false;
+ }
+ ggml_backend_rpc_buffer_type_context * buft_ctx = (ggml_backend_rpc_buffer_type_context *)buft->context;
+ ggml_backend_rpc_device_context * dev_ctx = (ggml_backend_rpc_device_context *)dev->context;
+ return buft_ctx->endpoint == dev_ctx->endpoint;
+}
+
+static const struct ggml_backend_device_i ggml_backend_rpc_device_i = {
+ /* .get_name = */ ggml_backend_rpc_device_get_name,
+ /* .get_description = */ ggml_backend_rpc_device_get_description,
+ /* .get_memory = */ ggml_backend_rpc_device_get_memory,
+ /* .get_type = */ ggml_backend_rpc_device_get_type,
+ /* .get_props = */ ggml_backend_rpc_device_get_props,
+ /* .init_backend = */ ggml_backend_rpc_device_init,
+ /* .get_buffer_type = */ ggml_backend_rpc_device_get_buffer_type,
+ /* .get_host_buffer_type = */ NULL,
+ /* .buffer_from_host_ptr = */ ggml_backend_rpc_device_buffer_from_ptr,
+ /* .supports_op = */ ggml_backend_rpc_device_supports_op,
+ /* .supports_buft = */ ggml_backend_rpc_device_supports_buft,
+ /* .offload_op = */ NULL,
+ /* .event_new = */ NULL,
+ /* .event_free = */ NULL,
+ /* .event_synchronize = */ NULL,
+};
+
+// backend reg interface
+
+static const char * ggml_backend_rpc_reg_get_name(ggml_backend_reg_t reg) {
+ return "RPC";
+
+ UNUSED(reg);
+}
+
+static size_t ggml_backend_rpc_reg_get_device_count(ggml_backend_reg_t reg) {
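+    // RPC devices are not enumerated up front - they are created on demand with ggml_backend_rpc_add_device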
+ return 0;
+
+ UNUSED(reg);
+}
+
+static ggml_backend_dev_t ggml_backend_rpc_reg_get_device(ggml_backend_reg_t reg, size_t index) {
+    GGML_ABORT("The RPC backend does not have enumerated devices - use ggml_backend_rpc_add_device instead");
+
+ UNUSED(reg);
+ UNUSED(index);
+}
+
+static void * ggml_backend_rpc_get_proc_address(ggml_backend_reg_t reg, const char * name) {
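+    // expose ggml_backend_rpc_add_device by name so that callers (e.g. llama.cpp) can resolve it
+    // through the registry without linking against the RPC backend directly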
+ if (std::strcmp(name, "ggml_backend_rpc_add_device") == 0) {
+ return (void *)ggml_backend_rpc_add_device;
+ }
+ return NULL;
+
+ UNUSED(reg);
+}
+
+static const struct ggml_backend_reg_i ggml_backend_rpc_reg_i = {
+ /* .get_name = */ ggml_backend_rpc_reg_get_name,
+ /* .get_device_count = */ ggml_backend_rpc_reg_get_device_count,
+ /* .get_device = */ ggml_backend_rpc_reg_get_device,
+ /* .get_proc_address = */ ggml_backend_rpc_get_proc_address,
+};
+
+ggml_backend_reg_t ggml_backend_rpc_reg(void) {
+ static struct ggml_backend_reg ggml_backend_rpc_reg = {
+ /* .iface = */ ggml_backend_rpc_reg_i,
+ /* .context = */ NULL,
+ };
+
+ return &ggml_backend_rpc_reg;
+}
+
+ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint) {
+ static std::unordered_map<std::string, ggml_backend_dev_t> dev_map;
+
+ static std::mutex mutex;
+ std::lock_guard<std::mutex> lock(mutex);
+
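+    // return the cached device if this endpoint was already added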
+ if (dev_map.find(endpoint) != dev_map.end()) {
+ return dev_map[endpoint];
+ }
+
+ ggml_backend_rpc_device_context * ctx = new ggml_backend_rpc_device_context {
+ /* .endpoint = */ endpoint,
+ /* .name = */ "RPC[" + std::string(endpoint) + "]",
+ };
+
+ ggml_backend_dev_t dev = new ggml_backend_device {
+ /* .iface = */ ggml_backend_rpc_device_i,
+ /* .reg = */ ggml_backend_rpc_reg(),
+ /* .context = */ ctx,
+ };
+
+ dev_map[endpoint] = dev;
+
+ return dev;
+}
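+
+// example (hypothetical endpoint) - create a backend for a remote server via the device interface:
+//
+//   ggml_backend_dev_t dev     = ggml_backend_rpc_add_device("192.168.1.2:50052");
+//   ggml_backend_t     backend = ggml_backend_dev_init(dev, nullptr);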
#include "ggml-alloc.h"
#include "ggml-backend.h"
-#ifdef GGML_USE_RPC
-# include "ggml-rpc.h"
-#endif
-
#if defined(GGML_USE_VULKAN)
# include "ggml-vulkan.h"
#elif defined(GGML_USE_SYCL)
static int llama_get_device_count(const llama_model & model) {
int count = (int) model.devices.size();
-#if defined(GGML_USE_RPC)
- count += (int) model.rpc_servers.size();
-#endif
-
#if defined(GGML_USE_SYCL)
count += ggml_backend_sycl_get_device_count();
#elif defined(GGML_USE_VULKAN)
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int device) {
ggml_backend_buffer_type_t buft = nullptr;
-#if defined(GGML_USE_RPC)
- int rpc_count = (int)model.rpc_servers.size();
- if (device < rpc_count) {
- const char * endpoint = model.rpc_servers[device].c_str();
- return ggml_backend_rpc_buffer_type(endpoint);
- }
- device -= rpc_count;
-#endif
-
if (device < (int)model.devices.size()) {
return ggml_backend_dev_buffer_type(model.devices[device]);
}
}
static size_t llama_get_device_memory(const llama_model & model, int device) {
-#if defined(GGML_USE_RPC)
- int rpc_count = (int)model.rpc_servers.size();
- if (device < rpc_count) {
- size_t total;
- size_t free;
- const char * endpoint = model.rpc_servers[device].c_str();
- ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
- return free;
- }
- device = device - rpc_count;
-#endif
-
if (device < (int)model.devices.size()) {
ggml_backend_dev_t dev = model.devices[device];
size_t total;
bool llama_supports_gpu_offload(void) {
#if defined(GGML_USE_VULKAN) || \
- defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE) || defined(GGML_USE_RPC)
+ defined(GGML_USE_SYCL) || defined(GGML_USE_KOMPUTE)
// Defined when llama.cpp is compiled with support for offloading model layers to GPU.
return true;
#else
return ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU) != nullptr ||
- ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr;
+ ggml_backend_dev_by_type(GGML_BACKEND_DEVICE_TYPE_GPU_FULL) != nullptr ||
+ llama_supports_rpc();
#endif
}
+bool llama_supports_rpc(void) {
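+    // the RPC backend registers itself under the name "RPC" (see ggml_backend_rpc_reg)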
+ return ggml_backend_reg_by_name("RPC") != nullptr;
+}
+
void llama_backend_init(void) {
ggml_time_init();
model->rpc_servers.push_back(servers);
}
+ // add RPC devices
+ if (!model->rpc_servers.empty()) {
+ ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+ if (!rpc_reg) {
+ LLAMA_LOG_ERROR("%s: failed to find RPC backend\n", __func__);
+ llama_free_model(model);
+ return nullptr;
+ }
+
+ // ggml_backend_dev_t ggml_backend_rpc_add_device(const char * endpoint);
+ using ggml_backend_rpc_add_device_t = ggml_backend_dev_t (*)(const char *);
+ ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+ if (!ggml_backend_rpc_add_device_fn) {
+ LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", __func__);
+ llama_free_model(model);
+ return nullptr;
+ }
+
+ for (const std::string & server : model->rpc_servers) {
+ ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+ if (dev) {
+ model->devices.push_back(dev);
+ } else {
+ LLAMA_LOG_ERROR("%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
+ llama_free_model(model);
+ return nullptr;
+ }
+ }
+ }
+
// create list of devices to use with this model
// currently, we use all available devices
// TODO: rework API to give user more control over device selection
} else if (status == -2) {
LLAMA_LOG_INFO("%s: cancelled model load\n", __func__);
}
- delete model;
+ llama_free_model(model);
return nullptr;
}
main_gpu -= (int)model->devices.size();
}
-#if defined(GGML_USE_RPC)
- if (model->n_gpu_layers > 0) {
- for (const auto & endpoint : model->rpc_servers) {
- ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
- if (backend == nullptr) {
- LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
- llama_free(ctx);
- return nullptr;
- }
- ctx->backends.push_back(backend);
- }
- }
- if (main_gpu >= (int)model->rpc_servers.size()) {
- main_gpu -= (int)model->rpc_servers.size();
- }
-#endif
-
#if defined(GGML_USE_VULKAN)
if (model->split_mode == LLAMA_SPLIT_MODE_ROW) {
LLAMA_LOG_ERROR("%s: Row split not supported. Failed to initialize Vulkan backend\n", __func__);