return devices;
}
+static void add_rpc_devices(std::string servers) {
+ auto rpc_servers = string_split<std::string>(servers, ',');
+ if (rpc_servers.empty()) {
+ throw std::invalid_argument("no RPC servers specified");
+ }
+ ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+ if (!rpc_reg) {
+ throw std::invalid_argument("failed to find RPC backend");
+ }
+ // resolve the device-add function through the registry's proc-address
+ // mechanism, so this code does not link against the RPC backend directly
+ typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
+ ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+ if (!ggml_backend_rpc_add_device_fn) {
+ throw std::invalid_argument("failed to find RPC device add function");
+ }
+ // register one device per endpoint; registered devices become visible
+ // to all later model loads through the global backend registry
+ for (const auto & server : rpc_servers) {
+ ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+ if (dev) {
+ ggml_backend_device_register(dev);
+ } else {
+ throw std::invalid_argument("failed to register RPC device");
+ }
+ }
+}
+
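For context, a minimal sketch (not part of the patch) of how the eagerly registered devices become visible through the public enumeration API. The endpoints are hypothetical, and this assumes add_rpc_devices is reachable from the calling translation unit:

    #include "ggml-backend.h"
    #include <cstdio>

    int main() {
        // hypothetical endpoints of two running rpc-server instances
        add_rpc_devices("192.168.1.10:50052,192.168.1.11:50052");

        // the RPC devices now show up alongside the local ones
        for (size_t i = 0; i < ggml_backend_dev_count(); i++) {
            ggml_backend_dev_t dev = ggml_backend_dev_get(i);
            printf("device %zu: %s (%s)\n", i,
                   ggml_backend_dev_name(dev), ggml_backend_dev_description(dev));
        }
        return 0;
    }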
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **)) {
auto ctx_arg = common_params_parser_init(params, ex, print_usage);
const common_params params_org = ctx_arg.params; // the example can modify the default params
{"--rpc"}, "SERVERS",
"comma separated list of RPC servers",
[](common_params & params, const std::string & value) {
- params.rpc_servers = value;
+ add_rpc_devices(value);
+ GGML_UNUSED(params);
}
).set_env("LLAMA_ARG_RPC"));
}
if (params.n_gpu_layers != -1) {
mparams.n_gpu_layers = params.n_gpu_layers;
}
- mparams.rpc_servers = params.rpc_servers.c_str();
mparams.main_gpu = params.main_gpu;
mparams.split_mode = params.split_mode;
mparams.tensor_split = params.tensor_split;
std::string lookup_cache_static = ""; // path of static ngram cache file for lookup decoding // NOLINT
std::string lookup_cache_dynamic = ""; // path of dynamic ngram cache file for lookup decoding // NOLINT
std::string logits_file = ""; // file for saving *all* logits // NOLINT
- std::string rpc_servers = ""; // comma separated list of RPC servers // NOLINT
std::vector<std::string> in_files; // all input files
std::vector<std::string> antiprompt; // strings upon which more user input is prompted (a.k.a. reverse prompts)
bool cpu_strict;
int poll;
int n_gpu_layers;
- std::string rpc_servers;
+ std::string rpc_servers_str;
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
llama_model_params mparams = llama_model_default_params();
mparams.n_gpu_layers = n_gpu_layers;
- if (!rpc_servers.empty()) {
- mparams.rpc_servers = rpc_servers.c_str();
+ if (!rpc_servers_str.empty()) {
+ auto rpc_servers = string_split<std::string>(rpc_servers_str, ',');
+
+ // add RPC devices
+ if (!rpc_servers.empty()) {
+ ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
+ if (!rpc_reg) {
+ fprintf(stderr, "%s: failed to find RPC backend\n", __func__);
+ exit(1);
+ }
+
+ // same proc-address lookup as in common/arg.cpp: avoids a direct
+ // link-time dependency on the RPC backend
+ typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
+ ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+ if (!ggml_backend_rpc_add_device_fn) {
+ fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
+ exit(1);
+ }
+ // static so the array outlives this function: mparams.devices stores a
+ // raw pointer into it after we return
+ static std::vector<ggml_backend_dev_t> devices;
+ devices.clear();
+ for (const std::string & server : rpc_servers) {
+ ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
+ if (dev) {
+ devices.push_back(dev);
+ } else {
+ fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
+ exit(1);
+ }
+ }
+ devices.push_back(nullptr); // mparams.devices must be null-terminated
+ mparams.devices = devices.data();
+ }
}
mparams.split_mode = split_mode;
mparams.main_gpu = main_gpu;
}
bool equal_mparams(const cmd_params_instance & other) const {
- return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers == other.rpc_servers &&
+ return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str &&
split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap &&
tensor_split == other.tensor_split;
}
// Backend registry
//
+ GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
+
// Backend (reg) enumeration
GGML_API size_t ggml_backend_reg_count(void);
GGML_API ggml_backend_reg_t ggml_backend_reg_get(size_t index);
// Internal backend registry API
GGML_API void ggml_backend_register(ggml_backend_reg_t reg);
- GGML_API void ggml_backend_device_register(ggml_backend_dev_t device);
// Add backend dynamic loading support to the backend
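A sketch of how the now-public registration entry point composes with the dynamic loader; the library path here is hypothetical and platform-dependent:

    #include "ggml-backend.h"

    void load_rpc_backend() {
        // hypothetical path; real builds use platform-specific library names
        ggml_backend_reg_t reg = ggml_backend_load("./libggml-rpc.so");
        if (reg == nullptr) {
            return; // backend not available
        }
        // once loaded, devices created by the backend (e.g. through the
        // proc-address lookup shown above) can be published with
        // ggml_backend_device_register() and enumerated by any caller
    }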
// proportion of the model (layers or rows) to offload to each GPU, size: llama_max_devices()
const float * tensor_split;
- // comma separated list of RPC servers to use for offloading
- const char * rpc_servers;
-
// Called with a progress value between 0.0 and 1.0. Pass NULL to disable.
// If the provided progress_callback returns true, model loading continues.
// If it returns false, model loading is immediately aborted.
/*.split_mode =*/ LLAMA_SPLIT_MODE_LAYER,
/*.main_gpu =*/ 0,
/*.tensor_split =*/ nullptr,
- /*.rpc_servers =*/ nullptr,
/*.progress_callback =*/ nullptr,
/*.progress_callback_user_data =*/ nullptr,
/*.kv_overrides =*/ nullptr,
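Since rpc_servers is gone from llama_model_params, callers that want explicit placement now pass a null-terminated device list instead. A sketch under the assumption that the desired devices were already registered; the function name and model path are illustrative, and on older checkouts the entry point is llama_load_model_from_file:

    #include "llama.h"
    #include "ggml-backend.h"

    llama_model * load_on_first_device(const char * path) {
        // null-terminated list, mirroring what llama-bench builds above;
        // static so the pointer stored in mparams stays valid
        static ggml_backend_dev_t devices[] = { ggml_backend_dev_get(0), nullptr };

        llama_model_params mparams = llama_model_default_params();
        mparams.devices      = devices; // overrides automatic device selection
        mparams.n_gpu_layers = 99;
        return llama_model_load_from_file(path, mparams);
    }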
// gguf metadata
std::unordered_map<std::string, std::string> gguf_kv;
- std::vector<std::string> rpc_servers;
-
// list of devices used in this model
std::vector<ggml_backend_dev_t> devices;
};
}
- if (params.rpc_servers != nullptr && params.rpc_servers[0] != '\0') {
- // split the servers set them into model->rpc_servers
- std::string servers(params.rpc_servers);
- size_t pos = 0;
- while ((pos = servers.find(',')) != std::string::npos) {
- std::string server = servers.substr(0, pos);
- model->rpc_servers.push_back(server);
- servers.erase(0, pos + 1);
- }
- model->rpc_servers.push_back(servers);
- }
-
- // add RPC devices
- if (!model->rpc_servers.empty()) {
- ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
- if (!rpc_reg) {
- LLAMA_LOG_ERROR("%s: failed to find RPC backend\n", __func__);
- llama_model_free(model);
- return nullptr;
- }
-
- typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
- ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
- if (!ggml_backend_rpc_add_device_fn) {
- LLAMA_LOG_ERROR("%s: failed to find RPC device add function\n", __func__);
- llama_model_free(model);
- return nullptr;
- }
-
- for (const std::string & server : model->rpc_servers) {
- ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
- if (dev) {
- model->devices.push_back(dev);
- } else {
- LLAMA_LOG_ERROR("%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
- llama_model_free(model);
- return nullptr;
- }
- }
- }
-
// create list of devices to use with this model
if (params.devices) {
for (ggml_backend_dev_t * dev = params.devices; *dev; ++dev) {