printf(" --cpu-strict <0|1> (default: %s)\n", join(cmd_params_defaults.cpu_strict, ",").c_str());
printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str());
printf(" -ngl, --n-gpu-layers <n> (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str());
+#ifdef GGML_USE_RPC
printf(" -rpc, --rpc <rpc_servers> (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str());
+#endif
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n", join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n", join(cmd_params_defaults.main_gpu, ",").c_str());
printf(" -nkvo, --no-kv-offload <0|1> (default: %s)\n", join(cmd_params_defaults.no_kv_offload, ",").c_str());
}
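// example invocation (hypothetical endpoints; requires a build with GGML_USE_RPC):
//   llama-bench -m model.gguf -ngl 99 --rpc 192.168.1.2:50052,192.168.1.3:50052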
            auto p = string_split<int>(argv[i], split_delim);
            params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end());
+#ifdef GGML_USE_RPC
        } else if (arg == "-rpc" || arg == "--rpc") {
            if (++i >= argc) {
                invalid_param = true;
                break;
            }
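            // the value is a comma-separated list of RPC server endpoints (host:port),
            // passed through to llama_model_params.rpc_servers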
            params.rpc_servers.push_back(argv[i]);
+#endif
        } else if (arg == "-sm" || arg == "--split-mode") {
            if (++i >= argc) {
                invalid_param = true;
static ggml_backend_buffer_type_t llama_default_buffer_type_offload(const llama_model & model, int gpu) {
    ggml_backend_buffer_type_t buft = nullptr;
-#if defined(GGML_USE_RPC)
-    int dev_count = (int)llama_get_device_count(model);
+#ifdef GGML_USE_RPC
    int rpc_count = (int)model.rpc_servers.size();
-    if (gpu >= dev_count - rpc_count) {
-        const char * endpoint = model.rpc_servers[gpu - dev_count + rpc_count].c_str();
+#else
+    int rpc_count = 0;
+#endif
+    int local_gpu = gpu - rpc_count;
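+    // RPC devices now occupy indices [0, rpc_count); local backends are indexed from rpc_count upward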
+#if defined(GGML_USE_RPC)
+    if (gpu < rpc_count) {
+        const char * endpoint = model.rpc_servers[gpu].c_str();
        return ggml_backend_rpc_buffer_type(endpoint);
    }
#endif
#if defined(GGML_USE_METAL)
    buft = ggml_backend_metal_buffer_type();
#elif defined(GGML_USE_CUDA)
-    buft = ggml_backend_cuda_buffer_type(gpu);
+    buft = ggml_backend_cuda_buffer_type(local_gpu);
#elif defined(GGML_USE_VULKAN)
-    buft = ggml_backend_vk_buffer_type(gpu);
+    buft = ggml_backend_vk_buffer_type(local_gpu);
#elif defined(GGML_USE_SYCL)
-    buft = ggml_backend_sycl_buffer_type(gpu);
+    buft = ggml_backend_sycl_buffer_type(local_gpu);
#elif defined(GGML_USE_KOMPUTE)
-    buft = ggml_backend_kompute_buffer_type(gpu);
+    buft = ggml_backend_kompute_buffer_type(local_gpu);
    if (buft == nullptr) {
-        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, gpu);
+        LLAMA_LOG_WARN("%s: cannot use GPU %d, check `vulkaninfo --summary`\n", __func__, local_gpu);
    }
#elif defined(GGML_USE_CANN)
-    buft = ggml_backend_cann_buffer_type(gpu);
+    buft = ggml_backend_cann_buffer_type(local_gpu);
#endif
    if (buft == nullptr) {
        // fall back to the CPU buffer type
        buft = llama_default_buffer_type_cpu(true);
    }
    return buft;

    GGML_UNUSED(model);
-    GGML_UNUSED(gpu);
+    GGML_UNUSED(local_gpu);
}
static ggml_backend_buffer_type_t llama_default_buffer_type_split(const llama_model & model, int fallback_gpu, const float * tensor_split) {
    // ...
}
static size_t llama_get_device_memory(const llama_model & model, int device) {
-#if defined(GGML_USE_RPC)
-    int dev_count = (int)llama_get_device_count(model);
+#ifdef GGML_USE_RPC
    int rpc_count = (int)model.rpc_servers.size();
-    if (device >= dev_count - rpc_count) {
+#else
+    int rpc_count = 0;
+#endif
+    int local_device = device - rpc_count;
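+    // same indexing convention as llama_default_buffer_type_offload: RPC devices first, then local ones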
+#if defined(GGML_USE_RPC)
+    if (device < rpc_count) {
        size_t total;
        size_t free;
-        const char * endpoint = model.rpc_servers[device - dev_count + rpc_count].c_str();
+        const char * endpoint = model.rpc_servers[device].c_str();
        ggml_backend_rpc_get_device_memory(endpoint, &free, &total);
        return free;
    }
#endif
#if defined(GGML_USE_CUDA)
    size_t total;
    size_t free;
-    ggml_backend_cuda_get_device_memory(device, &free, &total);
+    ggml_backend_cuda_get_device_memory(local_device, &free, &total);
    return free;
#elif defined(GGML_USE_SYCL)
    size_t total;
    size_t free;
-    ggml_backend_sycl_get_device_memory(device, &free, &total);
+    ggml_backend_sycl_get_device_memory(local_device, &free, &total);
    return free;
#elif defined(GGML_USE_VULKAN)
    size_t total;
    size_t free;
-    ggml_backend_vk_get_device_memory(device, &free, &total);
+    ggml_backend_vk_get_device_memory(local_device, &free, &total);
    return free;
#elif defined(GGML_USE_CANN)
    size_t total;
    size_t free;
-    ggml_backend_cann_get_device_memory(device, &free, &total);
+    ggml_backend_cann_get_device_memory(local_device, &free, &total);
    return free;
#else
    return 1;
#endif
    GGML_UNUSED(model);
-    GGML_UNUSED(device);
+    GGML_UNUSED(local_device);
}
    if (!hparams.vocab_only) {
        // initialize backends
+#if defined(GGML_USE_RPC)
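+        // RPC backends are initialized first so that their position in ctx->backends
+        // matches the device indexing used by llama_default_buffer_type_offload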
+        if (model->n_gpu_layers > 0) {
+            for (const auto & endpoint : model->rpc_servers) {
+                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
+                if (backend == nullptr) {
+                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
+                    llama_free(ctx);
+                    return nullptr;
+                }
+                ctx->backends.push_back(backend);
+            }
+        }
+#endif
+
#if defined(GGML_USE_METAL)
        if (model->n_gpu_layers > 0) {
            ctx->backend_metal = ggml_backend_metal_init();
        }
#endif
-#if defined(GGML_USE_RPC)
-        if (model->n_gpu_layers > 0) {
-            for (const auto & endpoint : model->rpc_servers) {
-                ggml_backend_t backend = ggml_backend_rpc_init(endpoint.c_str());
-                if (backend == nullptr) {
-                    LLAMA_LOG_ERROR("%s: failed to initialize RPC to '%s'\n", __func__, endpoint.c_str());
-                    llama_free(ctx);
-                    return nullptr;
-                }
-                ctx->backends.push_back(backend);
-            }
-        }
-#endif
        ctx->backend_cpu = ggml_backend_cpu_init();
        if (ctx->backend_cpu == nullptr) {
            LLAMA_LOG_ERROR("%s: failed to initialize CPU backend\n", __func__);