#include <string>
#include <thread>
#include <vector>
+#include <unordered_set>
#include "common.h"
#include "ggml.h"
return join(gpu_list, ", ");
}
+// Parse the value of a -dev/--device argument into a list of backend devices.
+// Accepted forms (after whitespace trimming):
+//   "auto"          -> empty vector: use the default device selection
+//   "none"          -> { nullptr }: use no devices at all
+//   "dev0/dev1/..." -> the named devices, followed by a trailing nullptr terminator
+// Throws std::invalid_argument on an empty value, an empty name between
+// slashes, an unknown device name, or a CPU device (CPU is handled separately).
+static std::vector<ggml_backend_dev_t> parse_devices_arg(const std::string & value) {
+ std::vector<ggml_backend_dev_t> devices;
+ std::string trimmed = string_strip(value);
+ if (trimmed.empty()) {
+ throw std::invalid_argument("no devices specified");
+ }
+ // "auto": an empty list tells the caller to fall back to default selection
+ if (trimmed == "auto") {
+ return devices;
+ }
+
+ auto dev_names = string_split<std::string>(trimmed, '/');
+ // "none": a single nullptr entry means "no devices"
+ if (dev_names.size() == 1 && string_strip(dev_names[0]) == "none") {
+ devices.push_back(nullptr);
+ return devices;
+ }
+
+ for (auto & name : dev_names) {
+ std::string dev_name = string_strip(name);
+ if (dev_name.empty()) {
+ throw std::invalid_argument("invalid device specification");
+ }
+ auto * dev = ggml_backend_dev_by_name(dev_name.c_str());
+ // reject unknown names and the CPU device (not user-selectable here)
+ if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
+ throw std::invalid_argument(string_format("invalid device: %s", dev_name.c_str()));
+ }
+ devices.push_back(dev);
+ }
+
+ // device lists handed to the model params are nullptr-terminated
+ devices.push_back(nullptr);
+ return devices;
+}
+
+// Register RPC devices for a comma-separated list of server endpoints and
+// return the corresponding devices. Device registration is process-global, so
+// a static set of already-registered endpoints prevents registering the same
+// server twice when -rpc is given multiple times.
+// Throws std::invalid_argument if the list is empty, the RPC backend is
+// unavailable, or a device cannot be created for an endpoint.
+static std::vector<ggml_backend_dev_t> register_rpc_device_list(const std::string & servers) {
+ auto rpc_servers = string_split<std::string>(servers, ',');
+ if (rpc_servers.empty()) {
+ throw std::invalid_argument("no RPC servers specified");
+ }
+
+ auto * rpc_reg = ggml_backend_reg_by_name("RPC");
+ if (!rpc_reg) {
+ throw std::invalid_argument("failed to find RPC backend");
+ }
+
+ // the RPC backend may be a dynamically loaded module, so its device-add
+ // entry point is resolved through the registry's proc-address mechanism
+ using add_rpc_device_fn = ggml_backend_dev_t (*)(const char * endpoint);
+ auto * ggml_backend_rpc_add_device_fn = (add_rpc_device_fn) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+ if (!ggml_backend_rpc_add_device_fn) {
+ throw std::invalid_argument("failed to find RPC device add function");
+ }
+
+ // endpoints registered so far in this process (persists across calls)
+ static std::unordered_set<std::string> registered;
+ std::vector<ggml_backend_dev_t> devices;
+ for (const auto & server : rpc_servers) {
+ ggml_backend_dev_t dev = nullptr;
+
+ // registered RPC devices are named "RPC[<endpoint>]"
+ std::string name = string_format("RPC[%s]", server.c_str());
+
+ // if this endpoint was registered before, reuse the existing device
+ if (registered.find(server) != registered.end()) {
+ dev = ggml_backend_dev_by_name(name.c_str());
+ }
+
+ if (!dev) {
+ dev = ggml_backend_rpc_add_device_fn(server.c_str());
+ if (!dev) {
+ throw std::invalid_argument(string_format("failed to add RPC device for server '%s'", server.c_str()));
+ }
+ // make the device discoverable by name for later lookups
+ ggml_backend_device_register(dev);
+ registered.insert(server);
+ }
+
+ devices.push_back(dev);
+ }
+
+ return devices;
+}
+
+// Format a device list for display/output, mirroring parse_devices_arg:
+//   {}          -> "auto"  (default device selection)
+//   { nullptr } -> "none"  (no devices)
+//   otherwise   -> device names joined with '/'; the nullptr terminator
+//                  (and anything after it) is not printed
+static std::string devices_to_string(const std::vector<ggml_backend_dev_t> & devices) {
+ if (devices.empty()) {
+ return "auto";
+ }
+
+ if (devices.size() == 1 && devices[0] == nullptr) {
+ return "none";
+ }
+
+ std::vector<std::string> names;
+ for (auto * dev : devices) {
+ // stop at the nullptr terminator
+ if (dev == nullptr) {
+ break;
+ }
+ names.push_back(ggml_backend_dev_name(dev));
+ }
+
+ return join(names, "/");
+}
+
// command line params
enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL };
std::vector<int> poll;
std::vector<int> n_gpu_layers;
std::vector<int> n_cpu_moe;
- std::vector<std::string> rpc_servers;
std::vector<llama_split_mode> split_mode;
std::vector<int> main_gpu;
std::vector<bool> no_kv_offload;
std::vector<bool> flash_attn;
+ std::vector<std::vector<ggml_backend_dev_t>> devices;
std::vector<std::vector<float>> tensor_split;
std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
std::vector<bool> use_mmap;
/* poll */ { 50 },
/* n_gpu_layers */ { 99 },
/* n_cpu_moe */ { 0 },
- /* rpc_servers */ { "" },
/* split_mode */ { LLAMA_SPLIT_MODE_LAYER },
/* main_gpu */ { 0 },
/* no_kv_offload */ { false },
/* flash_attn */ { false },
+ /* devices */ { {} },
/* tensor_split */ { std::vector<float>(llama_max_devices(), 0.0f) },
/* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
/* use_mmap */ { true },
output_format_str(cmd_params_defaults.output_format));
printf(" -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n",
output_format_str(cmd_params_defaults.output_format_stderr));
+ printf(" --list-devices list available devices and exit\n");
printf(" -v, --verbose verbose output\n");
printf(" --progress print test progress indicators\n");
printf(" --no-warmup skip warmup runs before benchmarking\n");
+ if (llama_supports_rpc()) {
+ printf(" -rpc, --rpc <rpc_servers> register RPC devices (comma separated)\n");
+ }
printf("\n");
printf("test parameters:\n");
printf(" -m, --model <filename> (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
join(cmd_params_defaults.n_gpu_layers, ",").c_str());
printf(" -ncmoe, --n-cpu-moe <n> (default: %s)\n",
join(cmd_params_defaults.n_cpu_moe, ",").c_str());
- if (llama_supports_rpc()) {
- printf(" -rpc, --rpc <rpc_servers> (default: %s)\n",
- join(cmd_params_defaults.rpc_servers, ",").c_str());
- }
printf(" -sm, --split-mode <none|layer|row> (default: %s)\n",
join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
printf(" -mg, --main-gpu <i> (default: %s)\n",
join(cmd_params_defaults.no_kv_offload, ",").c_str());
printf(" -fa, --flash-attn <0|1> (default: %s)\n",
join(cmd_params_defaults.flash_attn, ",").c_str());
+ printf(" -dev, --device <dev0/dev1/...> (default: auto)\n");
printf(" -mmp, --mmap <0|1> (default: %s)\n",
join(cmd_params_defaults.use_mmap, ",").c_str());
printf(" -embd, --embeddings <0|1> (default: %s)\n",
break;
}
params.type_v.insert(params.type_v.end(), types.begin(), types.end());
+ } else if (arg == "-dev" || arg == "--device") {
+ if (++i >= argc) {
+ invalid_param = true;
+ break;
+ }
+ auto combos = string_split<std::string>(argv[i], split_delim);
+ for (const auto & combo : combos) {
+ try {
+ params.devices.push_back(parse_devices_arg(combo));
+ } catch (const std::exception & e) {
+ fprintf(stderr, "error: %s\n", e.what());
+ invalid_param = true;
+ break;
+ }
+ }
+ if (invalid_param) {
+ break;
+ }
+ } else if (arg == "--list-devices") {
+ std::vector<ggml_backend_dev_t> devices;
+ for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+ auto * dev = ggml_backend_dev_get(i);
+ if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
+ devices.push_back(dev);
+ }
+ }
+ printf("Available devices:\n");
+ if (devices.empty()) {
+ printf(" (none)\n");
+ }
+ for (auto * dev : devices) {
+ size_t free, total;
+ ggml_backend_dev_memory(dev, &free, &total);
+ printf(" %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+ }
+ exit(0);
} else if (arg == "-t" || arg == "--threads") {
if (++i >= argc) {
invalid_param = true;
invalid_param = true;
break;
}
- params.rpc_servers.push_back(argv[i]);
+ try {
+ register_rpc_device_list(argv[i]);
+ } catch (const std::exception & e) {
+ fprintf(stderr, "error: %s\n", e.what());
+ invalid_param = true;
+ break;
+ }
} else if (arg == "-sm" || arg == "--split-mode") {
if (++i >= argc) {
invalid_param = true;
if (params.n_cpu_moe.empty()) {
params.n_cpu_moe = cmd_params_defaults.n_cpu_moe;
}
- if (params.rpc_servers.empty()) {
- params.rpc_servers = cmd_params_defaults.rpc_servers;
- }
if (params.split_mode.empty()) {
params.split_mode = cmd_params_defaults.split_mode;
}
if (params.flash_attn.empty()) {
params.flash_attn = cmd_params_defaults.flash_attn;
}
+ if (params.devices.empty()) {
+ params.devices = cmd_params_defaults.devices;
+ }
if (params.tensor_split.empty()) {
params.tensor_split = cmd_params_defaults.tensor_split;
}
int poll;
int n_gpu_layers;
int n_cpu_moe;
- std::string rpc_servers_str;
llama_split_mode split_mode;
int main_gpu;
bool no_kv_offload;
bool flash_attn;
+ std::vector<ggml_backend_dev_t> devices;
std::vector<float> tensor_split;
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
bool use_mmap;
llama_model_params mparams = llama_model_default_params();
mparams.n_gpu_layers = n_gpu_layers;
- if (!rpc_servers_str.empty()) {
- auto rpc_servers = string_split<std::string>(rpc_servers_str, ',');
-
- // add RPC devices
- if (!rpc_servers.empty()) {
- ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
- if (!rpc_reg) {
- fprintf(stderr, "%s: failed to find RPC backend\n", __func__);
- exit(1);
- }
-
- typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
- ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
- if (!ggml_backend_rpc_add_device_fn) {
- fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
- exit(1);
- }
- static std::vector<ggml_backend_dev_t> devices;
- devices.clear();
- // RPC devices should always come first for performance reasons
- for (const std::string & server : rpc_servers) {
- ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
- if (dev) {
- devices.push_back(dev);
- } else {
- fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
- exit(1);
- }
- }
- // FIXME: use llama.cpp device selection logic
- // add local GPU devices if any
- for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
- ggml_backend_dev_t dev = ggml_backend_dev_get(i);
- switch (ggml_backend_dev_type(dev)) {
- case GGML_BACKEND_DEVICE_TYPE_CPU:
- case GGML_BACKEND_DEVICE_TYPE_ACCEL:
- // skip CPU backends since they are handled separately
- break;
-
- case GGML_BACKEND_DEVICE_TYPE_GPU:
- devices.push_back(dev);
- break;
-
- case GGML_BACKEND_DEVICE_TYPE_IGPU:
- // iGPUs are not used when there are RPC servers
- break;
- }
- }
- devices.push_back(nullptr);
- mparams.devices = devices.data();
- }
+ if (!devices.empty()) {
+ mparams.devices = const_cast<ggml_backend_dev_t *>(devices.data());
}
mparams.split_mode = split_mode;
mparams.main_gpu = main_gpu;
bool equal_mparams(const cmd_params_instance & other) const {
return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
- rpc_servers_str == other.rpc_servers_str && split_mode == other.split_mode &&
+ split_mode == other.split_mode &&
main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split &&
+ devices == other.devices &&
vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
}
for (const auto & m : params.model)
for (const auto & nl : params.n_gpu_layers)
for (const auto & ncmoe : params.n_cpu_moe)
- for (const auto & rpc : params.rpc_servers)
for (const auto & sm : params.split_mode)
for (const auto & mg : params.main_gpu)
+ for (const auto & devs : params.devices)
for (const auto & ts : params.tensor_split)
for (const auto & ot : params.tensor_buft_overrides)
for (const auto & mmp : params.use_mmap)
/* .poll = */ pl,
/* .n_gpu_layers = */ nl,
/* .n_cpu_moe = */ ncmoe,
- /* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
+ /* .devices = */ devs,
/* .tensor_split = */ ts,
/* .tensor_buft_overrides = */ ot,
/* .use_mmap = */ mmp,
/* .poll = */ pl,
/* .n_gpu_layers = */ nl,
/* .n_cpu_moe = */ ncmoe,
- /* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
+ /* .devices = */ devs,
/* .tensor_split = */ ts,
/* .tensor_buft_overrides = */ ot,
/* .use_mmap = */ mmp,
/* .poll = */ pl,
/* .n_gpu_layers = */ nl,
/* .n_cpu_moe = */ ncmoe,
- /* .rpc_servers = */ rpc,
/* .split_mode = */ sm,
/* .main_gpu = */ mg,
/* .no_kv_offload= */ nkvo,
/* .flash_attn = */ fa,
+ /* .devices = */ devs,
/* .tensor_split = */ ts,
/* .tensor_buft_overrides = */ ot,
/* .use_mmap = */ mmp,
int main_gpu;
bool no_kv_offload;
bool flash_attn;
+ std::vector<ggml_backend_dev_t> devices;
std::vector<float> tensor_split;
std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
bool use_mmap;
main_gpu = inst.main_gpu;
no_kv_offload = inst.no_kv_offload;
flash_attn = inst.flash_attn;
+ devices = inst.devices;
tensor_split = inst.tensor_split;
tensor_buft_overrides = inst.tensor_buft_overrides;
use_mmap = inst.use_mmap;
static const std::vector<std::string> & get_fields() {
static const std::vector<std::string> fields = {
- "build_commit", "build_number", "cpu_info", "gpu_info", "backends",
- "model_filename", "model_type", "model_size", "model_n_params", "n_batch",
- "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll",
- "type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode",
- "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides",
- "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen",
- "n_depth", "test_time", "avg_ns", "stddev_ns", "avg_ts",
- "stddev_ts"
+ "build_commit", "build_number", "cpu_info", "gpu_info", "backends",
+ "model_filename", "model_type", "model_size", "model_n_params", "n_batch",
+ "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll",
+ "type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode",
+ "main_gpu", "no_kv_offload", "flash_attn", "devices", "tensor_split",
+ "tensor_buft_overrides", "use_mmap", "embeddings", "no_op_offload",
+ "n_prompt", "n_gen", "n_depth", "test_time", "avg_ns",
+ "stddev_ns", "avg_ts", "stddev_ts"
};
return fields;
}
std::to_string(main_gpu),
std::to_string(no_kv_offload),
std::to_string(flash_attn),
+ devices_to_string(devices),
tensor_split_str,
tensor_buft_overrides_str,
std::to_string(use_mmap),
if (field == "flash_attn") {
return 2;
}
+ if (field == "devices") {
+ return -12;
+ }
if (field == "use_mmap") {
return 4;
}
if (field == "no_op_offload") {
return "nopo";
}
+ if (field == "devices") {
+ return "dev";
+ }
if (field == "tensor_split") {
return "ts";
}
if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
fields.emplace_back("flash_attn");
}
+ if (params.devices.size() > 1 || params.devices != cmd_params_defaults.devices) {
+ fields.emplace_back("devices");
+ }
if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
fields.emplace_back("tensor_split");
}