From: ssweens
Date: Fri, 19 Sep 2025 22:15:21 +0000 (-0700)
Subject: llama-bench: add --devices and --list-devices support (#16039)
X-Git-Tag: upstream/0.0.6527~5
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=be79d9fdd95ab8955527c4aaa67b90e8b9516718;p=pkg%2Fggml%2Fsources%2Fllama.cpp

llama-bench: add --devices and --list-devices support (#16039)

* llama-bench: add --devices support

  - Support --devices, same as llama-server
  - Provide for benchmarking different device combinations
  - Include --list-devices like llama-server for convenience

* fix: field display ordering restored

* fix: integrated the RPC devices

  - aimed to mimic the server as much as possible

* cleanup: defaults for list-devices

  - handle duplicate device listing with RPC

* cleanup: remove duplicate device load calls

* docs: update llama-bench

  - added the recently added n-cpu-moe option to the docs while in there

* llama-bench: RPC device simplification

  - RPC servers unify with other devices earlier, simplifying code
  - --list-devices made stateless and simpler
  - various cleanup
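Example usage (device names below are illustrative; run --list-devices on the
target machine to see the real ones):

    # list selectable (non-CPU) devices and exit
    llama-bench --list-devices

    # sweep two configurations in one run: CUDA0+CUDA1, then no offload devices
    llama-bench -m model.gguf -ngl 99 -dev CUDA0/CUDA1,none

    # register an RPC endpoint first, then select it like any local device
    llama-bench -m model.gguf -rpc 192.168.1.2:50052 -dev 'RPC[192.168.1.2:50052]'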
---

diff --git a/tools/llama-bench/README.md b/tools/llama-bench/README.md
index bf7fd29c..ead4da45 100644
--- a/tools/llama-bench/README.md
+++ b/tools/llama-bench/README.md
@@ -30,8 +30,10 @@ options:
   --delay <0...N> (seconds)                 delay between each test (default: 0)
   -o, --output <csv|json|jsonl|md|sql>      output format printed to stdout (default: md)
   -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: none)
+  --list-devices                            list available devices and exit
   -v, --verbose                             verbose output
   --progress                                print test progress indicators
+  -rpc, --rpc <rpc_servers>                 register RPC devices (comma separated)
 
 test parameters:
   -m, --model <filename>                    (default: models/7B/ggml-model-q4_0.gguf)
@@ -48,11 +50,12 @@ test parameters:
   --cpu-strict <0|1>                        (default: 0)
   --poll <0...100>                          (default: 50)
   -ngl, --n-gpu-layers <n>                  (default: 99)
-  -rpc, --rpc <rpc_servers>                 (default: none)
+  -ncmoe, --n-cpu-moe <n>                   (default: 0)
   -sm, --split-mode <none|layer|row>        (default: layer)
   -mg, --main-gpu <i>                       (default: 0)
   -nkvo, --no-kv-offload <0|1>              (default: 0)
   -fa, --flash-attn <0|1>                   (default: 0)
+  -dev, --device <dev0/dev1/..>             (default: auto)
   -mmp, --mmap <0|1>                        (default: 1)
   -embd, --embeddings <0|1>                 (default: 0)
   -ts, --tensor-split <ts0/ts1/..>          (default: 0)
diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp
index ad47bf14..275ba367 100644
--- a/tools/llama-bench/llama-bench.cpp
+++ b/tools/llama-bench/llama-bench.cpp
@@ -17,6 +17,7 @@
 #include <string>
 #include <thread>
 #include <vector>
+#include <unordered_set>
 
 #include "common.h"
 #include "ggml.h"
@@ -135,6 +136,101 @@ static std::string get_gpu_info() {
     return join(gpu_list, ", ");
 }
 
+static std::vector<ggml_backend_dev_t> parse_devices_arg(const std::string & value) {
+    std::vector<ggml_backend_dev_t> devices;
+    std::string trimmed = string_strip(value);
+    if (trimmed.empty()) {
+        throw std::invalid_argument("no devices specified");
+    }
+    if (trimmed == "auto") {
+        return devices;
+    }
+
+    auto dev_names = string_split<std::string>(trimmed, '/');
+    if (dev_names.size() == 1 && string_strip(dev_names[0]) == "none") {
+        devices.push_back(nullptr);
+        return devices;
+    }
+
+    for (auto & name : dev_names) {
+        std::string dev_name = string_strip(name);
+        if (dev_name.empty()) {
+            throw std::invalid_argument("invalid device specification");
+        }
+        auto * dev = ggml_backend_dev_by_name(dev_name.c_str());
+        if (!dev || ggml_backend_dev_type(dev) == GGML_BACKEND_DEVICE_TYPE_CPU) {
+            throw std::invalid_argument(string_format("invalid device: %s", dev_name.c_str()));
+        }
+        devices.push_back(dev);
+    }
+
+    devices.push_back(nullptr);
+    return devices;
+}
+
+static std::vector<ggml_backend_dev_t> register_rpc_device_list(const std::string & servers) {
+    auto rpc_servers = string_split<std::string>(servers, ',');
+    if (rpc_servers.empty()) {
+        throw std::invalid_argument("no RPC servers specified");
+    }
+
+    auto * rpc_reg = ggml_backend_reg_by_name("RPC");
+    if (!rpc_reg) {
+        throw std::invalid_argument("failed to find RPC backend");
+    }
+
+    using add_rpc_device_fn = ggml_backend_dev_t (*)(const char * endpoint);
+    auto * ggml_backend_rpc_add_device_fn = (add_rpc_device_fn) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
+    if (!ggml_backend_rpc_add_device_fn) {
+        throw std::invalid_argument("failed to find RPC device add function");
+    }
+
+    static std::unordered_set<std::string> registered;
+    std::vector<ggml_backend_dev_t> devices;
+    for (const auto & server : rpc_servers) {
+        ggml_backend_dev_t dev = nullptr;
+
+        std::string name = string_format("RPC[%s]", server.c_str());
+
+        if (registered.find(server) != registered.end()) {
+            dev = ggml_backend_dev_by_name(name.c_str());
+        }
+
+        if (!dev) {
+            dev = ggml_backend_rpc_add_device_fn(server.c_str());
+            if (!dev) {
+                throw std::invalid_argument(string_format("failed to add RPC device for server '%s'", server.c_str()));
+            }
+            ggml_backend_device_register(dev);
+            registered.insert(server);
+        }
+
+        devices.push_back(dev);
+    }
+
+    return devices;
+}
+
+static std::string devices_to_string(const std::vector<ggml_backend_dev_t> & devices) {
+    if (devices.empty()) {
+        return "auto";
+    }
+
+    if (devices.size() == 1 && devices[0] == nullptr) {
+        return "none";
+    }
+
+    std::vector<std::string> names;
+    for (auto * dev : devices) {
+        if (dev == nullptr) {
+            break;
+        }
+        names.push_back(ggml_backend_dev_name(dev));
+    }
+
+    return join(names, "/");
+}
+
 // command line params
 
 enum output_formats { NONE, CSV, JSON, JSONL, MARKDOWN, SQL };
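For reference, a minimal standalone sketch of the contract implemented by the
helpers above. It assumes it is built alongside llama-bench.cpp so that
parse_devices_arg() and devices_to_string() are visible; the device names are
hypothetical (use --list-devices for real ones), and this main() is
illustration only, not part of the patch:

    #include <cstdio>
    #include <string>
    #include <vector>

    static void demo(const std::string & arg) {
        std::vector<ggml_backend_dev_t> devs = parse_devices_arg(arg);
        printf("%-12s -> %zu entries, reported as '%s'\n",
               arg.c_str(), devs.size(), devices_to_string(devs).c_str());
    }

    int main() {
        demo("auto");        // 0 entries: keep llama.cpp's default device selection
        demo("none");        // 1 entry (just the terminator): no offload devices
        demo("CUDA0/CUDA1"); // 3 entries: two devices plus the nullptr terminator
        return 0;
    }

Note the asymmetry: "auto" must yield an empty vector so the model params are
left untouched, while every explicit list (including "none") ends in nullptr,
because llama_model_params::devices is documented in llama.h as a
NULL-terminated array.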
@@ -251,11 +347,11 @@ struct cmd_params {
     std::vector<int>                         poll;
     std::vector<int>                         n_gpu_layers;
     std::vector<int>                         n_cpu_moe;
-    std::vector<std::string>                 rpc_servers;
     std::vector<llama_split_mode>            split_mode;
     std::vector<int>                         main_gpu;
     std::vector<bool>                        no_kv_offload;
     std::vector<bool>                        flash_attn;
+    std::vector<std::vector<ggml_backend_dev_t>> devices;
     std::vector<std::vector<float>>          tensor_split;
     std::vector<std::vector<llama_model_tensor_buft_override>> tensor_buft_overrides;
     std::vector<bool>                        use_mmap;
@@ -288,11 +384,11 @@ static const cmd_params cmd_params_defaults = {
     /* poll                 */ { 50 },
     /* n_gpu_layers         */ { 99 },
     /* n_cpu_moe            */ { 0 },
-    /* rpc_servers          */ { "" },
     /* split_mode           */ { LLAMA_SPLIT_MODE_LAYER },
     /* main_gpu             */ { 0 },
     /* no_kv_offload        */ { false },
     /* flash_attn           */ { false },
+    /* devices              */ { {} },
     /* tensor_split         */ { std::vector<float>(llama_max_devices(), 0.0f) },
     /* tensor_buft_overrides*/ { std::vector<llama_model_tensor_buft_override>{ { nullptr, nullptr } } },
     /* use_mmap             */ { true },
@@ -325,9 +421,13 @@ static void print_usage(int /* argc */, char ** argv) {
            output_format_str(cmd_params_defaults.output_format));
     printf("  -oe, --output-err <csv|json|jsonl|md|sql> output format printed to stderr (default: %s)\n",
            output_format_str(cmd_params_defaults.output_format_stderr));
+    printf("  --list-devices                            list available devices and exit\n");
     printf("  -v, --verbose                             verbose output\n");
     printf("  --progress                                print test progress indicators\n");
     printf("  --no-warmup                               skip warmup runs before benchmarking\n");
+    if (llama_supports_rpc()) {
+        printf("  -rpc, --rpc <rpc_servers>                 register RPC devices (comma separated)\n");
+    }
     printf("\n");
     printf("test parameters:\n");
     printf("  -m, --model <filename>                    (default: %s)\n", join(cmd_params_defaults.model, ",").c_str());
@@ -357,10 +457,6 @@ static void print_usage(int /* argc */, char ** argv) {
            join(cmd_params_defaults.n_gpu_layers, ",").c_str());
     printf("  -ncmoe, --n-cpu-moe <n>                   (default: %s)\n",
            join(cmd_params_defaults.n_cpu_moe, ",").c_str());
-    if (llama_supports_rpc()) {
-        printf("  -rpc, --rpc <rpc_servers>                 (default: %s)\n",
-               join(cmd_params_defaults.rpc_servers, ",").c_str());
-    }
     printf("  -sm, --split-mode <none|layer|row>        (default: %s)\n",
            join(transform_to_str(cmd_params_defaults.split_mode, split_mode_str), ",").c_str());
     printf("  -mg, --main-gpu <i>                       (default: %s)\n",
@@ -369,6 +465,7 @@ static void print_usage(int /* argc */, char ** argv) {
            join(cmd_params_defaults.no_kv_offload, ",").c_str());
     printf("  -fa, --flash-attn <0|1>                   (default: %s)\n",
            join(cmd_params_defaults.flash_attn, ",").c_str());
+    printf("  -dev, --device <dev0/dev1/..>             (default: auto)\n");
     printf("  -mmp, --mmap <0|1>                        (default: %s)\n",
            join(cmd_params_defaults.use_mmap, ",").c_str());
     printf("  -embd, --embeddings <0|1>                 (default: %s)\n",
@@ -533,6 +630,42 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 break;
             }
             params.type_v.insert(params.type_v.end(), types.begin(), types.end());
+        } else if (arg == "-dev" || arg == "--device") {
+            if (++i >= argc) {
+                invalid_param = true;
+                break;
+            }
+            auto combos = string_split<std::string>(argv[i], split_delim);
+            for (const auto & combo : combos) {
+                try {
+                    params.devices.push_back(parse_devices_arg(combo));
+                } catch (const std::exception & e) {
+                    fprintf(stderr, "error: %s\n", e.what());
+                    invalid_param = true;
+                    break;
+                }
+            }
+            if (invalid_param) {
+                break;
+            }
+        } else if (arg == "--list-devices") {
+            std::vector<ggml_backend_dev_t> devices;
+            for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
+                auto * dev = ggml_backend_dev_get(i);
+                if (ggml_backend_dev_type(dev) != GGML_BACKEND_DEVICE_TYPE_CPU) {
+                    devices.push_back(dev);
+                }
+            }
+            printf("Available devices:\n");
+            if (devices.empty()) {
+                printf("  (none)\n");
+            }
+            for (auto * dev : devices) {
+                size_t free, total;
+                ggml_backend_dev_memory(dev, &free, &total);
+                printf("  %s: %s (%zu MiB, %zu MiB free)\n", ggml_backend_dev_name(dev), ggml_backend_dev_description(dev), total / 1024 / 1024, free / 1024 / 1024);
+            }
+            exit(0);
         } else if (arg == "-t" || arg == "--threads") {
             if (++i >= argc) {
                 invalid_param = true;
@@ -580,7 +713,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
                 invalid_param = true;
                 break;
             }
-            params.rpc_servers.push_back(argv[i]);
+            try {
+                register_rpc_device_list(argv[i]);
+            } catch (const std::exception & e) {
+                fprintf(stderr, "error: %s\n", e.what());
+                invalid_param = true;
+                break;
+            }
         } else if (arg == "-sm" || arg == "--split-mode") {
             if (++i >= argc) {
                 invalid_param = true;
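Because -rpc is handled at parse time, registering the same endpoint twice is
harmless; a small sketch of that property (hypothetical endpoint, assumes a
reachable rpc-server and the register_rpc_device_list() helper from this
patch):

    #include <cassert>
    #include <string>
    #include <vector>

    static void check_rpc_dedup() {
        // First call registers a device named "RPC[127.0.0.1:50052]".
        std::vector<ggml_backend_dev_t> a = register_rpc_device_list("127.0.0.1:50052");
        // Second call finds the endpoint in the static `registered` set and
        // resolves the existing device via ggml_backend_dev_by_name() rather
        // than registering a duplicate.
        std::vector<ggml_backend_dev_t> b = register_rpc_device_list("127.0.0.1:50052");
        assert(a.size() == 1 && b.size() == 1 && a[0] == b[0]);
    }

This is also what lets a later -dev argument select an RPC device by its
registered name, the same way it selects a local one.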
@@ -855,9 +994,6 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.n_cpu_moe.empty()) {
         params.n_cpu_moe = cmd_params_defaults.n_cpu_moe;
     }
-    if (params.rpc_servers.empty()) {
-        params.rpc_servers = cmd_params_defaults.rpc_servers;
-    }
     if (params.split_mode.empty()) {
         params.split_mode = cmd_params_defaults.split_mode;
     }
@@ -870,6 +1006,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) {
     if (params.flash_attn.empty()) {
         params.flash_attn = cmd_params_defaults.flash_attn;
     }
+    if (params.devices.empty()) {
+        params.devices = cmd_params_defaults.devices;
+    }
     if (params.tensor_split.empty()) {
         params.tensor_split = cmd_params_defaults.tensor_split;
     }
@@ -916,11 +1055,11 @@ struct cmd_params_instance {
     int              poll;
     int              n_gpu_layers;
    int              n_cpu_moe;
-    std::string      rpc_servers_str;
     llama_split_mode split_mode;
     int              main_gpu;
     bool             no_kv_offload;
     bool             flash_attn;
+    std::vector<ggml_backend_dev_t> devices;
     std::vector<float> tensor_split;
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
     bool             use_mmap;
@@ -931,57 +1070,8 @@ struct cmd_params_instance {
         llama_model_params mparams = llama_model_default_params();
 
         mparams.n_gpu_layers = n_gpu_layers;
-        if (!rpc_servers_str.empty()) {
-            auto rpc_servers = string_split<std::string>(rpc_servers_str, ',');
-
-            // add RPC devices
-            if (!rpc_servers.empty()) {
-                ggml_backend_reg_t rpc_reg = ggml_backend_reg_by_name("RPC");
-                if (!rpc_reg) {
-                    fprintf(stderr, "%s: failed to find RPC backend\n", __func__);
-                    exit(1);
-                }
-
-                typedef ggml_backend_dev_t (*ggml_backend_rpc_add_device_t)(const char * endpoint);
-                ggml_backend_rpc_add_device_t ggml_backend_rpc_add_device_fn = (ggml_backend_rpc_add_device_t) ggml_backend_reg_get_proc_address(rpc_reg, "ggml_backend_rpc_add_device");
-                if (!ggml_backend_rpc_add_device_fn) {
-                    fprintf(stderr, "%s: failed to find RPC device add function\n", __func__);
-                    exit(1);
-                }
-
-                static std::vector<ggml_backend_dev_t> devices;
-                devices.clear();
-                // RPC devices should always come first for performance reasons
-                for (const std::string & server : rpc_servers) {
-                    ggml_backend_dev_t dev = ggml_backend_rpc_add_device_fn(server.c_str());
-                    if (dev) {
-                        devices.push_back(dev);
-                    } else {
-                        fprintf(stderr, "%s: failed to add RPC device for server '%s'\n", __func__, server.c_str());
-                        exit(1);
-                    }
-                }
-                // FIXME: use llama.cpp device selection logic
-                // add local GPU devices if any
-                for (size_t i = 0; i < ggml_backend_dev_count(); ++i) {
-                    ggml_backend_dev_t dev = ggml_backend_dev_get(i);
-                    switch (ggml_backend_dev_type(dev)) {
-                        case GGML_BACKEND_DEVICE_TYPE_CPU:
-                        case GGML_BACKEND_DEVICE_TYPE_ACCEL:
-                            // skip CPU backends since they are handled separately
-                            break;
-
-                        case GGML_BACKEND_DEVICE_TYPE_GPU:
-                            devices.push_back(dev);
-                            break;
-
-                        case GGML_BACKEND_DEVICE_TYPE_IGPU:
-                            // iGPUs are not used when there are RPC servers
-                            break;
-                    }
-                }
-                devices.push_back(nullptr);
-                mparams.devices = devices.data();
-            }
+        if (!devices.empty()) {
+            mparams.devices = const_cast<ggml_backend_dev_t *>(devices.data());
         }
         mparams.split_mode = split_mode;
         mparams.main_gpu = main_gpu;
@@ -1029,8 +1119,9 @@ struct cmd_params_instance {
     bool equal_mparams(const cmd_params_instance & other) const {
         return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe &&
-               rpc_servers_str == other.rpc_servers_str && split_mode == other.split_mode &&
+               split_mode == other.split_mode &&
                main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split &&
+               devices == other.devices &&
                vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides);
     }
@@ -1060,9 +1151,9 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
     for (const auto & m : params.model)
     for (const auto & nl : params.n_gpu_layers)
     for (const auto & ncmoe : params.n_cpu_moe)
-    for (const auto & rpc : params.rpc_servers)
     for (const auto & sm : params.split_mode)
     for (const auto & mg : params.main_gpu)
+    for (const auto & devs : params.devices)
     for (const auto & ts : params.tensor_split)
     for (const auto & ot : params.tensor_buft_overrides)
     for (const auto & mmp : params.use_mmap)
@@ -1098,11 +1189,11 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             /* .poll         = */ pl,
             /* .n_gpu_layers = */ nl,
             /* .n_cpu_moe    = */ ncmoe,
-            /* .rpc_servers  = */ rpc,
             /* .split_mode   = */ sm,
             /* .main_gpu     = */ mg,
             /* .no_kv_offload= */ nkvo,
             /* .flash_attn   = */ fa,
+            /* .devices      = */ devs,
             /* .tensor_split = */ ts,
             /* .tensor_buft_overrides = */ ot,
             /* .use_mmap     = */ mmp,
@@ -1131,11 +1222,11 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             /* .poll         = */ pl,
             /* .n_gpu_layers = */ nl,
             /* .n_cpu_moe    = */ ncmoe,
-            /* .rpc_servers  = */ rpc,
             /* .split_mode   = */ sm,
             /* .main_gpu     = */ mg,
             /* .no_kv_offload= */ nkvo,
             /* .flash_attn   = */ fa,
+            /* .devices      = */ devs,
             /* .tensor_split = */ ts,
             /* .tensor_buft_overrides = */ ot,
             /* .use_mmap     = */ mmp,
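Like every other swept parameter, each -dev combo multiplies the test matrix;
a sketch of the fan-out, assuming parse_devices_arg() from this patch and
string_split from common.h (names hypothetical):

    #include <string>
    #include <vector>

    static size_t count_device_sweeps(const std::string & arg) {
        // "CUDA0/CUDA1,none" splits on ',' into two combos; each parsed combo
        // becomes one entry of params.devices, and the loop nest above emits
        // one cmd_params_instance per entry, cross-producted with every other
        // swept parameter (models, -ngl values, and so on).
        std::vector<std::vector<ggml_backend_dev_t>> devices;
        for (const auto & combo : string_split<std::string>(arg, ',')) {
            devices.push_back(parse_devices_arg(combo));
        }
        return devices.size(); // 2 for "CUDA0/CUDA1,none"
    }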
@@ -1164,11 +1255,11 @@ static std::vector<cmd_params_instance> get_cmd_params_instances(const cmd_param
             /* .poll         = */ pl,
             /* .n_gpu_layers = */ nl,
             /* .n_cpu_moe    = */ ncmoe,
-            /* .rpc_servers  = */ rpc,
             /* .split_mode   = */ sm,
             /* .main_gpu     = */ mg,
             /* .no_kv_offload= */ nkvo,
             /* .flash_attn   = */ fa,
+            /* .devices      = */ devs,
             /* .tensor_split = */ ts,
             /* .tensor_buft_overrides = */ ot,
             /* .use_mmap     = */ mmp,
@@ -1206,6 +1297,7 @@ struct test {
     int main_gpu;
     bool no_kv_offload;
     bool flash_attn;
+    std::vector<ggml_backend_dev_t> devices;
     std::vector<float> tensor_split;
     std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
     bool use_mmap;
@@ -1241,6 +1333,7 @@ struct test {
         main_gpu = inst.main_gpu;
         no_kv_offload = inst.no_kv_offload;
         flash_attn = inst.flash_attn;
+        devices = inst.devices;
         tensor_split = inst.tensor_split;
         tensor_buft_overrides = inst.tensor_buft_overrides;
         use_mmap = inst.use_mmap;
@@ -1287,14 +1380,14 @@ struct test {
     static const std::vector<std::string> & get_fields() {
         static const std::vector<std::string> fields = {
-            "build_commit",   "build_number",  "cpu_info",      "gpu_info",       "backends",
-            "model_filename", "model_type",    "model_size",    "model_n_params", "n_batch",
-            "n_ubatch",       "n_threads",     "cpu_mask",      "cpu_strict",     "poll",
-            "type_k",         "type_v",        "n_gpu_layers",  "n_cpu_moe",      "split_mode",
-            "main_gpu",       "no_kv_offload", "flash_attn",    "tensor_split",   "tensor_buft_overrides",
-            "use_mmap",       "embeddings",    "no_op_offload", "n_prompt",       "n_gen",
-            "n_depth",        "test_time",     "avg_ns",        "stddev_ns",      "avg_ts",
-            "stddev_ts"
+            "build_commit",   "build_number",  "cpu_info",      "gpu_info",       "backends",
+            "model_filename", "model_type",    "model_size",    "model_n_params", "n_batch",
+            "n_ubatch",       "n_threads",     "cpu_mask",      "cpu_strict",     "poll",
+            "type_k",         "type_v",        "n_gpu_layers",  "n_cpu_moe",      "split_mode",
+            "main_gpu",       "no_kv_offload", "flash_attn",    "devices",        "tensor_split",
+            "tensor_buft_overrides", "use_mmap", "embeddings",  "no_op_offload",
+            "n_prompt",       "n_gen",         "n_depth",       "test_time",      "avg_ns",
+            "stddev_ns",      "avg_ts",        "stddev_ts"
         };
         return fields;
     }
@@ -1378,6 +1471,7 @@ struct test {
             std::to_string(main_gpu),
             std::to_string(no_kv_offload),
             std::to_string(flash_attn),
+            devices_to_string(devices),
             tensor_split_str,
             tensor_buft_overrides_str,
             std::to_string(use_mmap),
@@ -1559,6 +1653,9 @@ struct markdown_printer : public printer {
         if (field == "flash_attn") {
             return 2;
         }
+        if (field == "devices") {
+            return -12;
+        }
         if (field == "use_mmap") {
             return 4;
         }
@@ -1602,6 +1699,9 @@ struct markdown_printer : public printer {
         if (field == "no_op_offload") {
             return "nopo";
         }
+        if (field == "devices") {
+            return "dev";
+        }
         if (field == "tensor_split") {
             return "ts";
         }
@@ -1661,6 +1761,9 @@ struct markdown_printer : public printer {
         if (params.flash_attn.size() > 1 || params.flash_attn != cmd_params_defaults.flash_attn) {
             fields.emplace_back("flash_attn");
         }
+        if (params.devices.size() > 1 || params.devices != cmd_params_defaults.devices) {
+            fields.emplace_back("devices");
+        }
         if (params.tensor_split.size() > 1 || params.tensor_split != cmd_params_defaults.tensor_split) {
             fields.emplace_back("tensor_split");
         }
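One detail on the -12 column width above: assuming the markdown printer fills
cells with a printf-style "%*s" (as its other integer widths suggest), a
negative width left-justifies, so potentially long device lists render flush
left in the table. A standalone illustration of that printf behavior:

    #include <cstdio>

    int main() {
        // Negative printf field widths left-justify; positive ones right-justify.
        printf("| %*s |\n", -12, "CUDA0/CUDA1"); // hypothetical "dev" cell
        printf("| %*s |\n",  12, "none");        // right-justified, for contrast
        return 0;
    }

The "dev" column itself only appears when -dev was given a non-default value,
mirroring how the other optional columns are enabled above.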