From: jacekpoplawski Date: Tue, 16 Sep 2025 14:17:08 +0000 (+0200) Subject: llama-bench: add --n-cpu-moe support (#15952) X-Git-Tag: upstream/0.0.6527~37 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=8ff206097c2bf3ca1c7aa95f9d6db779fc7bdd68;p=pkg%2Fggml%2Fsources%2Fllama.cpp llama-bench: add --n-cpu-moe support (#15952) * llama-bench: add --n-cpu-moe support Support --n-cpu-moe in llama-bench the same way it is supported by llama-server. --- diff --git a/common/arg.cpp b/common/arg.cpp index 19189d8f..9fd8858e 100644 --- a/common/arg.cpp +++ b/common/arg.cpp @@ -2548,7 +2548,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--cpu-moe", "-cmoe"}, "keep all Mixture of Experts (MoE) weights in the CPU", [](common_params & params) { - params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()}); + params.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override()); } ).set_env("LLAMA_ARG_CPU_MOE")); add_opt(common_arg( @@ -2561,7 +2561,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex for (int i = 0; i < value; ++i) { // keep strings alive and avoid leaking memory by storing them in a static vector static std::list buft_overrides; - buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i)); + buft_overrides.push_back(llm_ffn_exps_block_regex(i)); params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()}); } } @@ -2570,7 +2570,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex {"--cpu-moe-draft", "-cmoed"}, "keep all Mixture of Experts (MoE) weights in the CPU for the draft model", [](common_params & params) { - params.speculative.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()}); + params.speculative.tensor_buft_overrides.push_back(llm_ffn_exps_cpu_override()); } ).set_examples({LLAMA_EXAMPLE_SPECULATIVE, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CPU_MOE_DRAFT")); add_opt(common_arg( @@ -2582,7 +2582,7 @@ common_params_context common_params_parser_init(common_params & params, llama_ex } for (int i = 0; i < value; ++i) { static std::list buft_overrides_draft; - buft_overrides_draft.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i)); + buft_overrides_draft.push_back(llm_ffn_exps_block_regex(i)); params.speculative.tensor_buft_overrides.push_back({buft_overrides_draft.back().c_str(), ggml_backend_cpu_buffer_type()}); } } diff --git a/common/common.h b/common/common.h index 5063d73f..83d44dba 100644 --- a/common/common.h +++ b/common/common.h @@ -734,6 +734,20 @@ const char * const LLM_KV_SPLIT_TENSORS_COUNT = "split.tensors.count"; } +// +// MoE utils +// + +const char * const LLM_FFN_EXPS_REGEX = "\\.ffn_(up|down|gate)_exps"; + +static std::string llm_ffn_exps_block_regex(int idx) { + return string_format("blk\\.%d%s", idx, LLM_FFN_EXPS_REGEX); +} + +static llama_model_tensor_buft_override llm_ffn_exps_cpu_override() { + return { LLM_FFN_EXPS_REGEX, ggml_backend_cpu_buffer_type() }; +} + // // training utils // diff --git a/tools/llama-bench/llama-bench.cpp b/tools/llama-bench/llama-bench.cpp index 95f662a2..ad47bf14 100644 --- a/tools/llama-bench/llama-bench.cpp +++ b/tools/llama-bench/llama-bench.cpp @@ -250,6 +250,7 @@ struct cmd_params { std::vector cpu_strict; std::vector poll; std::vector n_gpu_layers; + std::vector n_cpu_moe; std::vector rpc_servers; std::vector split_mode; std::vector main_gpu; @@ -286,6 +287,7 @@ static const cmd_params cmd_params_defaults = { /* cpu_strict */ { false }, /* poll */ { 50 }, /* n_gpu_layers */ { 99 }, + /* n_cpu_moe */ { 0 }, /* rpc_servers */ { "" }, /* split_mode */ { LLAMA_SPLIT_MODE_LAYER }, /* main_gpu */ { 0 }, @@ -353,6 +355,8 @@ static void print_usage(int /* argc */, char ** argv) { printf(" --poll <0...100> (default: %s)\n", join(cmd_params_defaults.poll, ",").c_str()); printf(" -ngl, --n-gpu-layers (default: %s)\n", join(cmd_params_defaults.n_gpu_layers, ",").c_str()); + printf(" -ncmoe, --n-cpu-moe (default: %s)\n", + join(cmd_params_defaults.n_cpu_moe, ",").c_str()); if (llama_supports_rpc()) { printf(" -rpc, --rpc (default: %s)\n", join(cmd_params_defaults.rpc_servers, ",").c_str()); @@ -564,6 +568,13 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { } auto p = parse_int_range(argv[i]); params.n_gpu_layers.insert(params.n_gpu_layers.end(), p.begin(), p.end()); + } else if (arg == "-ncmoe" || arg == "--n-cpu-moe") { + if (++i >= argc) { + invalid_param = true; + break; + } + auto p = parse_int_range(argv[i]); + params.n_cpu_moe.insert(params.n_cpu_moe.end(), p.begin(), p.end()); } else if (llama_supports_rpc() && (arg == "-rpc" || arg == "--rpc")) { if (++i >= argc) { invalid_param = true; @@ -841,6 +852,9 @@ static cmd_params parse_cmd_params(int argc, char ** argv) { if (params.n_gpu_layers.empty()) { params.n_gpu_layers = cmd_params_defaults.n_gpu_layers; } + if (params.n_cpu_moe.empty()) { + params.n_cpu_moe = cmd_params_defaults.n_cpu_moe; + } if (params.rpc_servers.empty()) { params.rpc_servers = cmd_params_defaults.rpc_servers; } @@ -901,6 +915,7 @@ struct cmd_params_instance { bool cpu_strict; int poll; int n_gpu_layers; + int n_cpu_moe; std::string rpc_servers_str; llama_split_mode split_mode; int main_gpu; @@ -973,20 +988,50 @@ struct cmd_params_instance { mparams.tensor_split = tensor_split.data(); mparams.use_mmap = use_mmap; - if (tensor_buft_overrides.empty()) { - mparams.tensor_buft_overrides = nullptr; + if (n_cpu_moe <= 0) { + if (tensor_buft_overrides.empty()) { + mparams.tensor_buft_overrides = nullptr; + } else { + GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && + "Tensor buffer overrides not terminated with empty pattern"); + mparams.tensor_buft_overrides = tensor_buft_overrides.data(); + } } else { - GGML_ASSERT(tensor_buft_overrides.back().pattern == nullptr && "Tensor buffer overrides not terminated with empty pattern"); - mparams.tensor_buft_overrides = tensor_buft_overrides.data(); + static std::vector merged; + static std::vector patterns; + + merged.clear(); + patterns.clear(); + + auto first = tensor_buft_overrides.begin(); + auto last = tensor_buft_overrides.end(); + if (first != last && (last - 1)->pattern == nullptr) { + --last; + } + merged.insert(merged.end(), first, last); + + patterns.reserve((size_t) n_cpu_moe); + merged.reserve(merged.size() + (size_t) n_cpu_moe + 1); + + for (int i = 0; i < n_cpu_moe; ++i) { + patterns.push_back(llm_ffn_exps_block_regex(i)); + merged.push_back({ patterns.back().c_str(), + ggml_backend_cpu_buffer_type() }); + } + + merged.push_back({ nullptr, nullptr }); + + mparams.tensor_buft_overrides = merged.data(); } return mparams; } bool equal_mparams(const cmd_params_instance & other) const { - return model == other.model && n_gpu_layers == other.n_gpu_layers && rpc_servers_str == other.rpc_servers_str && - split_mode == other.split_mode && main_gpu == other.main_gpu && use_mmap == other.use_mmap && - tensor_split == other.tensor_split && vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides); + return model == other.model && n_gpu_layers == other.n_gpu_layers && n_cpu_moe == other.n_cpu_moe && + rpc_servers_str == other.rpc_servers_str && split_mode == other.split_mode && + main_gpu == other.main_gpu && use_mmap == other.use_mmap && tensor_split == other.tensor_split && + vec_tensor_buft_override_equal(tensor_buft_overrides, other.tensor_buft_overrides); } llama_context_params to_llama_cparams() const { @@ -1014,6 +1059,7 @@ static std::vector get_cmd_params_instances(const cmd_param // clang-format off for (const auto & m : params.model) for (const auto & nl : params.n_gpu_layers) + for (const auto & ncmoe : params.n_cpu_moe) for (const auto & rpc : params.rpc_servers) for (const auto & sm : params.split_mode) for (const auto & mg : params.main_gpu) @@ -1051,6 +1097,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .cpu_strict = */ cs, /* .poll = */ pl, /* .n_gpu_layers = */ nl, + /* .n_cpu_moe = */ ncmoe, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, /* .main_gpu = */ mg, @@ -1083,6 +1130,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .cpu_strict = */ cs, /* .poll = */ pl, /* .n_gpu_layers = */ nl, + /* .n_cpu_moe = */ ncmoe, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, /* .main_gpu = */ mg, @@ -1115,6 +1163,7 @@ static std::vector get_cmd_params_instances(const cmd_param /* .cpu_strict = */ cs, /* .poll = */ pl, /* .n_gpu_layers = */ nl, + /* .n_cpu_moe = */ ncmoe, /* .rpc_servers = */ rpc, /* .split_mode = */ sm, /* .main_gpu = */ mg, @@ -1152,6 +1201,7 @@ struct test { ggml_type type_k; ggml_type type_v; int n_gpu_layers; + int n_cpu_moe; llama_split_mode split_mode; int main_gpu; bool no_kv_offload; @@ -1186,6 +1236,7 @@ struct test { type_k = inst.type_k; type_v = inst.type_v; n_gpu_layers = inst.n_gpu_layers; + n_cpu_moe = inst.n_cpu_moe; split_mode = inst.split_mode; main_gpu = inst.main_gpu; no_kv_offload = inst.no_kv_offload; @@ -1236,12 +1287,14 @@ struct test { static const std::vector & get_fields() { static const std::vector fields = { - "build_commit", "build_number", "cpu_info", "gpu_info", "backends", "model_filename", - "model_type", "model_size", "model_n_params", "n_batch", "n_ubatch", "n_threads", - "cpu_mask", "cpu_strict", "poll", "type_k", "type_v", "n_gpu_layers", - "split_mode", "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", - "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", "n_depth", "test_time", - "avg_ns", "stddev_ns", "avg_ts", "stddev_ts", + "build_commit", "build_number", "cpu_info", "gpu_info", "backends", + "model_filename", "model_type", "model_size", "model_n_params", "n_batch", + "n_ubatch", "n_threads", "cpu_mask", "cpu_strict", "poll", + "type_k", "type_v", "n_gpu_layers", "n_cpu_moe", "split_mode", + "main_gpu", "no_kv_offload", "flash_attn", "tensor_split", "tensor_buft_overrides", + "use_mmap", "embeddings", "no_op_offload", "n_prompt", "n_gen", + "n_depth", "test_time", "avg_ns", "stddev_ns", "avg_ts", + "stddev_ts" }; return fields; } @@ -1251,8 +1304,8 @@ struct test { static field_type get_field_type(const std::string & field) { if (field == "build_number" || field == "n_batch" || field == "n_ubatch" || field == "n_threads" || field == "poll" || field == "model_size" || field == "model_n_params" || field == "n_gpu_layers" || - field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || - field == "avg_ns" || field == "stddev_ns" || field == "no_op_offload") { + field == "main_gpu" || field == "n_prompt" || field == "n_gen" || field == "n_depth" || field == "avg_ns" || + field == "stddev_ns" || field == "no_op_offload" || field == "n_cpu_moe") { return INT; } if (field == "f16_kv" || field == "no_kv_offload" || field == "cpu_strict" || field == "flash_attn" || @@ -1320,6 +1373,7 @@ struct test { ggml_type_name(type_k), ggml_type_name(type_v), std::to_string(n_gpu_layers), + std::to_string(n_cpu_moe), split_mode_str(split_mode), std::to_string(main_gpu), std::to_string(no_kv_offload), @@ -1568,6 +1622,9 @@ struct markdown_printer : public printer { if (!is_cpu_backend) { fields.emplace_back("n_gpu_layers"); } + if (params.n_cpu_moe.size() > 1) { + fields.emplace_back("n_cpu_moe"); + } if (params.n_threads.size() > 1 || params.n_threads != cmd_params_defaults.n_threads || is_cpu_backend) { fields.emplace_back("n_threads"); }