From: Diego Devesa
Date: Thu, 31 Jul 2025 18:15:41 +0000 (-0700)
Subject: llama : add simple option to enable CPU for MoE weights (--cpu-moe) (#14992)
X-Git-Tag: upstream/0.0.6073~22
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=a06ed5feaec9f935fbf662035b2673167bc88460;p=pkg%2Fggml%2Fsources%2Fllama.cpp

llama : add simple option to enable CPU for MoE weights (--cpu-moe) (#14992)
---

diff --git a/common/arg.cpp b/common/arg.cpp
index 0a4a15e7..cd853119 100644
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2380,6 +2380,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
             }
         }
     ));
+    add_opt(common_arg(
+        {"--cpu-moe"},
+        "use CPU for Mixture of Experts (MoE) weights",
+        [](common_params & params) {
+            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+        }
+    ).set_env("LLAMA_ARG_CPU_MOE"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
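
The three regex patterns added above select the per-expert FFN weight tensors so they are placed in CPU buffers, while the rest of the model can still be offloaded with -ngl. Below is a minimal standalone sketch of that pattern matching, using illustrative tensor names in the blk.N.* style of GGUF models; it is not the llama.cpp loader code itself, only an example of which names the --cpu-moe patterns catch.

// Illustrative sketch only: shows which tensor names the --cpu-moe regex
// patterns match. Tensor names below are hypothetical examples.
#include <cstdio>
#include <regex>
#include <string>
#include <vector>

int main() {
    // Patterns copied from the --cpu-moe option added in this commit.
    const std::vector<std::string> cpu_moe_patterns = {
        "\\.ffn_up_exps\\.weight$",
        "\\.ffn_down_exps\\.weight$",
        "\\.ffn_gate_exps\\.weight$",
    };

    // Hypothetical tensor names in the style used by MoE models.
    const std::vector<std::string> tensor_names = {
        "blk.0.ffn_up_exps.weight",    // expert weights -> matched, kept on CPU
        "blk.0.ffn_down_exps.weight",  // expert weights -> matched, kept on CPU
        "blk.0.ffn_gate_exps.weight",  // expert weights -> matched, kept on CPU
        "blk.0.attn_q.weight",         // attention weights -> not matched, offloaded as usual
    };

    for (const auto & name : tensor_names) {
        bool on_cpu = false;
        for (const auto & pat : cpu_moe_patterns) {
            if (std::regex_search(name, std::regex(pat))) {
                on_cpu = true;
                break;
            }
        }
        printf("%-28s -> %s\n", name.c_str(), on_cpu ? "CPU buffer" : "default (e.g. GPU) buffer");
    }
    return 0;
}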