llama : add simple option to enable CPU for MoE weights (--cpu-moe) (#14992)

author Diego Devesa <redacted>

Thu, 31 Jul 2025 18:15:41 +0000 (11:15 -0700)

committer GitHub <redacted>

Thu, 31 Jul 2025 18:15:41 +0000 (20:15 +0200)
author Diego Devesa <redacted>
Thu, 31 Jul 2025 18:15:41 +0000 (11:15 -0700)
committer GitHub <redacted>
Thu, 31 Jul 2025 18:15:41 +0000 (20:15 +0200)
diff --git a/common/arg.cpp b/common/arg.cpp

index 0a4a15e7f40ce13d5e837a4ae05a565a4409a5a5..cd853119131e9cf437b62c625a3c110d11d4d558 100644 (file)
--- a/common/arg.cpp
+++ b/common/arg.cpp
@@ -2380,6 +2380,15 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
              }
          }
      ));
+    add_opt(common_arg(
+        {"--cpu-moe"},
+        "use CPU for Mixture of Experts (MoE) weights",
+        [](common_params & params) {
+            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$",   ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+        }
+    ).set_env("LLAMA_ARG_CPU_MOE"));
      add_opt(common_arg(
          {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
          "number of layers to store in VRAM",
author	Diego Devesa <redacted>
	Thu, 31 Jul 2025 18:15:41 +0000 (11:15 -0700)
committer	GitHub <redacted>
	Thu, 31 Jul 2025 18:15:41 +0000 (20:15 +0200)