llama : add --n-cpu-moe option (#15077)
author    Diego Devesa <redacted>
          Mon, 4 Aug 2025 23:05:36 +0000 (16:05 -0700)
committer GitHub <redacted>
          Mon, 4 Aug 2025 23:05:36 +0000 (01:05 +0200)
* llama : add --n-cpu-moe option

Keeps the MoE weights of the first N layers on the CPU
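For example (the model path here is hypothetical; the flag, its -ncmoe alias, and the environment variable are the ones added below), the following keeps the expert weights of layers 0-9 on the CPU while offloading everything else:

    llama-cli -m model.gguf -ngl 99 --n-cpu-moe 10

The same setting can be applied through the environment as LLAMA_ARG_N_CPU_MOE=10.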

common/arg.cpp

index a02db0b0a0db6a1af4d61419021f67a6b8a5fb51..013616cc3de8c006fe52500d19a2bca6ca671ccb 100644 (file)
@@ -24,6 +24,7 @@
 #include <cstdarg>
 #include <filesystem>
 #include <fstream>
+#include <list>
 #include <regex>
 #include <set>
 #include <string>
@@ -2375,20 +2376,35 @@ common_params_context common_params_parser_init(common_params & params, llama_ex
                     }
                     throw std::invalid_argument("unknown buffer type");
                 }
-                // FIXME: this leaks memory
-                params.tensor_buft_overrides.push_back({strdup(tensor_name.c_str()), buft_list.at(buffer_type)});
+                // keep strings alive and avoid leaking memory by storing them in a static list
+                static std::list<std::string> buft_overrides;
+                buft_overrides.push_back(tensor_name);
+                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), buft_list.at(buffer_type)});
             }
         }
     ));
     add_opt(common_arg(
-        {"--cpu-moe"},
-        "use CPU for Mixture of Experts (MoE) weights",
+        {"--cpu-moe", "-cmoe"},
+        "keep all Mixture of Experts (MoE) weights in the CPU",
         [](common_params & params) {
-            params.tensor_buft_overrides.push_back({"\\.ffn_up_exps\\.weight$",   ggml_backend_cpu_buffer_type()});
-            params.tensor_buft_overrides.push_back({"\\.ffn_down_exps\\.weight$", ggml_backend_cpu_buffer_type()});
-            params.tensor_buft_overrides.push_back({"\\.ffn_gate_exps\\.weight$", ggml_backend_cpu_buffer_type()});
+            params.tensor_buft_overrides.push_back({"\\.ffn_(up|down|gate)_exps", ggml_backend_cpu_buffer_type()});
         }
     ).set_env("LLAMA_ARG_CPU_MOE"));
+    add_opt(common_arg(
+        {"--n-cpu-moe", "-ncmoe"}, "N",
+        "keep the Mixture of Experts (MoE) weights of the first N layers in the CPU",
+        [](common_params & params, int value) {
+            if (value < 0) {
+                throw std::invalid_argument("invalid value");
+            }
+            for (int i = 0; i < value; ++i) {
+                // keep strings alive and avoid leaking memory by storing them in a static list
+                static std::list<std::string> buft_overrides;
+                buft_overrides.push_back(string_format("blk\\.%d\\.ffn_(up|down|gate)_exps", i));
+                params.tensor_buft_overrides.push_back({buft_overrides.back().c_str(), ggml_backend_cpu_buffer_type()});
+            }
+        }
+    ).set_env("LLAMA_ARG_N_CPU_MOE"));
     add_opt(common_arg(
         {"-ngl", "--gpu-layers", "--n-gpu-layers"}, "N",
         "number of layers to store in VRAM",
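The first hunk replaces the strdup() call (and the FIXME about its leak) with a function-local static std::list<std::string> that owns the pattern strings for the lifetime of the process. A list rather than a vector matters here: std::list::push_back never invalidates references to existing elements, so the c_str() pointers already stored in params.tensor_buft_overrides stay valid as further overrides are added. A minimal standalone sketch of that property (the names below are illustrative, not the commit's code):

    // sketch: pointer stability of std::list elements across push_back
    #include <cstdio>
    #include <list>
    #include <string>
    #include <vector>

    int main() {
        static std::list<std::string> storage;  // plays the role of buft_overrides
        std::vector<const char *> overrides;    // plays the role of params.tensor_buft_overrides

        for (int i = 0; i < 3; ++i) {
            storage.push_back("blk." + std::to_string(i) + ".ffn_up_exps");
            // valid for the element's lifetime, even after later push_back calls
            overrides.push_back(storage.back().c_str());
        }
        for (const char * pat : overrides) {
            printf("%s\n", pat);  // with a std::vector<std::string> these pointers could dangle
        }
    }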
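--cpu-moe (now also reachable as -cmoe) registers a single pattern, \.ffn_(up|down|gate)_exps, in place of the three anchored ones it replaces. Note that the trailing \.weight$ anchor is gone as well, so the pattern matches any tensor whose name contains one of these expert FFN substrings. A standalone check of the pattern against representative tensor names (the names are illustrative of llama.cpp's blk.N.* naming scheme):

    // sketch: what the consolidated --cpu-moe pattern matches
    #include <cstdio>
    #include <regex>

    int main() {
        const std::regex pattern("\\.ffn_(up|down|gate)_exps");
        const char * names[] = {
            "blk.0.ffn_up_exps.weight",   // matches -> kept on the CPU
            "blk.0.ffn_down_exps.weight", // matches
            "blk.0.ffn_gate_exps.weight", // matches
            "blk.0.attn_q.weight",        // no match -> placed as usual
        };
        for (const char * name : names) {
            printf("%-28s %s\n", name, std::regex_search(name, pattern) ? "CPU" : "-");
        }
    }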
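--n-cpu-moe N then simply expands to one such override per layer, keyed to the layer index. For instance, --n-cpu-moe 3 pushes these three patterns, each paired with the CPU buffer type:

    blk\.0\.ffn_(up|down|gate)_exps
    blk\.1\.ffn_(up|down|gate)_exps
    blk\.2\.ffn_(up|down|gate)_exps

Layers 3 and up are left to the normal placement logic (e.g. -ngl).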