return *this;
}
+common_arg & common_arg::set_preset_only() {
+ is_preset_only = true;
+ return *this;
+}
+
bool common_arg::in_example(enum llama_example ex) {
return examples.find(ex) != examples.end();
}
return ctx_arg;
}
+
+void common_params_add_preset_options(std::vector<common_arg> & args) {
+ // the arguments below are not treated as CLI args; they are only available as preset options
+ args.push_back(common_arg(
+ {"load-on-startup"}, "NAME",
+ "in server router mode, autoload this model on startup",
+ [](common_params &, const std::string &) { /* unused */ }
+ ).set_env(COMMON_ARG_PRESET_LOAD_ON_STARTUP).set_preset_only());
+
+ // args.push_back(common_arg(
+ // {"pin"},
+ // "in server router mode, do not unload this model if models_max is exceeded",
+ // [](common_params &) { /* unused */ }
+ // ).set_preset_only());
+
+ // args.push_back(common_arg(
+ // {"unload-idle-seconds"}, "SECONDS",
+ // "in server router mode, unload models idle for more than this many seconds",
+ // [](common_params &, int) { /* unused */ }
+ // ).set_preset_only());
+}
#include <vector>
#include <cstring>
+// pseudo-env variable to identify preset-only arguments
+#define COMMON_ARG_PRESET_LOAD_ON_STARTUP "__PRESET_LOAD_ON_STARTUP"
+
//
// CLI argument parsing
//
const char * env = nullptr;
std::string help;
bool is_sparam = false; // is current arg a sampling param?
+ bool is_preset_only = false; // is current arg preset-only (not treated as CLI arg)
void (*handler_void) (common_params & params) = nullptr;
void (*handler_string) (common_params & params, const std::string &) = nullptr;
void (*handler_str_str)(common_params & params, const std::string &, const std::string &) = nullptr;
common_arg & set_excludes(std::initializer_list<enum llama_example> excludes);
common_arg & set_env(const char * env);
common_arg & set_sparam();
+ common_arg & set_preset_only();
bool in_example(enum llama_example ex);
bool is_exclude(enum llama_example ex);
bool get_value_from_env(std::string & output) const;
bool common_params_parse(int argc, char ** argv, common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
// parse input arguments from CLI into a map
-// TODO: support repeated args in the future
bool common_params_to_map(int argc, char ** argv, llama_example ex, std::map<common_arg, std::string> & out_map);
+// populate preset-only arguments
+// these arguments are not treated as command-line arguments
+// see: https://github.com/ggml-org/llama.cpp/issues/18163
+void common_params_add_preset_options(std::vector<common_arg> & args);
+
// initialize argument parser context - used by test-arg-parser and preset
common_params_context common_params_parser_init(common_params & params, llama_example ex, void(*print_usage)(int, char **) = nullptr);
}
for (const auto & [opt, value] : options) {
- args.push_back(opt.args.back()); // use the last arg as the main arg
+ if (opt.is_preset_only) {
+ continue; // skip preset-only options (they are not CLI args)
+ }
+
+ // use the last arg as the main arg (i.e. --long-form)
+ args.push_back(opt.args.back());
+
+ // handle value(s)
if (opt.value_hint == nullptr && opt.value_hint_2 == nullptr) {
// flag option, no value
if (common_arg_utils::is_falsey(value)) {
}
common_preset_context::common_preset_context(llama_example ex)
- : ctx_params(common_params_parser_init(default_params, ex)),
- key_to_opt(get_map_key_opt(ctx_params)) {}
+ : ctx_params(common_params_parser_init(default_params, ex)) {
+ common_params_add_preset_options(ctx_params.options);
+ key_to_opt = get_map_key_opt(ctx_params);
+}
common_presets common_preset_context::load_from_ini(const std::string & path, common_preset & global) const {
common_presets out;
for (int ex = 0; ex < LLAMA_EXAMPLE_COUNT; ex++) {
try {
auto ctx_arg = common_params_parser_init(params, (enum llama_example)ex);
+ common_params_add_preset_options(ctx_arg.options);
std::unordered_set<std::string> seen_args;
std::unordered_set<std::string> seen_env_vars;
for (const auto & opt : ctx_arg.options) {
2. **Model-specific options** defined in the preset file (e.g. `[ggml-org/MY-MODEL...]`)
3. **Global options** defined in the preset file (`[*]`)
+There are also options that can only be set in preset files; they are not accepted as command-line arguments:
+- `load-on-startup` (boolean): Controls whether the model loads automatically when the server starts (see the example below)
+
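+For example, a preset file might look like the following. The model name and the `ctx-size` key are illustrative placeholders; `load-on-startup = true` marks the model for automatic loading at startup:
+
+```ini
+[*]
+ctx-size = 4096
+
+[ggml-org/MY-MODEL]
+load-on-startup = true
+```
+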
### Routing requests
Requests are routed according to the requested model name.
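+
+For example, with the OpenAI-compatible chat completions endpoint, the target model is selected via the `model` field of the request body. A minimal sketch (host, port, and model name are illustrative):
+
+```sh
+curl http://localhost:8080/v1/chat/completions \
+    -H "Content-Type: application/json" \
+    -d '{
+        "model": "ggml-org/MY-MODEL",
+        "messages": [{"role": "user", "content": "Hello"}]
+    }'
+```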
SRV_INF(" %c %s\n", has_custom ? '*' : ' ', name.c_str());
}
}
+
+ // load any models marked to load on startup
+ std::vector<std::string> models_to_load;
+ for (const auto & [name, inst] : mapping) {
+ std::string val;
+ if (inst.meta.preset.get_option(COMMON_ARG_PRESET_LOAD_ON_STARTUP, val) && !common_arg_utils::is_falsey(val)) {
+ models_to_load.push_back(name);
+ }
+ }
+ if ((int)models_to_load.size() > base_params.models_max) {
+ throw std::runtime_error(string_format(
+ "number of models to load on startup (%zu) exceeds models_max (%d)",
+ models_to_load.size(),
+ base_params.models_max
+ ));
+ }
+ for (const auto & name : models_to_load) {
+ SRV_INF("(startup) loading model %s\n", name.c_str());
+ load(name);
+ }
}
void server_models::update_meta(const std::string & name, const server_model_meta & meta) {
void load_models();
- // check if a model instance exists
+ // check if a model instance exists (thread-safe)
bool has_model(const std::string & name);
- // return a copy of model metadata
+ // return a copy of model metadata (thread-safe)
std::optional<server_model_meta> get_meta(const std::string & name);
- // return a copy of all model metadata
+ // return a copy of all model metadata (thread-safe)
std::vector<server_model_meta> get_all_meta();
+ // load and unload model instances (thread-safe)
void load(const std::string & name);
void unload(const std::string & name);
void unload_all();
- // update the status of a model instance
+ // update the status of a model instance (thread-safe)
void update_status(const std::string & name, server_model_status status);
- // wait until the model instance is fully loaded
+ // wait until the model instance is fully loaded (thread-safe)
// return when the model is loaded or failed to load
void wait_until_loaded(const std::string & name);
- // load the model if not loaded, otherwise do nothing
+ // load the model if not loaded, otherwise do nothing (thread-safe)
// return false if model is already loaded; return true otherwise (meta may need to be refreshed)
bool ensure_model_loaded(const std::string & name);