bool common_arg::get_value_from_env(std::string & output) const {
if (env == nullptr) return false;
+ if (!args_neg.empty()) {
+ // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+ std::string neg_env = env;
+ string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+ char * neg_value = std::getenv(neg_env.c_str());
+ if (neg_value) {
+ output = "0"; // falsey
+ return true;
+ }
+ }
char * value = std::getenv(env);
if (value) {
output = value;
}
bool common_arg::has_value_from_env() const {
+ if (env != nullptr && !args_neg.empty()) {
+ // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+ std::string neg_env = env;
+ string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+ if (std::getenv(neg_env.c_str())) {
+ return true;
+ }
+ }
return env != nullptr && std::getenv(env);
}
std::string leading_spaces(n_leading_spaces, ' ');
std::ostringstream ss;
- for (const auto arg : args) {
- if (arg == args.front()) {
- if (args.size() == 1) {
+ auto all_args = get_args(); // also contains args_neg
+ for (const auto & arg : all_args) {
+ if (arg == all_args.front()) {
+ if (all_args.size() == 1) {
ss << arg;
} else {
// first arg is usually abbreviation, we need padding to make it more beautiful
ss << tmp << spaces;
}
} else {
- ss << arg << (arg != args.back() ? ", " : "");
+ ss << arg << (arg != all_args.back() ? ", " : "");
}
}
if (value_hint) ss << " " << value_hint;
return ss.str();
}
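+// list all CLI flags for this option: the positive args first, followed by the negative (args_neg) variants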
+std::vector<std::string> common_arg::get_args() const {
+ std::vector<std::string> result;
+ for (const auto & arg : args) {
+ result.push_back(std::string(arg));
+ }
+ for (const auto & arg : args_neg) {
+ result.push_back(std::string(arg));
+ }
+ return result;
+}
+
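+// list the environment variable names for this option, including the legacy LLAMA_ARG_NO_* variant when negative args exist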
+std::vector<std::string> common_arg::get_env() const {
+ std::vector<std::string> result;
+ if (env) {
+ result.push_back(std::string(env));
+ }
+ if (!args_neg.empty() && env) {
+ // for compatibility, we need to add LLAMA_ARG_NO_ variant
+ std::string neg_env = env;
+ string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+ result.push_back(neg_env);
+ }
+ return result;
+}
+
//
// utils
//
return msg.str();
}
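+// convert a user-supplied string to a boolean, accepting the same values as is_truthy()/is_falsey(); throws on anything else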
+static bool parse_bool_value(const std::string & value) {
+ if (is_truthy(value)) {
+ return true;
+ } else if (is_falsey(value)) {
+ return false;
+ } else {
+ throw std::invalid_argument("invalid boolean value");
+ }
+}
+
//
// CLI argument parsing functions
//
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
common_params & params = ctx_arg.params;
- std::unordered_map<std::string, common_arg *> arg_to_options;
+ std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
for (auto & opt : ctx_arg.options) {
for (const auto & arg : opt.args) {
- arg_to_options[arg] = &opt;
+ arg_to_options[arg] = {&opt, /* is_positive */ true};
+ }
+ for (const auto & arg : opt.args_neg) {
+ arg_to_options[arg] = {&opt, /* is_positive */ false};
}
}
std::string value;
if (opt.get_value_from_env(value)) {
try {
- if (opt.handler_void && (value == "1" || value == "true")) {
+ if (opt.handler_void && is_truthy(value)) {
opt.handler_void(params);
}
if (opt.handler_int) {
opt.handler_int(params, std::stoi(value));
}
+ if (opt.handler_bool) {
+ opt.handler_bool(params, parse_bool_value(value));
+ }
if (opt.handler_string) {
opt.handler_string(params, value);
continue;
if (arg_to_options.find(arg) == arg_to_options.end()) {
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
}
- auto opt = *arg_to_options[arg];
+ auto & tmp = arg_to_options[arg];
+ auto opt = *tmp.first;
+ bool is_positive = tmp.second;
if (opt.has_value_from_env()) {
fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
}
opt.handler_void(params);
continue;
}
+ if (opt.handler_bool) {
+ opt.handler_bool(params, is_positive);
+ continue;
+ }
// arg with single value
check_arg(i);
throw std::invalid_argument(string_format(
"error while handling argument \"%s\": %s\n\n"
"usage:\n%s\n\nto show complete usage, run with -h",
- arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
+ arg.c_str(), e.what(), opt.to_string().c_str()));
}
}
}
bool common_arg_utils::is_truthy(const std::string & value) {
- return value == "on" || value == "enabled" || value == "1";
+ return value == "on" || value == "enabled" || value == "true" || value == "1";
}
bool common_arg_utils::is_falsey(const std::string & value) {
- return value == "off" || value == "disabled" || value == "0";
+ return value == "off" || value == "disabled" || value == "false" || value == "0";
}
bool common_arg_utils::is_autoy(const std::string & value) {
}
));
add_opt(common_arg(
+ {"--display-prompt"},
{"--no-display-prompt"},
- string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
- [](common_params & params) {
- params.display_prompt = false;
+ string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.display_prompt = value;
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
params.kv_unified = true;
}
).set_env("LLAMA_ARG_KV_UNIFIED"));
- add_opt(common_arg(
- {"--no-context-shift"},
- string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
- [](common_params & params) {
- params.ctx_shift = false;
- }
- ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
add_opt(common_arg(
{"--context-shift"},
- string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
- [](common_params & params) {
- params.ctx_shift = true;
+ {"--no-context-shift"},
+ string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.ctx_shift = value;
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
add_opt(common_arg(
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
add_opt(common_arg(
+ {"--perf"},
{"--no-perf"},
- string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
- [](common_params & params) {
- params.no_perf = true;
- params.sampling.no_perf = true;
+ string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.no_perf = !value;
+ params.sampling.no_perf = !value;
}
- ).set_env("LLAMA_ARG_NO_PERF"));
+ ).set_env("LLAMA_ARG_PERF"));
add_opt(common_arg(
+ {"--show-timings"},
{"--no-show-timings"},
- string_format("disable timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
- [](common_params & params) {
- params.show_timings = false;
+ string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.show_timings = value;
}
- ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_NO_SHOW_TIMINGS"));
+ ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
add_opt(common_arg(
{"-f", "--file"}, "FNAME",
"a file containing the prompt (default: none)",
).set_excludes({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-e", "--escape"},
- string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
- [](common_params & params) {
- params.escape = true;
- }
- ));
- add_opt(common_arg(
{"--no-escape"},
- "do not process escape sequences",
- [](common_params & params) {
- params.escape = false;
+ string_format("whether to process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.escape = value;
}
));
add_opt(common_arg(
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-cnv", "--conversation"},
- "run in conversation mode:\n"
+ {"-no-cnv", "--no-conversation"},
+ "whether to run in conversation mode:\n"
"- does not print special tokens and suffix/prefix\n"
"- interactive mode is also enabled\n"
"(default: auto enabled if chat template is available)",
- [](common_params & params) {
- params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
- }
- ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
- add_opt(common_arg(
- {"-no-cnv", "--no-conversation"},
- "force disable conversation mode (default: false)",
- [](common_params & params) {
- params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
+ [](common_params & params, bool value) {
+ params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED;
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
}
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
+ {"--warmup"},
{"--no-warmup"},
- "skip warming up the model with an empty run",
- [](common_params & params) {
- params.warmup = false;
+ string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.warmup = value;
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg(
}
).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
+ {"-kvo", "--kv-offload"},
{"-nkvo", "--no-kv-offload"},
- "disable KV offload",
- [](common_params & params) {
- params.no_kv_offload = true;
+ string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"),
+ [](common_params & params, bool value) {
+ params.no_kv_offload = !value;
}
- ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+ ).set_env("LLAMA_ARG_KV_OFFLOAD"));
add_opt(common_arg(
+ {"--repack"},
{"-nr", "--no-repack"},
- "disable weight repacking",
- [](common_params & params) {
- params.no_extra_bufts = true;
+ string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"),
+ [](common_params & params, bool value) {
+ params.no_extra_bufts = !value;
}
- ).set_env("LLAMA_ARG_NO_REPACK"));
+ ).set_env("LLAMA_ARG_REPACK"));
add_opt(common_arg(
{"--no-host"},
"bypass host buffer allowing extra buffers to be used",
).set_examples({LLAMA_EXAMPLE_PARALLEL}));
add_opt(common_arg(
{"-cb", "--cont-batching"},
- string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
- [](common_params & params) {
- params.cont_batching = true;
- }
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
- add_opt(common_arg(
{"-nocb", "--no-cont-batching"},
- "disable continuous batching",
- [](common_params & params) {
- params.cont_batching = false;
+ string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.cont_batching = value;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
add_opt(common_arg(
{"-mm", "--mmproj"}, "FILE",
"path to a multimodal projector file. see tools/mtmd/README.md\n"
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
add_opt(common_arg(
- {"--no-mmproj"},
- "explicitly disable multimodal projector, useful when using -hf",
- [](common_params & params) {
- params.no_mmproj = true;
+ {"--mmproj-auto"},
+ {"--no-mmproj", "--no-mmproj-auto"},
+ string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? "disabled" : "enabled"),
+ [](common_params & params, bool value) {
+ params.no_mmproj = !value;
}
- ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
add_opt(common_arg(
+ {"--mmproj-offload"},
{"--no-mmproj-offload"},
- "do not offload multimodal projector to GPU",
- [](common_params & params) {
- params.mmproj_use_gpu = false;
+ string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.mmproj_use_gpu = value;
}
- ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
add_opt(common_arg(
{"--image", "--audio"}, "FILE",
"path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
}
).set_env("LLAMA_ARG_MLOCK"));
add_opt(common_arg(
+ {"--mmap"},
{"--no-mmap"},
- "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
- [](common_params & params) {
- params.use_mmap = false;
+ string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.use_mmap = value;
}
- ).set_env("LLAMA_ARG_NO_MMAP"));
+ ).set_env("LLAMA_ARG_MMAP"));
add_opt(common_arg(
{"--numa"}, "TYPE",
"attempt optimizations that help on some NUMA systems\n"
}
));
add_opt(common_arg(
+ {"--op-offload"},
{"--no-op-offload"},
- string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
- [](common_params & params) {
- params.no_op_offload = true;
+ string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"),
+ [](common_params & params, bool value) {
+ params.no_op_offload = !value;
}
));
add_opt(common_arg(
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
+ {"--ppl"},
{"--no-ppl"},
- string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
- [](common_params & params) {
- params.compute_ppl = false;
+ string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.compute_ppl = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
add_opt(common_arg(
+ {"--webui"},
{"--no-webui"},
- string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
- [](common_params & params) {
- params.webui = false;
+ string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.webui = value;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
add_opt(common_arg(
{"--embedding", "--embeddings"},
string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
add_opt(common_arg(
{"--slots"},
- string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
- [](common_params & params) {
- params.endpoint_slots = true;
- }
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
- add_opt(common_arg(
{"--no-slots"},
- "disables slots monitoring endpoint",
- [](common_params & params) {
- params.endpoint_slots = false;
+ string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.endpoint_slots = value;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
add_opt(common_arg(
{"--slot-save-path"}, "PATH",
"path to save slot kv cache (default: disabled)",
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
add_opt(common_arg(
+ {"--models-autoload"},
{"--no-models-autoload"},
- "disables automatic loading of models (default: enabled)",
- [](common_params & params) {
- params.models_autoload = false;
+ string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.models_autoload = value;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
add_opt(common_arg(
{"--jinja"},
- string_format("use jinja template for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
- [](common_params & params) {
- params.use_jinja = true;
- }
- ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
- add_opt(common_arg(
{"--no-jinja"},
- string_format("disable jinja template for chat (default: %s)", params.use_jinja ? "disabled" : "enabled"),
- [](common_params & params) {
- params.use_jinja = false;
+ string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.use_jinja = value;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
add_opt(common_arg(
{"--reasoning-format"}, "FORMAT",
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
add_opt(common_arg(
+ {"--prefill-assistant"},
{"--no-prefill-assistant"},
string_format(
"whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
"when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
),
- [](common_params & params) {
- params.prefill_assistant = false;
+ [](common_params & params, bool value) {
+ params.prefill_assistant = value;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
add_opt(common_arg(
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
| `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) |
| `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) |
-| `--no-perf` | disable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_NO_PERF) |
-| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
-| `--no-escape` | do not process escape sequences |
+| `--perf, --no-perf` | whether to enable internal libllama performance timings (default: true)<br/>(env: LLAMA_ARG_PERF) |
+| `-e, --escape, --no-escape` | whether to process escape sequences (\n, \r, \t, \', \", \\) (default: true) |
| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) |
| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N<br/>(env: LLAMA_ARG_ROPE_SCALE) |
| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) |
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
-| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
-| `-nr, --no-repack` | disable weight repacking<br/>(env: LLAMA_ARG_NO_REPACK) |
-| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
+| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
+| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
+| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
-| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) |
+| `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
| `--list-devices` | print list of available devices and exit |
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
| `--check-tensors` | check model tensor data for invalid values (default: false) |
| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
-| `--no-op-offload` | disable offloading host tensor operations to device (default: false) |
+| `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) |
| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
| -------- | ----------- |
| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
-| `--no-context-shift` | disables context shift on infinite text generation (default: enabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
-| `--context-shift` | enables context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
+| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> |
| `-sp, --special` | special tokens output enabled (default: false) |
-| `--no-warmup` | skip warming up the model with an empty run |
+| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) |
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
-| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
-| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
+| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
-| `--no-mmproj` | explicitly disable multimodal projector, useful when using -hf<br/>(env: LLAMA_ARG_NO_MMPROJ) |
-| `--no-mmproj-offload` | do not offload multimodal projector to GPU<br/>(env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) |
+| `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_AUTO) |
+| `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
| `--override-tensor-draft, -otd <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
| `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )<br/>(env: LLAMA_ARG_API_PREFIX) |
-| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
+| `--webui, --no-webui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_WEBUI) |
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
-| `--slots` | enable slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
-| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
+| `--slots, --no-slots` | whether to expose the slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
+| `--media-path PATH` | directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled) |
| `--models-dir PATH` | directory containing models for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_DIR) |
+| `--models-preset PATH` | path to INI file containing model presets for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_PRESET) |
| `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)<br/>(env: LLAMA_ARG_MODELS_MAX) |
-| `--models-allow-extra-args` | for router server, allow extra arguments for models; important: some arguments can allow users to access local file system, use with caution (default: disabled)<br/>(env: LLAMA_ARG_MODELS_ALLOW_EXTRA_ARGS) |
-| `--no-models-autoload` | disables automatic loading of models (default: enabled)<br/>(env: LLAMA_ARG_NO_MODELS_AUTOLOAD) |
-| `--jinja` | use jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_JINJA) |
-| `--no-jinja` | disable jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_NO_JINJA) |
+| `--models-autoload, --no-models-autoload` | for router server, whether to automatically load models (default: enabled)<br/>(env: LLAMA_ARG_MODELS_AUTOLOAD) |
+| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
-| `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) |
+| `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_PREFILL_ASSISTANT) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled)<br/> |
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
| `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
Note: If both a command line argument and an environment variable are set for the same param, the argument takes precedence over the env var.
+For boolean options such as `--mmap` or `--kv-offload`, the environment variable is interpreted as follows (using `LLAMA_ARG_MMAP` as an example):
+- `LLAMA_ARG_MMAP=true` enables the option; other accepted truthy values are `1`, `on`, and `enabled`
+- `LLAMA_ARG_MMAP=false` disables the option; other accepted falsey values are `0`, `off`, and `disabled`
+- If `LLAMA_ARG_NO_MMAP` is set, the option is disabled regardless of the variable's value (this negative form is kept for backward compatibility)
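+A minimal sketch of how this could look inside the `environment:` section of a service definition (the variables chosen here are only illustrative; a full compose example follows below):
+```yml
+    environment:
+      LLAMA_ARG_MMAP: "false"       # disable mmap, same effect as --no-mmap
+      LLAMA_ARG_KV_OFFLOAD: "on"    # enable KV cache offloading, same effect as --kv-offload
+      LLAMA_ARG_NO_WEBUI: "1"       # legacy negative form, disables the Web UI regardless of value
+```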
+
Example usage of docker compose with environment variables:
```yml