bool common_arg::get_value_from_env(std::string & output) const {
if (env == nullptr) return false;
+ if (!args_neg.empty()) {
+ // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+ std::string neg_env = env;
+ string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+ char * neg_value = std::getenv(neg_env.c_str());
+ if (neg_value) {
+ output = "0"; // falsey
+ return true;
+ }
+ }
char * value = std::getenv(env);
if (value) {
output = value;
}
bool common_arg::has_value_from_env() const {
+ if (env != nullptr && !args_neg.empty()) {
+ // for compatibility, we need to check LLAMA_ARG_NO_ env as well
+ std::string neg_env = env;
+ string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+ if (std::getenv(neg_env.c_str())) {
+ return true;
+ }
+ }
return env != nullptr && std::getenv(env);
}
std::string leading_spaces(n_leading_spaces, ' ');
std::ostringstream ss;
- for (const auto arg : args) {
- if (arg == args.front()) {
- if (args.size() == 1) {
+ auto all_args = get_args(); // also contains args_neg
+ for (const auto & arg : all_args) {
+ if (arg == all_args.front()) {
+ if (all_args.size() == 1) {
ss << arg;
} else {
// first arg is usually abbreviation, we need padding to make it more beautiful
ss << tmp << spaces;
}
} else {
- ss << arg << (arg != args.back() ? ", " : "");
+ ss << arg << (arg != all_args.back() ? ", " : "");
}
}
if (value_hint) ss << " " << value_hint;
return ss.str();
}
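+// list all CLI flags for this option: the positive args first, followed by the negative (args_neg) variants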
+std::vector<std::string> common_arg::get_args() const {
+ std::vector<std::string> result;
+ for (const auto & arg : args) {
+ result.push_back(std::string(arg));
+ }
+ for (const auto & arg : args_neg) {
+ result.push_back(std::string(arg));
+ }
+ return result;
+}
+
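+// list the environment variable names for this option, including the legacy LLAMA_ARG_NO_* variant when negative args exist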
+std::vector<std::string> common_arg::get_env() const {
+ std::vector<std::string> result;
+ if (env) {
+ result.push_back(std::string(env));
+ }
+ if (!args_neg.empty() && env) {
+ // for compatibility, we need to add LLAMA_ARG_NO_ variant
+ std::string neg_env = env;
+ string_replace_all(neg_env, "LLAMA_ARG_", "LLAMA_ARG_NO_");
+ result.push_back(neg_env);
+ }
+ return result;
+}
+
//
// utils
//
return msg.str();
}
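+// convert a user-supplied string to a boolean, accepting the same values as is_truthy()/is_falsey(); throws on anything else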
+static bool parse_bool_value(const std::string & value) {
+ if (is_truthy(value)) {
+ return true;
+ } else if (is_falsey(value)) {
+ return false;
+ } else {
+ throw std::invalid_argument("invalid boolean value");
+ }
+}
+
//
// CLI argument parsing functions
//
static bool common_params_parse_ex(int argc, char ** argv, common_params_context & ctx_arg) {
common_params & params = ctx_arg.params;
- std::unordered_map<std::string, common_arg *> arg_to_options;
+ std::unordered_map<std::string, std::pair<common_arg *, bool>> arg_to_options;
for (auto & opt : ctx_arg.options) {
for (const auto & arg : opt.args) {
- arg_to_options[arg] = &opt;
+ arg_to_options[arg] = {&opt, /* is_positive */ true};
+ }
+ for (const auto & arg : opt.args_neg) {
+ arg_to_options[arg] = {&opt, /* is_positive */ false};
}
}
std::string value;
if (opt.get_value_from_env(value)) {
try {
- if (opt.handler_void && (value == "1" || value == "true")) {
+ if (opt.handler_void && is_truthy(value)) {
opt.handler_void(params);
}
if (opt.handler_int) {
opt.handler_int(params, std::stoi(value));
}
+ if (opt.handler_bool) {
+ opt.handler_bool(params, parse_bool_value(value));
+ }
if (opt.handler_string) {
opt.handler_string(params, value);
continue;
if (arg_to_options.find(arg) == arg_to_options.end()) {
throw std::invalid_argument(string_format("error: invalid argument: %s", arg.c_str()));
}
- auto opt = *arg_to_options[arg];
+ auto & tmp = arg_to_options[arg];
+ auto opt = *tmp.first;
+ bool is_positive = tmp.second;
if (opt.has_value_from_env()) {
fprintf(stderr, "warn: %s environment variable is set, but will be overwritten by command line argument %s\n", opt.env, arg.c_str());
}
opt.handler_void(params);
continue;
}
+ if (opt.handler_bool) {
+ opt.handler_bool(params, is_positive);
+ continue;
+ }
// arg with single value
check_arg(i);
throw std::invalid_argument(string_format(
"error while handling argument \"%s\": %s\n\n"
"usage:\n%s\n\nto show complete usage, run with -h",
- arg.c_str(), e.what(), arg_to_options[arg]->to_string().c_str()));
+ arg.c_str(), e.what(), opt.to_string().c_str()));
}
}
}
bool common_arg_utils::is_truthy(const std::string & value) {
- return value == "on" || value == "enabled" || value == "1";
+ return value == "on" || value == "enabled" || value == "true" || value == "1";
}
bool common_arg_utils::is_falsey(const std::string & value) {
- return value == "off" || value == "disabled" || value == "0";
+ return value == "off" || value == "disabled" || value == "false" || value == "0";
}
bool common_arg_utils::is_autoy(const std::string & value) {
}
));
add_opt(common_arg(
+ {"--display-prompt"},
{"--no-display-prompt"},
- string_format("don't print prompt at generation (default: %s)", !params.display_prompt ? "true" : "false"),
- [](common_params & params) {
- params.display_prompt = false;
+ string_format("whether to print prompt at generation (default: %s)", params.display_prompt ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.display_prompt = value;
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
params.kv_unified = true;
}
).set_env("LLAMA_ARG_KV_UNIFIED"));
- add_opt(common_arg(
- {"--no-context-shift"},
- string_format("disables context shift on infinite text generation (default: %s)", params.ctx_shift ? "disabled" : "enabled"),
- [](common_params & params) {
- params.ctx_shift = false;
- }
- ).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_NO_CONTEXT_SHIFT"));
add_opt(common_arg(
{"--context-shift"},
- string_format("enables context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
- [](common_params & params) {
- params.ctx_shift = true;
+ {"--no-context-shift"},
+ string_format("whether to use context shift on infinite text generation (default: %s)", params.ctx_shift ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.ctx_shift = value;
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_IMATRIX, LLAMA_EXAMPLE_PERPLEXITY}).set_env("LLAMA_ARG_CONTEXT_SHIFT"));
add_opt(common_arg(
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_DIFFUSION}));
add_opt(common_arg(
+ {"--perf"},
{"--no-perf"},
- string_format("disable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
- [](common_params & params) {
- params.no_perf = true;
- params.sampling.no_perf = true;
+ string_format("whether to enable internal libllama performance timings (default: %s)", params.no_perf ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.no_perf = !value;
+ params.sampling.no_perf = !value;
}
- ).set_env("LLAMA_ARG_NO_PERF"));
+ ).set_env("LLAMA_ARG_PERF"));
add_opt(common_arg(
+ {"--show-timings"},
{"--no-show-timings"},
- string_format("disable timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
- [](common_params & params) {
- params.show_timings = false;
+ string_format("whether to show timing information after each response (default: %s)", params.show_timings ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.show_timings = value;
}
- ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_NO_SHOW_TIMINGS"));
+ ).set_examples({LLAMA_EXAMPLE_CLI}).set_env("LLAMA_ARG_SHOW_TIMINGS"));
add_opt(common_arg(
{"-f", "--file"}, "FNAME",
"a file containing the prompt (default: none)",
).set_excludes({LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-e", "--escape"},
- string_format("process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
- [](common_params & params) {
- params.escape = true;
- }
- ));
- add_opt(common_arg(
{"--no-escape"},
- "do not process escape sequences",
- [](common_params & params) {
- params.escape = false;
+ string_format("whether to process escapes sequences (\\n, \\r, \\t, \\', \\\", \\\\) (default: %s)", params.escape ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.escape = value;
}
));
add_opt(common_arg(
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}));
add_opt(common_arg(
{"-cnv", "--conversation"},
- "run in conversation mode:\n"
+ {"-no-cnv", "--no-conversation"},
+ "whether to run in conversation mode:\n"
"- does not print special tokens and suffix/prefix\n"
"- interactive mode is also enabled\n"
"(default: auto enabled if chat template is available)",
- [](common_params & params) {
- params.conversation_mode = COMMON_CONVERSATION_MODE_ENABLED;
- }
- ).set_examples({LLAMA_EXAMPLE_COMPLETION}));
- add_opt(common_arg(
- {"-no-cnv", "--no-conversation"},
- "force disable conversation mode (default: false)",
- [](common_params & params) {
- params.conversation_mode = COMMON_CONVERSATION_MODE_DISABLED;
+ [](common_params & params, bool value) {
+ params.conversation_mode = value ? COMMON_CONVERSATION_MODE_ENABLED : COMMON_CONVERSATION_MODE_DISABLED;
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI}));
add_opt(common_arg(
}
).set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
+ {"--warmup"},
{"--no-warmup"},
- "skip warming up the model with an empty run",
- [](common_params & params) {
- params.warmup = false;
+ string_format("whether to perform warmup with an empty run (default: %s)", params.warmup ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.warmup = value;
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_MTMD, LLAMA_EXAMPLE_EMBEDDING, LLAMA_EXAMPLE_RETRIEVAL, LLAMA_EXAMPLE_PERPLEXITY}));
add_opt(common_arg(
}
).set_env("LLAMA_ARG_GRP_ATTN_W").set_examples({LLAMA_EXAMPLE_COMPLETION}));
add_opt(common_arg(
+ {"-kvo", "--kv-offload"},
{"-nkvo", "--no-kv-offload"},
- "disable KV offload",
- [](common_params & params) {
- params.no_kv_offload = true;
+ string_format("whether to enable KV cache offloading (default: %s)", params.no_kv_offload ? "disabled" : "enabled"),
+ [](common_params & params, bool value) {
+ params.no_kv_offload = !value;
}
- ).set_env("LLAMA_ARG_NO_KV_OFFLOAD"));
+ ).set_env("LLAMA_ARG_KV_OFFLOAD"));
add_opt(common_arg(
+ {"--repack"},
{"-nr", "--no-repack"},
- "disable weight repacking",
- [](common_params & params) {
- params.no_extra_bufts = true;
+ string_format("whether to enable weight repacking (default: %s)", params.no_extra_bufts ? "disabled" : "enabled"),
+ [](common_params & params, bool value) {
+ params.no_extra_bufts = !value;
}
- ).set_env("LLAMA_ARG_NO_REPACK"));
+ ).set_env("LLAMA_ARG_REPACK"));
add_opt(common_arg(
{"--no-host"},
"bypass host buffer allowing extra buffers to be used",
).set_examples({LLAMA_EXAMPLE_PARALLEL}));
add_opt(common_arg(
{"-cb", "--cont-batching"},
- string_format("enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
- [](common_params & params) {
- params.cont_batching = true;
- }
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
- add_opt(common_arg(
{"-nocb", "--no-cont-batching"},
- "disable continuous batching",
- [](common_params & params) {
- params.cont_batching = false;
+ string_format("whether to enable continuous batching (a.k.a dynamic batching) (default: %s)", params.cont_batching ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.cont_batching = value;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_CONT_BATCHING"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CONT_BATCHING"));
add_opt(common_arg(
{"-mm", "--mmproj"}, "FILE",
"path to a multimodal projector file. see tools/mtmd/README.md\n"
}
).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_URL"));
add_opt(common_arg(
- {"--no-mmproj"},
- "explicitly disable multimodal projector, useful when using -hf",
- [](common_params & params) {
- params.no_mmproj = true;
+ {"--mmproj-auto"},
+ {"--no-mmproj", "--no-mmproj-auto"},
+ string_format("whether to use multimodal projector file (if available), useful when using -hf (default: %s)", params.no_mmproj ? "disabled" : "enabled"),
+ [](common_params & params, bool value) {
+ params.no_mmproj = !value;
}
- ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ"));
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_AUTO"));
add_opt(common_arg(
+ {"--mmproj-offload"},
{"--no-mmproj-offload"},
- "do not offload multimodal projector to GPU",
- [](common_params & params) {
- params.mmproj_use_gpu = false;
+ string_format("whether to enable GPU offloading for multimodal projector (default: %s)", params.mmproj_use_gpu ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.mmproj_use_gpu = value;
}
- ).set_examples(mmproj_examples).set_env("LLAMA_ARG_NO_MMPROJ_OFFLOAD"));
+ ).set_examples(mmproj_examples).set_env("LLAMA_ARG_MMPROJ_OFFLOAD"));
add_opt(common_arg(
{"--image", "--audio"}, "FILE",
"path to an image or audio file. use with multimodal models, can be repeated if you have multiple files\n",
}
).set_env("LLAMA_ARG_MLOCK"));
add_opt(common_arg(
+ {"--mmap"},
{"--no-mmap"},
- "do not memory-map model (slower load but may reduce pageouts if not using mlock)",
- [](common_params & params) {
- params.use_mmap = false;
+ string_format("whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: %s)", params.use_mmap ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.use_mmap = value;
}
- ).set_env("LLAMA_ARG_NO_MMAP"));
+ ).set_env("LLAMA_ARG_MMAP"));
add_opt(common_arg(
{"--numa"}, "TYPE",
"attempt optimizations that help on some NUMA systems\n"
}
));
add_opt(common_arg(
+ {"--op-offload"},
{"--no-op-offload"},
- string_format("disable offloading host tensor operations to device (default: %s)", params.no_op_offload ? "true" : "false"),
- [](common_params & params) {
- params.no_op_offload = true;
+ string_format("whether to offload host tensor operations to device (default: %s)", params.no_op_offload ? "false" : "true"),
+ [](common_params & params, bool value) {
+ params.no_op_offload = !value;
}
));
add_opt(common_arg(
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
+ {"--ppl"},
{"--no-ppl"},
- string_format("do not compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
- [](common_params & params) {
- params.compute_ppl = false;
+ string_format("whether to compute perplexity (default: %s)", params.compute_ppl ? "true" : "false"),
+ [](common_params & params, bool value) {
+ params.compute_ppl = value;
}
).set_examples({LLAMA_EXAMPLE_IMATRIX}));
add_opt(common_arg(
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_API_PREFIX"));
add_opt(common_arg(
+ {"--webui"},
{"--no-webui"},
- string_format("Disable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
- [](common_params & params) {
- params.webui = false;
+ string_format("whether to enable the Web UI (default: %s)", params.webui ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.webui = value;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_WEBUI"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_WEBUI"));
add_opt(common_arg(
{"--embedding", "--embeddings"},
string_format("restrict to only support embedding use case; use only with dedicated embedding models (default: %s)", params.embedding ? "enabled" : "disabled"),
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_PROPS"));
add_opt(common_arg(
{"--slots"},
- string_format("enable slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
- [](common_params & params) {
- params.endpoint_slots = true;
- }
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
- add_opt(common_arg(
{"--no-slots"},
- "disables slots monitoring endpoint",
- [](common_params & params) {
- params.endpoint_slots = false;
+ string_format("expose slots monitoring endpoint (default: %s)", params.endpoint_slots ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.endpoint_slots = value;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_ENDPOINT_SLOTS"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_ENDPOINT_SLOTS"));
add_opt(common_arg(
{"--slot-save-path"}, "PATH",
"path to save slot kv cache (default: disabled)",
}
).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_MAX"));
add_opt(common_arg(
+ {"--models-autoload"},
{"--no-models-autoload"},
- "disables automatic loading of models (default: enabled)",
- [](common_params & params) {
- params.models_autoload = false;
+ string_format("for router server, whether to automatically load models (default: %s)", params.models_autoload ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.models_autoload = value;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_MODELS_AUTOLOAD"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_MODELS_AUTOLOAD"));
add_opt(common_arg(
{"--jinja"},
- string_format("use jinja template for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
- [](common_params & params) {
- params.use_jinja = true;
- }
- ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
- add_opt(common_arg(
{"--no-jinja"},
- string_format("disable jinja template for chat (default: %s)", params.use_jinja ? "disabled" : "enabled"),
- [](common_params & params) {
- params.use_jinja = false;
+ string_format("whether to use jinja template engine for chat (default: %s)", params.use_jinja ? "enabled" : "disabled"),
+ [](common_params & params, bool value) {
+ params.use_jinja = value;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_NO_JINJA"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER, LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_MTMD}).set_env("LLAMA_ARG_JINJA"));
add_opt(common_arg(
{"--reasoning-format"}, "FORMAT",
"controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:\n"
}
).set_examples({LLAMA_EXAMPLE_COMPLETION, LLAMA_EXAMPLE_CLI, LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_CHAT_TEMPLATE_FILE"));
add_opt(common_arg(
+ {"--prefill-assistant"},
{"--no-prefill-assistant"},
string_format(
"whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)\n"
"when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled\n"
),
- [](common_params & params) {
- params.prefill_assistant = false;
+ [](common_params & params, bool value) {
+ params.prefill_assistant = value;
}
- ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_NO_PREFILL_ASSISTANT"));
+ ).set_examples({LLAMA_EXAMPLE_SERVER}).set_env("LLAMA_ARG_PREFILL_ASSISTANT"));
add_opt(common_arg(
{"-sps", "--slot-prompt-similarity"}, "SIMILARITY",
string_format("how much the prompt of a request must match the prompt of a slot in order to use that slot (default: %.2f, 0.0 = disabled)\n", params.slot_prompt_similarity),
| `--swa-full` | use full-size SWA cache (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/13194#issuecomment-2868343055)<br/>(env: LLAMA_ARG_SWA_FULL) |
| `--kv-unified, -kvu` | use single unified KV buffer for the KV cache of all sequences (default: false)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/14363)<br/>(env: LLAMA_ARG_KV_UNIFIED) |
| `-fa, --flash-attn [on\|off\|auto]` | set Flash Attention use ('on', 'off', or 'auto', default: 'auto')<br/>(env: LLAMA_ARG_FLASH_ATTN) |
-| `--no-perf` | disable internal libllama performance timings (default: false)<br/>(env: LLAMA_ARG_NO_PERF) |
-| `-e, --escape` | process escapes sequences (\n, \r, \t, \', \", \\) (default: true) |
-| `--no-escape` | do not process escape sequences |
+| `--perf, --no-perf` | whether to enable internal libllama performance timings (default: true)<br/>(env: LLAMA_ARG_PERF) |
+| `-e, --escape, --no-escape` | whether to process escape sequences (\n, \r, \t, \', \", \\) (default: true) |
| `--rope-scaling {none,linear,yarn}` | RoPE frequency scaling method, defaults to linear unless specified by the model<br/>(env: LLAMA_ARG_ROPE_SCALING_TYPE) |
| `--rope-scale N` | RoPE context scaling factor, expands context by a factor of N<br/>(env: LLAMA_ARG_ROPE_SCALE) |
| `--rope-freq-base N` | RoPE base frequency, used by NTK-aware scaling (default: loaded from model)<br/>(env: LLAMA_ARG_ROPE_FREQ_BASE) |
| `--yarn-attn-factor N` | YaRN: scale sqrt(t) or attention magnitude (default: -1.0)<br/>(env: LLAMA_ARG_YARN_ATTN_FACTOR) |
| `--yarn-beta-slow N` | YaRN: high correction dim or alpha (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_SLOW) |
| `--yarn-beta-fast N` | YaRN: low correction dim or beta (default: -1.0)<br/>(env: LLAMA_ARG_YARN_BETA_FAST) |
-| `-nkvo, --no-kv-offload` | disable KV offload<br/>(env: LLAMA_ARG_NO_KV_OFFLOAD) |
-| `-nr, --no-repack` | disable weight repacking<br/>(env: LLAMA_ARG_NO_REPACK) |
-| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
+| `-kvo, --kv-offload, -nkvo, --no-kv-offload` | whether to enable KV cache offloading (default: enabled)<br/>(env: LLAMA_ARG_KV_OFFLOAD) |
+| `--repack, -nr, --no-repack` | whether to enable weight repacking (default: enabled)<br/>(env: LLAMA_ARG_REPACK) |
+| `--no-host` | bypass host buffer allowing extra buffers to be used<br/>(env: LLAMA_ARG_NO_HOST) |
| `-ctk, --cache-type-k TYPE` | KV cache data type for K<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_K) |
| `-ctv, --cache-type-v TYPE` | KV cache data type for V<br/>allowed values: f32, f16, bf16, q8_0, q4_0, q4_1, iq4_nl, q5_0, q5_1<br/>(default: f16)<br/>(env: LLAMA_ARG_CACHE_TYPE_V) |
| `-dt, --defrag-thold N` | KV cache defragmentation threshold (DEPRECATED)<br/>(env: LLAMA_ARG_DEFRAG_THOLD) |
| `-np, --parallel N` | number of parallel sequences to decode (default: 1)<br/>(env: LLAMA_ARG_N_PARALLEL) |
| `--mlock` | force system to keep model in RAM rather than swapping or compressing<br/>(env: LLAMA_ARG_MLOCK) |
-| `--no-mmap` | do not memory-map model (slower load but may reduce pageouts if not using mlock)<br/>(env: LLAMA_ARG_NO_MMAP) |
+| `--mmap, --no-mmap` | whether to memory-map model (if disabled, slower load but may reduce pageouts if not using mlock) (default: enabled)<br/>(env: LLAMA_ARG_MMAP) |
| `--numa TYPE` | attempt optimizations that help on some NUMA systems<br/>- distribute: spread execution evenly over all nodes<br/>- isolate: only spawn threads on CPUs on the node that execution started on<br/>- numactl: use the CPU map provided by numactl<br/>if run without this previously, it is recommended to drop the system page cache before using this<br/>see https://github.com/ggml-org/llama.cpp/issues/1437<br/>(env: LLAMA_ARG_NUMA) |
| `-dev, --device <dev1,dev2,..>` | comma-separated list of devices to use for offloading (none = don't offload)<br/>use --list-devices to see a list of available devices<br/>(env: LLAMA_ARG_DEVICE) |
| `--list-devices` | print list of available devices and exit |
| `-mg, --main-gpu INDEX` | the GPU to use for the model (with split-mode = none), or for intermediate results and KV (with split-mode = row) (default: 0)<br/>(env: LLAMA_ARG_MAIN_GPU) |
| `--check-tensors` | check model tensor data for invalid values (default: false) |
| `--override-kv KEY=TYPE:VALUE` | advanced option to override model metadata by key. may be specified multiple times.<br/>types: int, float, bool, str. example: --override-kv tokenizer.ggml.add_bos_token=bool:false |
-| `--no-op-offload` | disable offloading host tensor operations to device (default: false) |
+| `--op-offload, --no-op-offload` | whether to offload host tensor operations to device (default: true) |
| `--lora FNAME` | path to LoRA adapter (can be repeated to use multiple adapters) |
| `--lora-scaled FNAME SCALE` | path to LoRA adapter with user defined scaling (can be repeated to use multiple adapters) |
| `--control-vector FNAME` | add a control vector<br/>note: this argument can be repeated to add multiple control vectors |
| -------- | ----------- |
| `--ctx-checkpoints, --swa-checkpoints N` | max number of context checkpoints to create per slot (default: 8)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/15293)<br/>(env: LLAMA_ARG_CTX_CHECKPOINTS) |
| `--cache-ram, -cram N` | set the maximum cache size in MiB (default: 8192, -1 - no limit, 0 - disable)<br/>[(more info)](https://github.com/ggml-org/llama.cpp/pull/16391)<br/>(env: LLAMA_ARG_CACHE_RAM) |
-| `--no-context-shift` | disables context shift on infinite text generation (default: enabled)<br/>(env: LLAMA_ARG_NO_CONTEXT_SHIFT) |
-| `--context-shift` | enables context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
+| `--context-shift, --no-context-shift` | whether to use context shift on infinite text generation (default: disabled)<br/>(env: LLAMA_ARG_CONTEXT_SHIFT) |
| `-r, --reverse-prompt PROMPT` | halt generation at PROMPT, return control in interactive mode<br/> |
| `-sp, --special` | special tokens output enabled (default: false) |
-| `--no-warmup` | skip warming up the model with an empty run |
+| `--warmup, --no-warmup` | whether to perform warmup with an empty run (default: enabled) |
| `--spm-infill` | use Suffix/Prefix/Middle pattern for infill (instead of Prefix/Suffix/Middle) as some models prefer this. (default: disabled) |
| `--pooling {none,mean,cls,last,rank}` | pooling type for embeddings, use model default if unspecified<br/>(env: LLAMA_ARG_POOLING) |
-| `-cb, --cont-batching` | enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
-| `-nocb, --no-cont-batching` | disable continuous batching<br/>(env: LLAMA_ARG_NO_CONT_BATCHING) |
+| `-cb, --cont-batching, -nocb, --no-cont-batching` | whether to enable continuous batching (a.k.a dynamic batching) (default: enabled)<br/>(env: LLAMA_ARG_CONT_BATCHING) |
| `-mm, --mmproj FILE` | path to a multimodal projector file. see tools/mtmd/README.md<br/>note: if -hf is used, this argument can be omitted<br/>(env: LLAMA_ARG_MMPROJ) |
| `-mmu, --mmproj-url URL` | URL to a multimodal projector file. see tools/mtmd/README.md<br/>(env: LLAMA_ARG_MMPROJ_URL) |
-| `--no-mmproj` | explicitly disable multimodal projector, useful when using -hf<br/>(env: LLAMA_ARG_NO_MMPROJ) |
-| `--no-mmproj-offload` | do not offload multimodal projector to GPU<br/>(env: LLAMA_ARG_NO_MMPROJ_OFFLOAD) |
+| `--mmproj-auto, --no-mmproj, --no-mmproj-auto` | whether to use multimodal projector file (if available), useful when using -hf (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_AUTO) |
+| `--mmproj-offload, --no-mmproj-offload` | whether to enable GPU offloading for multimodal projector (default: enabled)<br/>(env: LLAMA_ARG_MMPROJ_OFFLOAD) |
| `--image-min-tokens N` | minimum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MIN_TOKENS) |
| `--image-max-tokens N` | maximum number of tokens each image can take, only used by vision models with dynamic resolution (default: read from model)<br/>(env: LLAMA_ARG_IMAGE_MAX_TOKENS) |
| `--override-tensor-draft, -otd <tensor name pattern>=<buffer type>,...` | override tensor buffer type for draft model |
| `--port PORT` | port to listen (default: 8080)<br/>(env: LLAMA_ARG_PORT) |
| `--path PATH` | path to serve static files from (default: )<br/>(env: LLAMA_ARG_STATIC_PATH) |
| `--api-prefix PREFIX` | prefix path the server serves from, without the trailing slash (default: )<br/>(env: LLAMA_ARG_API_PREFIX) |
-| `--no-webui` | Disable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_NO_WEBUI) |
+| `--webui, --no-webui` | whether to enable the Web UI (default: enabled)<br/>(env: LLAMA_ARG_WEBUI) |
| `--embedding, --embeddings` | restrict to only support embedding use case; use only with dedicated embedding models (default: disabled)<br/>(env: LLAMA_ARG_EMBEDDINGS) |
| `--reranking, --rerank` | enable reranking endpoint on server (default: disabled)<br/>(env: LLAMA_ARG_RERANKING) |
| `--api-key KEY` | API key to use for authentication (default: none)<br/>(env: LLAMA_API_KEY) |
| `--cache-reuse N` | min chunk size to attempt reusing from the cache via KV shifting (default: 0)<br/>[(card)](https://ggml.ai/f0.png)<br/>(env: LLAMA_ARG_CACHE_REUSE) |
| `--metrics` | enable prometheus compatible metrics endpoint (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_METRICS) |
| `--props` | enable changing global properties via POST /props (default: disabled)<br/>(env: LLAMA_ARG_ENDPOINT_PROPS) |
-| `--slots` | enable slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
-| `--no-slots` | disables slots monitoring endpoint<br/>(env: LLAMA_ARG_NO_ENDPOINT_SLOTS) |
+| `--slots, --no-slots` | whether to expose the slots monitoring endpoint (default: enabled)<br/>(env: LLAMA_ARG_ENDPOINT_SLOTS) |
| `--slot-save-path PATH` | path to save slot kv cache (default: disabled) |
+| `--media-path PATH` | directory for loading local media files; files can be accessed via file:// URLs using relative paths (default: disabled) |
| `--models-dir PATH` | directory containing models for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_DIR) |
+| `--models-preset PATH` | path to INI file containing model presets for the router server (default: disabled)<br/>(env: LLAMA_ARG_MODELS_PRESET) |
| `--models-max N` | for router server, maximum number of models to load simultaneously (default: 4, 0 = unlimited)<br/>(env: LLAMA_ARG_MODELS_MAX) |
-| `--models-allow-extra-args` | for router server, allow extra arguments for models; important: some arguments can allow users to access local file system, use with caution (default: disabled)<br/>(env: LLAMA_ARG_MODELS_ALLOW_EXTRA_ARGS) |
-| `--no-models-autoload` | disables automatic loading of models (default: enabled)<br/>(env: LLAMA_ARG_NO_MODELS_AUTOLOAD) |
-| `--jinja` | use jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_JINJA) |
-| `--no-jinja` | disable jinja template for chat (default: enabled)<br/><br/>(env: LLAMA_ARG_NO_JINJA) |
+| `--models-autoload, --no-models-autoload` | for router server, whether to automatically load models (default: enabled)<br/>(env: LLAMA_ARG_MODELS_AUTOLOAD) |
+| `--jinja, --no-jinja` | whether to use jinja template engine for chat (default: enabled)<br/>(env: LLAMA_ARG_JINJA) |
| `--reasoning-format FORMAT` | controls whether thought tags are allowed and/or extracted from the response, and in which format they're returned; one of:<br/>- none: leaves thoughts unparsed in `message.content`<br/>- deepseek: puts thoughts in `message.reasoning_content`<br/>- deepseek-legacy: keeps `<think>` tags in `message.content` while also populating `message.reasoning_content`<br/>(default: auto)<br/>(env: LLAMA_ARG_THINK) |
| `--reasoning-budget N` | controls the amount of thinking allowed; currently only one of: -1 for unrestricted thinking budget, or 0 to disable thinking (default: -1)<br/>(env: LLAMA_ARG_THINK_BUDGET) |
| `--chat-template JINJA_TEMPLATE` | set custom jinja chat template (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE) |
| `--chat-template-file JINJA_TEMPLATE_FILE` | set custom jinja chat template file (default: template taken from model's metadata)<br/>if suffix/prefix are specified, template will be disabled<br/>only commonly used templates are accepted (unless --jinja is set before this flag):<br/>list of built-in templates:<br/>bailing, bailing-think, bailing2, chatglm3, chatglm4, chatml, command-r, deepseek, deepseek2, deepseek3, exaone3, exaone4, falcon3, gemma, gigachat, glmedge, gpt-oss, granite, grok-2, hunyuan-dense, hunyuan-moe, kimi-k2, llama2, llama2-sys, llama2-sys-bos, llama2-sys-strip, llama3, llama4, megrez, minicpm, mistral-v1, mistral-v3, mistral-v3-tekken, mistral-v7, mistral-v7-tekken, monarch, openchat, orion, pangu-embedded, phi3, phi4, rwkv-world, seed_oss, smolvlm, vicuna, vicuna-orca, yandex, zephyr<br/>(env: LLAMA_ARG_CHAT_TEMPLATE_FILE) |
-| `--no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_NO_PREFILL_ASSISTANT) |
+| `--prefill-assistant, --no-prefill-assistant` | whether to prefill the assistant's response if the last message is an assistant message (default: prefill enabled)<br/>when this flag is set, if the last message is an assistant message then it will be treated as a full message and not prefilled<br/><br/>(env: LLAMA_ARG_PREFILL_ASSISTANT) |
| `-sps, --slot-prompt-similarity SIMILARITY` | how much the prompt of a request must match the prompt of a slot in order to use that slot (default: 0.10, 0.0 = disabled)<br/> |
| `--lora-init-without-apply` | load LoRA adapters without applying them (apply later via POST /lora-adapters) (default: disabled) |
| `-td, --threads-draft N` | number of threads to use during generation (default: same as --threads) |
Note: If both a command line argument and an environment variable are set for the same param, the argument takes precedence over the env var.
+For boolean options such as `--mmap` or `--kv-offload`, the environment variable is interpreted as follows (using `LLAMA_ARG_MMAP` as an example):
+- `LLAMA_ARG_MMAP=true` enables the option; other accepted truthy values are `1`, `on`, and `enabled`
+- `LLAMA_ARG_MMAP=false` disables the option; other accepted falsey values are `0`, `off`, and `disabled`
+- If `LLAMA_ARG_NO_MMAP` is set, the option is disabled regardless of the variable's value (this negative form is kept for backward compatibility)
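+A minimal sketch of how this could look inside the `environment:` section of a service definition (the variables chosen here are only illustrative; a full compose example follows below):
+```yml
+    environment:
+      LLAMA_ARG_MMAP: "false"       # disable mmap, same effect as --no-mmap
+      LLAMA_ARG_KV_OFFLOAD: "on"    # enable KV cache offloading, same effect as --kv-offload
+      LLAMA_ARG_NO_WEBUI: "1"       # legacy negative form, disables the Web UI regardless of value
+```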
+
Example usage of docker compose with environment variables:
```yml