using json = nlohmann::ordered_json;
+//
+// Environment variable utils
+//
+
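+// get_env() copies the value of an environment variable into `target`,
+// leaving `target` unchanged when the variable is not set. The overloads
+// below are selected via SFINAE on the target type: std::string,
+// non-bool integral, floating point, and bool.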
+template<typename T>
+static typename std::enable_if<std::is_same<T, std::string>::value, void>::type
+get_env(std::string name, T & target) {
+    char * value = std::getenv(name.c_str());
+    target = value ? std::string(value) : target;
+}
+
+template<typename T>
+static typename std::enable_if<!std::is_same<T, bool>::value && std::is_integral<T>::value, void>::type
+get_env(std::string name, T & target) {
+    char * value = std::getenv(name.c_str());
+    target = value ? std::stoi(value) : target;
+}
+
+template<typename T>
+static typename std::enable_if<std::is_floating_point<T>::value, void>::type
+get_env(std::string name, T & target) {
+    char * value = std::getenv(name.c_str());
+    target = value ? std::stof(value) : target;
+}
+
+template<typename T>
+static typename std::enable_if<std::is_same<T, bool>::value, void>::type
+get_env(std::string name, T & target) {
+    char * value = std::getenv(name.c_str());
+    if (value) {
+        std::string val(value);
+        target = val == "1" || val == "true";
+    }
+}
+
//
// CPU utils
//
// CLI argument parsing
//
-void gpt_params_handle_hf_token(gpt_params & params) {
-    if (params.hf_token.empty() && std::getenv("HF_TOKEN")) {
-        params.hf_token = std::getenv("HF_TOKEN");
-    }
-}
-
void gpt_params_handle_model_default(gpt_params & params) {
if (!params.hf_repo.empty()) {
// short-hand to avoid specifying --hf-file -> default it to --model
gpt_params_handle_model_default(params);
-    gpt_params_handle_hf_token(params);
+    if (params.hf_token.empty()) {
+        get_env("HF_TOKEN", params.hf_token);
+    }
if (params.escape) {
string_process_escapes(params.prompt);
return true;
}
+void gpt_params_parse_from_env(gpt_params & params) {
+    // we only care about server-related params for now
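+    // note: a variable that is set always overrides the current value of the
+    // parameter; unset variables leave it unchanged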
+ get_env("LLAMA_ARG_MODEL", params.model);
+ get_env("LLAMA_ARG_THREADS", params.n_threads);
+ get_env("LLAMA_ARG_CTX_SIZE", params.n_ctx);
+ get_env("LLAMA_ARG_N_PARALLEL", params.n_parallel);
+ get_env("LLAMA_ARG_BATCH", params.n_batch);
+ get_env("LLAMA_ARG_UBATCH", params.n_ubatch);
+ get_env("LLAMA_ARG_N_GPU_LAYERS", params.n_gpu_layers);
+ get_env("LLAMA_ARG_THREADS_HTTP", params.n_threads_http);
+ get_env("LLAMA_ARG_CHAT_TEMPLATE", params.chat_template);
+ get_env("LLAMA_ARG_N_PREDICT", params.n_predict);
+ get_env("LLAMA_ARG_ENDPOINT_METRICS", params.endpoint_metrics);
+ get_env("LLAMA_ARG_ENDPOINT_SLOTS", params.endpoint_slots);
+ get_env("LLAMA_ARG_EMBEDDINGS", params.embedding);
+ get_env("LLAMA_ARG_FLASH_ATTN", params.flash_attn);
+ get_env("LLAMA_ARG_DEFRAG_THOLD", params.defrag_thold);
+}
+
bool gpt_params_parse(int argc, char ** argv, gpt_params & params) {
const auto params_org = params; // the example can modify the default params
--log-append Don't truncate the old log file.
```
+Available environment variables (if set, they override the corresponding command-line arguments):
+
+- `LLAMA_CACHE` (cache directory, used by `--hf-repo`)
+- `HF_TOKEN` (Hugging Face access token, used when accessing a gated model with `--hf-repo`)
+- `LLAMA_ARG_MODEL`
+- `LLAMA_ARG_THREADS`
+- `LLAMA_ARG_CTX_SIZE`
+- `LLAMA_ARG_N_PARALLEL`
+- `LLAMA_ARG_BATCH`
+- `LLAMA_ARG_UBATCH`
+- `LLAMA_ARG_N_GPU_LAYERS`
+- `LLAMA_ARG_THREADS_HTTP`
+- `LLAMA_ARG_CHAT_TEMPLATE`
+- `LLAMA_ARG_N_PREDICT`
+- `LLAMA_ARG_ENDPOINT_METRICS`
+- `LLAMA_ARG_ENDPOINT_SLOTS`
+- `LLAMA_ARG_EMBEDDINGS`
+- `LLAMA_ARG_FLASH_ATTN`
+- `LLAMA_ARG_DEFRAG_THOLD`
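+
+For example (assuming the server binary has been built as `llama-server`; replace the model path with your own):
+
+```bash
+# Values set in the environment override any conflicting command-line arguments.
+LLAMA_ARG_MODEL=./models/model.gguf \
+LLAMA_ARG_CTX_SIZE=4096 \
+LLAMA_ARG_N_GPU_LAYERS=99 \
+./llama-server --host 0.0.0.0 --port 8080
+```
+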
## Build