string_format("size of the prompt context (default: %d, 0 = loaded from model)", params.n_ctx),
[](common_params & params, int value) {
params.n_ctx = value;
+ if (value == 0) {
+ // disable context reduction in llama_params_fit if the user explicitly requests the full context size:
+ params.fit_params_min_ctx = UINT32_MAX;
+ }
}
).set_env("LLAMA_ARG_CTX_SIZE"));
add_opt(common_arg(
// - returns a status indicating whether the parameters could be successfully modified to fit device memory
// - this function is NOT thread safe because it modifies the global llama logger state
// - only parameters that have the same value as in llama_default_model_params are modified
+ // with the exception of the context size, which is modified if and only if it is equal to 0
LLAMA_API enum llama_params_fit_status llama_params_fit(
const char * path_model,
struct llama_model_params * mparams,
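The modification policy in the comment block above can be illustrated stand-alone; the struct, defaults, and helpers below are hypothetical, only the policy itself comes from the comments:

#include <cstdint>

struct fit_policy_sketch {  // hypothetical stand-in for the fitted parameters
    uint32_t n_ctx;         // 0 = loaded from model
    int32_t  n_gpu_layers;  // example of a parameter with a llama default
};

// hypothetical stand-in for the llama default parameters
static const fit_policy_sketch k_defaults = { 0, -1 };

// a regular parameter may be auto-fitted only if the user left it at its default value ...
static bool may_fit_n_gpu_layers(const fit_policy_sketch & p) {
    return p.n_gpu_layers == k_defaults.n_gpu_layers;
}

// ... while the context size is fitted if and only if it is equal to 0
static bool may_fit_n_ctx(const fit_policy_sketch & p) {
    return p.n_ctx == 0;
}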
__func__, hp_nct, cparams->n_ctx, memory_reduction/MiB);
}
} else {
- LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
- __func__, hp_nct, n_ctx_min);
+ if (n_ctx_min == UINT32_MAX) {
+ LLAMA_LOG_INFO("%s: user has requested full context size of %" PRIu32 " -> no change\n", __func__, hp_nct);
+ } else {
+ LLAMA_LOG_INFO("%s: default model context size is %" PRIu32 " which is <= the min. context size of %" PRIu32 " -> no change\n",
+ __func__, hp_nct, n_ctx_min);
+ }
}
} else {
LLAMA_LOG_INFO("%s: context size set by user to %" PRIu32 " -> no change\n", __func__, cparams->n_ctx);
LOG_INF("%s: printing fitted CLI arguments to stdout...\n", __func__);
common_log_flush(common_log_main());
- printf("-c %" PRIu32 " -ngl %" PRIu32, cparams.n_ctx, mparams.n_gpu_layers);
+ printf("-c %" PRIu32 " -ngl %" PRIi32, cparams.n_ctx, mparams.n_gpu_layers);
size_t nd = llama_max_devices();
while (nd > 1 && mparams.tensor_split[nd - 1] == 0.0f) {
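The trimming loop is cut off here. As a hedged guess at the complete pattern, assuming the tool appends a -ts argument only when more than one device receives a nonzero share (the loop body and the printing code are assumptions, not the PR's code):

#include <cstddef>
#include <cstdio>

static void print_tensor_split(const float * tensor_split, size_t n_devices) {
    size_t nd = n_devices;
    while (nd > 1 && tensor_split[nd - 1] == 0.0f) {
        nd--; // drop trailing devices with a zero share
    }
    if (nd > 1) {
        printf(" -ts ");
        for (size_t i = 0; i < nd; i++) {
            printf(i == 0 ? "%g" : ",%g", (double) tensor_split[i]);
        }
    }
}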