int32_t n_decoded = 0;
int32_t n_remaining = -1;
int32_t i_batch = -1;
+ int32_t n_predict = -1;
int32_t num_prompt_tokens = 0;
int32_t num_prompt_tokens_processed = 0;
slot.id = i;
slot.n_ctx = n_ctx_slot;
+ slot.n_predict = params.n_predict;
LOG_TEE(" -> Slot %i - max context: %i\n", slot.id, n_ctx_slot);
slot->sparams.grammar = json_value(data, "grammar", default_sparams.grammar);
slot->sparams.n_probs = json_value(data, "n_probs", default_sparams.n_probs);
+ if (slot->n_predict > 0 && slot->params.n_predict > slot->n_predict) {
+ // Might be better to reject the request with a 400 error?
+ LOG_WARNING("Max tokens to predict exceeds server configuration", {
+ {"params.n_predict", slot->params.n_predict},
+ {"slot.n_predict", slot->n_predict},
+ });
+ slot->params.n_predict = slot->n_predict;
+ }
+
// infill
if (data.count("input_prefix") != 0)
{
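The clamp added above can be exercised in isolation. Below is a minimal standalone sketch of the same rule, assuming the usual semantics (n_predict <= 0 means unlimited); the name clamp_n_predict and the plain fprintf warning are hypothetical stand-ins for the slot machinery and LOG_WARNING above, not the server's actual code path.

#include <cstdint>
#include <cstdio>

// requested:    per-request value (params.n_predict from the JSON body)
// server_limit: per-slot cap (slot.n_predict; <= 0 means no cap)
static int32_t clamp_n_predict(int32_t requested, int32_t server_limit) {
    if (server_limit > 0 && requested > server_limit) {
        // the server logs a warning instead of rejecting with a 400
        fprintf(stderr, "warning: n_predict %d exceeds server limit %d, clamping\n",
                (int) requested, (int) server_limit);
        return server_limit;
    }
    return requested;
}

int main() {
    printf("%d\n", (int) clamp_n_predict(4096,  128)); // clamped to 128
    printf("%d\n", (int) clamp_n_predict(  64,  128)); // within the limit: 64
    printf("%d\n", (int) clamp_n_predict(4096,   -1)); // -1 = unlimited: 4096
}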
return json {
{"n_ctx", slot.n_ctx},
+ {"n_predict", slot.n_predict},
{"model", params.model_alias},
{"seed", slot.params.seed},
{"temperature", slot.sparams.temp},
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
printf(" --log-disable disables logging to a file.\n");
printf("\n");
+ printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
printf(" --override-kv KEY=TYPE:VALUE\n");
printf(" advanced option to override model metadata by key. may be specified multiple times.\n");
printf(" types: int, float, bool. example: --override-kv tokenizer.ggml.add_bos_token=bool:false\n");
printf(" -gan N, --grp-attn-n N set the group attention factor to extend context size through self-extend(default: 1=disabled), used together with group attention width `--grp-attn-w`");
printf(" -gaw N, --grp-attn-w N set the group attention width to extend context size through self-extend(default: 512), used together with group attention factor `--grp-attn-n`");
printf(" --chat-template FORMAT_NAME");
- printf(" set chat template, possible valus is: llama2, chatml (default %s)", sparams.chat_template.c_str());
+ printf(" set chat template, possible value is: llama2, chatml (default %s)", sparams.chat_template.c_str());
printf("\n");
}