- `--grp-attn-n`: Set the group attention factor to extend the context size through self-extend (default: 1 = disabled); used together with the group attention width `--grp-attn-w`
- `--grp-attn-w`: Set the group attention width to extend the context size through self-extend (default: 512); used together with the group attention factor `--grp-attn-n`
- `-n, --n-predict`: Set the maximum tokens to predict (default: -1)
+- `--slots-endpoint-disable`: Disables the slots state monitoring endpoint (see the launch example below). The slots state may contain user data, including prompts.
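+
+As an illustrative invocation (the model path, context size, and self-extend values here are examples, not defaults), the flags above can be combined when starting the server:
+
+```sh
+# enable self-extend (factor 4, width 2048) and turn off the /slots endpoint
+./server -m models/llama-2-7b-32k-instruct.Q2_K.gguf -c 8192 \
+    --grp-attn-n 4 --grp-attn-w 2048 --slots-endpoint-disable
+```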
## Build
}'
```
+- **GET** `/slots`: Returns the current slots processing state. Can be disabled with `--slots-endpoint-disable`.
+
+### Result JSON
+
+```json
+[
+ {
+ "dynatemp_exponent": 1.0,
+ "dynatemp_range": 0.0,
+ "frequency_penalty": 0.0,
+ "grammar": "",
+ "id": 0,
+ "ignore_eos": false,
+ "logit_bias": [],
+ "min_p": 0.05000000074505806,
+ "mirostat": 0,
+ "mirostat_eta": 0.10000000149011612,
+ "mirostat_tau": 5.0,
+ "model": "llama-2-7b-32k-instruct.Q2_K.gguf",
+ "n_ctx": 2048,
+ "n_keep": 0,
+ "n_predict": 100000,
+ "n_probs": 0,
+ "next_token": {
+ "has_next_token": true,
+ "n_remain": -1,
+ "num_tokens_predicted": 0,
+ "stopped_eos": false,
+ "stopped_limit": false,
+ "stopped_word": false,
+ "stopping_word": ""
+ },
+ "penalize_nl": true,
+ "penalty_prompt_tokens": [],
+ "presence_penalty": 0.0,
+ "prompt": "Say hello to llama.cpp",
+ "repeat_last_n": 64,
+ "repeat_penalty": 1.100000023841858,
+ "samplers": [
+ "top_k",
+ "tfs_z",
+ "typical_p",
+ "top_p",
+ "min_p",
+ "temperature"
+ ],
+ "seed": 42,
+ "state": 1,
+ "stop": [
+ "\n"
+ ],
+ "stream": false,
+ "task_id": 0,
+ "temperature": 0.0,
+ "tfs_z": 1.0,
+ "top_k": 40,
+ "top_p": 0.949999988079071,
+ "typical_p": 1.0,
+ "use_penalty_prompt_tokens": false
+ }
+]
+```
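+
+While the server is running, the slots state can be fetched with a plain GET request (host and port below assume the server defaults):
+
+```sh
+curl http://localhost:8080/slots
+```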
+
## More examples
### Change system prompt on runtime
int32_t port = 8080;
int32_t read_timeout = 600;
int32_t write_timeout = 600;
+ bool slots_endpoint = true;
};
bool server_verbose = false;
printf(" set a file to load a system prompt (initial prompt of all slots), this is useful for chat applications.\n");
printf(" --mmproj MMPROJ_FILE path to a multimodal projector file for LLaVA.\n");
printf(" --log-disable disables logging to a file.\n");
+ printf(" --slots-endpoint-disable disables slots monitoring endpoint.\n");
printf("\n");
printf(" -n, --n-predict maximum tokens to predict (default: %d)\n", params.n_predict);
printf(" --override-kv KEY=TYPE:VALUE\n");
log_set_target(stdout);
LOG_INFO("logging to file is disabled.", {});
}
+ else if (arg == "--slots-endpoint-disable")
+ {
+ sparams.slots_endpoint = false;
+ }
else if (arg == "--chat-template")
{
if (++i >= argc)
}
});
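+    // Expose the per-slot processing state as JSON; skipped when --slots-endpoint-disable is set.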
+ if (sparams.slots_endpoint) {
+ svr.Get("/slots", [&](const httplib::Request&, httplib::Response& res) {
+ json slots;
+ for (llama_client_slot & slot : llama.slots) {
+ json slot_data = llama.get_formated_generation(slot);
+ slot_data["id"] = slot.id;
+ slot_data["task_id"] = slot.task_id;
+ slot_data["state"] = slot.state;
+ slot_data["prompt"] = slot.prompt;
+ slot_data["next_token"] = {
+ {"has_next_token", slot.has_next_token},
+ {"n_remain", slot.n_remaining},
+ {"num_tokens_predicted", slot.n_decoded},
+ {"stopped_eos", slot.stopped_eos},
+ {"stopped_word", slot.stopped_word},
+ {"stopped_limit", slot.stopped_limit},
+ {"stopping_word", slot.stopping_word},
+ };
+
+ slots.push_back(slot_data);
+ }
+ res.set_content(slots.dump(), "application/json");
+ res.status = 200; // HTTP OK
+ });
+ }
+
svr.set_logger(log_server_request);
svr.set_exception_handler([](const httplib::Request &, httplib::Response &res, std::exception_ptr ep)