```json
{
- "default_generation_settings": { ... },
+ "default_generation_settings": {
+ "id": 0,
+ "id_task": -1,
+ "n_ctx": 1024,
+ "speculative": false,
+ "is_processing": false,
+ "params": {
+ "n_predict": -1,
+ "seed": 4294967295,
+ "temperature": 0.800000011920929,
+ "dynatemp_range": 0.0,
+ "dynatemp_exponent": 1.0,
+ "top_k": 40,
+ "top_p": 0.949999988079071,
+ "min_p": 0.05000000074505806,
+ "xtc_probability": 0.0,
+ "xtc_threshold": 0.10000000149011612,
+ "typical_p": 1.0,
+ "repeat_last_n": 64,
+ "repeat_penalty": 1.0,
+ "presence_penalty": 0.0,
+ "frequency_penalty": 0.0,
+ "dry_multiplier": 0.0,
+ "dry_base": 1.75,
+ "dry_allowed_length": 2,
+ "dry_penalty_last_n": -1,
+ "dry_sequence_breakers": [
+ "\n",
+ ":",
+ "\"",
+ "*"
+ ],
+ "mirostat": 0,
+ "mirostat_tau": 5.0,
+ "mirostat_eta": 0.10000000149011612,
+ "penalize_nl": false,
+ "stop": [],
+ "max_tokens": -1,
+ "n_keep": 0,
+ "n_discard": 0,
+ "ignore_eos": false,
+ "stream": true,
+ "n_probs": 0,
+ "min_keep": 0,
+ "grammar": "",
+ "samplers": [
+ "dry",
+ "top_k",
+ "typ_p",
+ "top_p",
+ "min_p",
+ "xtc",
+ "temperature"
+ ],
+ "speculative.n_max": 16,
+ "speculative.n_min": 5,
+ "speculative.p_min": 0.8999999761581421,
+ "timings_per_token": false
+ },
+ "prompt": "",
+ "next_token": {
+ "has_next_token": true,
+ "has_new_line": false,
+ "n_remain": -1,
+ "n_decoded": 0,
+ "stopping_word": ""
+ }
+ },
"total_slots": 1,
- "chat_template": ""
+ "chat_template": "..."
}
```
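Assuming the response above comes from the server's `/props` endpoint (the `default_generation_settings`, `total_slots`, and `chat_template` fields match that route), a client can read the new nested layout with a plain GET. The sketch below is illustrative only: it uses cpp-httplib and nlohmann::json, and the host, port, and printed fields are assumptions, not part of the change itself.

```cpp
// Minimal sketch: fetch the server properties and read the generation
// defaults, which now sit under a dedicated "params" object.
// Assumes a llama-server instance listening on localhost:8080 and that
// httplib.h and nlohmann/json.hpp are available.
#include <cstdio>
#include "httplib.h"
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    httplib::Client cli("localhost", 8080);

    auto res = cli.Get("/props");
    if (!res || res->status != 200) {
        fprintf(stderr, "GET /props failed\n");
        return 1;
    }

    const json props    = json::parse(res->body);
    const json & slot0  = props.at("default_generation_settings");
    const json & params = slot0.at("params");   // sampling/generation defaults

    printf("n_ctx       = %d\n",   slot0.at("n_ctx").get<int>());
    printf("temperature = %.2f\n", params.at("temperature").get<double>());
    printf("top_k       = %d\n",   params.at("top_k").get<int>());
    printf("total_slots = %d\n",   props.at("total_slots").get<int>());

    return 0;
}
```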
```json
[
- {
- "dynatemp_exponent": 1.0,
- "dynatemp_range": 0.0,
- "frequency_penalty": 0.0,
- "grammar": "",
- "id": 0,
- "ignore_eos": false,
- "is_processing": false,
- "logit_bias": [],
- "min_p": 0.05000000074505806,
- "mirostat": 0,
- "mirostat_eta": 0.10000000149011612,
- "mirostat_tau": 5.0,
- "model": "llama-2-7b-32k-instruct.Q2_K.gguf",
- "n_ctx": 2048,
- "n_keep": 0,
- "n_predict": 100000,
- "n_probs": 0,
- "next_token": {
- "has_next_token": true,
- "n_remain": -1,
- "n_decoded": 0,
- "stopped_eos": false,
- "stopped_limit": false,
- "stopped_word": false,
- "stopping_word": ""
- },
- "penalize_nl": true,
- "presence_penalty": 0.0,
- "prompt": "Say hello to llama.cpp",
- "repeat_last_n": 64,
- "repeat_penalty": 1.100000023841858,
- "samplers": [
- "top_k",
- "typical_p",
- "top_p",
- "min_p",
- "temperature"
- ],
- "seed": 42,
- "stop": [
- "\n"
- ],
- "stream": false,
- "task_id": 0,
- "temperature": 0.0,
- "top_k": 40,
- "top_p": 0.949999988079071,
- "typical_p": 1.0
+ {
+ "id": 0,
+ "id_task": -1,
+ "n_ctx": 1024,
+ "speculative": false,
+ "is_processing": false,
+ "params": {
+ "n_predict": -1,
+ "seed": 4294967295,
+ "temperature": 0.800000011920929,
+ "dynatemp_range": 0.0,
+ "dynatemp_exponent": 1.0,
+ "top_k": 40,
+ "top_p": 0.949999988079071,
+ "min_p": 0.05000000074505806,
+ "xtc_probability": 0.0,
+ "xtc_threshold": 0.10000000149011612,
+ "typical_p": 1.0,
+ "repeat_last_n": 64,
+ "repeat_penalty": 1.0,
+ "presence_penalty": 0.0,
+ "frequency_penalty": 0.0,
+ "dry_multiplier": 0.0,
+ "dry_base": 1.75,
+ "dry_allowed_length": 2,
+ "dry_penalty_last_n": -1,
+ "dry_sequence_breakers": [
+ "\n",
+ ":",
+ "\"",
+ "*"
+ ],
+ "mirostat": 0,
+ "mirostat_tau": 5.0,
+ "mirostat_eta": 0.10000000149011612,
+ "penalize_nl": false,
+ "stop": [],
+ "max_tokens": -1,
+ "n_keep": 0,
+ "n_discard": 0,
+ "ignore_eos": false,
+ "stream": true,
+ "n_probs": 0,
+ "min_keep": 0,
+ "grammar": "",
+ "samplers": [
+ "dry",
+ "top_k",
+ "typ_p",
+ "top_p",
+ "min_p",
+ "xtc",
+ "temperature"
+ ],
+ "speculative.n_max": 16,
+ "speculative.n_min": 5,
+ "speculative.p_min": 0.8999999761581421,
+ "timings_per_token": false
+ },
+ "prompt": "",
+ "next_token": {
+ "has_next_token": true,
+ "has_new_line": false,
+ "n_remain": -1,
+ "n_decoded": 0,
+ "stopping_word": ""
}
+ }
]
```
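The per-slot entries returned here now mirror the `/props` defaults: sampling settings live under `params` and decoding state under `next_token`, rather than being flattened into the top-level object as in the old response. A client that wants to work with both shapes can fall back to the flat layout, as in this sketch (again illustrative: the `/slots` path and the fallback logic are assumptions, not part of the diff):

```cpp
// Minimal sketch: list slot state using the nested "params"/"next_token"
// layout shown above, with a fallback for the older flat layout.
// Assumes httplib.h and nlohmann/json.hpp are available.
#include <cstdio>
#include "httplib.h"
#include <nlohmann/json.hpp>

using json = nlohmann::json;

int main() {
    httplib::Client cli("localhost", 8080);

    auto res = cli.Get("/slots");
    if (!res || res->status != 200) {
        fprintf(stderr, "GET /slots failed\n");
        return 1;
    }

    const json slots = json::parse(res->body);
    for (const json & slot : slots) {
        // new layout: sampling settings under "params"; old layout: top level
        const json & p = slot.contains("params") ? slot.at("params") : slot;

        printf("slot %d: processing=%s temperature=%.2f n_decoded=%d\n",
               slot.at("id").get<int>(),
               slot.at("is_processing").get<bool>() ? "yes" : "no",
               p.at("temperature").get<double>(),
               slot.at("next_token").at("n_decoded").get<int>());
    }

    return 0;
}
```

The server-side counterpart of this shape is the new `server_slot::to_json()` shown below, which consolidates the serialization that the `/slots` handler previously assembled by hand.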
struct common_params_sampling sampling;
struct common_params_speculative speculative;
- // params only used in to_json()
- int32_t n_ctx;
- uint32_t seed_cur;
- bool can_speculative;
-
// OAI-compat fields
bool verbose = false;
bool oaicompat = false;
std::string oaicompat_model;
std::string oaicompat_cmpl_id;
- json to_json() {
+ json to_json() const {
std::vector<std::string> samplers;
samplers.reserve(sampling.samplers.size());
for (const auto & sampler : sampling.samplers) {
}
return json {
- {"n_ctx", n_ctx},
{"n_predict", n_predict}, // Server configured n_predict
+ {"seed", sampling.seed},
{"temperature", sampling.temp},
{"dynatemp_range", sampling.dynatemp_range},
{"dynatemp_exponent", sampling.dynatemp_exponent},
{"min_keep", sampling.min_keep},
{"grammar", sampling.grammar},
{"samplers", samplers},
- {"speculative", can_speculative},
{"speculative.n_max", speculative.n_max},
{"speculative.n_min", speculative.n_min},
{"speculative.p_min", speculative.p_min},
return std::vector<json>({initial_ret, second_ret});
}
} else {
- // Some idiosyncrasy in task processing logic makes several trailing calls
- // with empty content; we ignore these at the callee site.
- if (content.empty()) {
- return std::vector<json>({json::object()});
- }
-
choices = json::array({json{
{"finish_reason", nullptr},
{"index", 0},
llama_batch batch_spec = {};
+ llama_context * ctx = nullptr;
llama_context * ctx_dft = nullptr;
common_speculative * spec = nullptr;
t_token_generation, n_decoded, t_gen, n_gen_second,
t_prompt_processing + t_token_generation, n_prompt_tokens_processed + n_decoded);
}
+
+ json to_json() const {
+ return json {
+ {"id", id},
+ {"id_task", id_task},
+ {"n_ctx", n_ctx},
+ {"speculative", can_speculate()},
+ {"is_processing", is_processing()},
+ {"params", params.to_json()},
+ {"prompt", common_detokenize(ctx, prompt_tokens)},
+ {"next_token",
+ {
+ {"has_next_token", has_next_token},
+ {"has_new_line", has_new_line},
+ {"n_remain", n_remaining},
+ {"n_decoded", n_decoded},
+ {"stopping_word", stopping_word},
+ }
+ },
+ };
+ }
};
struct server_metrics {
server_slot slot;
slot.id = i;
+ slot.ctx = ctx;
slot.n_ctx = n_ctx_slot;
slot.n_predict = params_base.n_predict;
slots.push_back(slot);
}
- default_generation_settings_for_props = slots[0].params.to_json();
- default_generation_settings_for_props["seed"] = -1;
+ default_generation_settings_for_props = slots[0].to_json();
// the update_slots() logic will always submit a maximum of n_batch or n_parallel tokens
// note that n_batch can be > n_ctx (e.g. for non-causal attention models such as BERT where the KV cache is not used)
queue_results.send(std::move(res));
}
- void send_partial_response(server_slot & slot, completion_token_output tkn) {
+ void send_partial_response(server_slot & slot, const completion_token_output & tkn) {
auto res = std::make_unique<server_task_result_cmpl_partial>();
- res->id = slot.id_task;
- res->index = slot.index;
- res->content = tkn.text_to_send;
+
+ res->id = slot.id_task;
+ res->index = slot.index;
+ res->content = tkn.text_to_send;
res->truncated = slot.truncated;
res->n_decoded = slot.n_decoded;
res->n_prompt_tokens = slot.n_prompt_tokens;
- res->stop = slot.stop;
+ res->stop = slot.stop;
res->verbose = slot.params.verbose;
res->oaicompat = slot.params.oaicompat;
// populate res.probs_output
if (slot.params.sampling.n_probs > 0) {
const llama_tokens to_send_toks = common_tokenize(ctx, tkn.text_to_send, false);
+
const size_t probs_pos = std::min(slot.n_sent_token_probs, slot.generated_token_probs.size());
const size_t probs_stop_pos = std::min(slot.n_sent_token_probs + to_send_toks.size(), slot.generated_token_probs.size());
void send_final_response(server_slot & slot) {
if (slot.params.stream) {
// if in stream mode, send the last partial response
- return send_partial_response(slot, {0, "", {}});
+ send_partial_response(slot, {0, "", {}});
+ return;
}
auto res = std::make_unique<server_task_result_cmpl_final>();
std::vector<server_task> tasks;
auto create_task = [&](json & task_data, llama_tokens & prompt_tokens) {
SRV_DBG("create task, n_tokens = %d\n", (int) prompt_tokens.size());
+
server_task task;
task.id = queue_tasks.get_new_id();
task.inf_type = inf_type;
int n_processing_slots = 0;
for (server_slot & slot : slots) {
- json slot_data = slot.params.to_json();
- slot_data["id"] = slot.id;
- slot_data["id_task"] = slot.id_task;
- slot_data["is_processing"] = slot.is_processing();
- slot_data["prompt"] = common_detokenize(ctx, slot.prompt_tokens);
- slot_data["next_token"] = {
- {"has_next_token", slot.has_next_token},
- {"has_new_line", slot.has_new_line},
- {"n_remain", slot.n_remaining},
- {"n_decoded", slot.n_decoded},
- {"stopping_word", slot.stopping_word},
- };
+ json slot_data = slot.to_json();
if (slot.is_processing()) {
n_processing_slots++;
auto res = std::make_unique<server_task_result_metrics>();
res->id = task.id;
+ res->slots_data = std::move(slots_data);
res->n_idle_slots = n_idle_slots;
res->n_processing_slots = n_processing_slots;
res->n_tasks_deferred = queue_tasks.queue_tasks_deferred.size();
res.status = 200;
};
- svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, std::exception_ptr ep) {
+ svr->set_exception_handler([&res_error](const httplib::Request &, httplib::Response & res, const std::exception_ptr & ep) {
std::string message;
try {
std::rethrow_exception(ep);
- } catch (std::exception & e) {
+ } catch (const std::exception & e) {
message = e.what();
} catch (...) {
message = "Unknown Exception";