bool penalize_nl = false; // consider newlines as a repeatable token
bool ignore_eos = false;
bool no_perf = false; // disable performance metrics
+ bool timing_per_token = false;
std::vector<std::string> dry_sequence_breakers = {"\n", ":", "\"", "*"}; // default sequence breakers for DRY
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
+ `timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`
+
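For example, a minimal sketch of a client that enables `timings_per_token` on a streaming request and prints the per-token `timings` object from each chunk (not part of this change; it assumes a server listening on `http://localhost:8080` and uses the `requests` package):

```python
# Minimal sketch, not part of this PR: stream a chat completion with
# per-token timings enabled and print the "timings" object of each chunk.
# Assumes llama-server is listening on http://localhost:8080.
import json
import requests

with requests.post(
    "http://localhost:8080/chat/completions",
    json={
        "messages": [{"role": "user", "content": "test"}],
        "max_tokens": 10,
        "stream": True,
        "timings_per_token": True,
    },
    stream=True,
) as res:
    for line in res.iter_lines():
        # server-sent events: payload lines look like "data: {...}"
        if not line.startswith(b"data: ") or line == b"data: [DONE]":
            continue
        chunk = json.loads(line[len(b"data: "):])
        print(chunk.get("timings"))
```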
**Response format**
- Note: When using streaming mode (`stream`), only `content` and `stop` will be returned until end of completion.
bool stopped_word = false;
bool stopped_limit = false;
+ bool timings_per_token = false;
+
bool oaicompat = false;
std::string oaicompat_model;
slot.oaicompat_model = "";
}
+ slot.timings_per_token = json_value(data, "timings_per_token", false);
+
slot.params.stream = json_value(data, "stream", false);
slot.params.cache_prompt = json_value(data, "cache_prompt", true);
slot.params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
{"speculative.n_max", slot.params.speculative.n_max},
{"speculative.n_min", slot.params.speculative.n_min},
{"speculative.p_min", slot.params.speculative.p_min},
+ {"timings_per_token", slot.timings_per_token},
};
}
res.data["model"] = slot.oaicompat_model;
}
+ if (slot.timings_per_token) {
+ res.data["timings"] = slot.get_formated_timings();
+ }
+
queue_results.send(res);
}
common_sampler_accept(slot.smpl, id, true);
slot.n_decoded += 1;
+
+ const int64_t t_current = ggml_time_us();
+
if (slot.n_decoded == 1) {
- slot.t_start_generation = ggml_time_us();
+ slot.t_start_generation = t_current;
slot.t_prompt_processing = (slot.t_start_generation - slot.t_start_process_prompt) / 1e3;
metrics.on_prompt_eval(slot);
}
+ slot.t_token_generation = (t_current - slot.t_start_generation) / 1e3; // generation time so far, in ms (ggml_time_us() returns microseconds)
+
completion_token_output result;
result.tok = id;
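Since `ggml_time_us()` returns microseconds, the `/ 1e3` above keeps `t_prompt_processing` and `t_token_generation` in milliseconds; the per-second figures checked by the test below are then simple ratios. A rough sketch of the arithmetic (the server builds the actual `timings` object in `get_formated_timings()`, which this diff does not show):

```python
# Rough sketch of the tokens-per-second arithmetic; names and numbers are
# illustrative, the real conversion happens server-side.
def per_second(n_tokens: int, elapsed_ms: float) -> float:
    # 1e3 converts tokens per millisecond into tokens per second
    return 1e3 * n_tokens / elapsed_ms if elapsed_ms > 0 else 0.0

# e.g. 10 tokens generated in 250 ms -> 40.0 tokens/s
print(per_second(10, 250.0))
```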
})
assert res.status_code == 400 or res.status_code == 500
assert "error" in res.body
+
+
+def test_chat_completion_with_timings_per_token():
+ global server
+ server.start()
+ res = server.make_stream_request("POST", "/chat/completions", data={
+ "max_tokens": 10,
+ "messages": [{"role": "user", "content": "test"}],
+ "stream": True,
+ "timings_per_token": True,
+ })
+ for data in res:
+ assert "timings" in data
+ assert "prompt_per_second" in data["timings"]
+ assert "predicted_per_second" in data["timings"]
+ assert "predicted_n" in data["timings"]
+ assert data["timings"]["predicted_n"] <= 10
res["completion_probabilities"] = json_value(result, "completion_probabilities", json::array());
}
+ if (result.contains("timings")) {
+ res.push_back({"timings", json_value(result, "timings", json::object())});
+ }
+
return res;
}
{"model", modelname},
{"object", "chat.completion.chunk"}
};
+
+ if (result.contains("timings")) {
+ ret.push_back({"timings", json_value(result, "timings", json::object())});
+ }
+
if (!finish_reason.empty()) {
int num_tokens_predicted = json_value(result, "tokens_predicted", 0);
int num_prompt_tokens = json_value(result, "tokens_evaluated", 0);
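With the `push_back` above, the `timings` object appears as a top-level field of each streamed `chat.completion.chunk`, alongside the usual OpenAI-compatible fields. An illustrative chunk (values are invented; only keys appearing in this diff and in the test are shown):

```python
# Illustrative shape only; values are made up and other standard
# chat.completion.chunk fields (e.g. "choices") are omitted.
chunk = {
    "object": "chat.completion.chunk",
    "model": "example-model",
    "timings": {
        "prompt_per_second": 123.4,
        "predicted_per_second": 45.6,
        "predicted_n": 7,
    },
}
```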