struct slot_params {
bool stream = true;
+ bool include_usage = false; // OAI-compat: emit a final usage-only chunk when streaming
bool cache_prompt = true; // remember the prompt to avoid reprocessing it
bool return_tokens = false;
bool return_progress = false;
params.verbose = params_base.verbosity > 9;
params.timings_per_token = json_value(data, "timings_per_token", false);
- params.stream = json_value(data, "stream", false);
- params.cache_prompt = json_value(data, "cache_prompt", true);
- params.return_tokens = json_value(data, "return_tokens", false);
- params.return_progress = json_value(data, "return_progress", false);
- params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
- params.n_indent = json_value(data, "n_indent", defaults.n_indent);
- params.n_keep = json_value(data, "n_keep", defaults.n_keep);
- params.n_discard = json_value(data, "n_discard", defaults.n_discard);
- //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
- params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
- params.response_fields = json_value(data, "response_fields", std::vector<std::string>());
+ params.stream = json_value(data, "stream", false);
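+ // OAI-compat: per the OpenAI streaming API, "stream_options": {"include_usage": true}
+ // asks the server to emit one final usage-only chunk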
+ auto stream_opt = json_value(data, "stream_options", json::object());
+ params.include_usage = json_value(stream_opt, "include_usage", false);
+ params.cache_prompt = json_value(data, "cache_prompt", true);
+ params.return_tokens = json_value(data, "return_tokens", false);
+ params.return_progress = json_value(data, "return_progress", false);
+ params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
+ params.n_indent = json_value(data, "n_indent", defaults.n_indent);
+ params.n_keep = json_value(data, "n_keep", defaults.n_keep);
+ params.n_discard = json_value(data, "n_discard", defaults.n_discard);
+ //params.t_max_prompt_ms = json_value(data, "t_max_prompt_ms", defaults.t_max_prompt_ms); // TODO: implement
+ params.t_max_predict_ms = json_value(data, "t_max_predict_ms", defaults.t_max_predict_ms);
+ params.response_fields = json_value(data, "response_fields", std::vector<std::string>());
params.sampling.top_k = json_value(data, "top_k", defaults.sampling.top_k);
params.sampling.top_p = json_value(data, "top_p", defaults.sampling.top_p);
llama_tokens tokens;
bool stream;
+ bool include_usage; // whether to emit the final OAI-compat usage chunk
result_timings timings;
std::string prompt;
{"object", "chat.completion.chunk"},
});
- // OpenAI API spec for chat.completion.chunks specifies an empty `choices` array for the last chunk when including usage
- // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
- deltas.push_back({
- {"choices", json::array()},
- {"created", t},
- {"id", oaicompat_cmpl_id},
- {"model", oaicompat_model},
- {"system_fingerprint", build_info},
- {"object", "chat.completion.chunk"},
- {"usage", json {
- {"completion_tokens", n_decoded},
- {"prompt_tokens", n_prompt_tokens},
- {"total_tokens", n_decoded + n_prompt_tokens},
- }},
- });
+ if (include_usage) {
+ // The OpenAI API spec for chat.completion.chunk objects specifies an empty `choices` array for the last chunk when usage is included
+ // https://platform.openai.com/docs/api-reference/chat_streaming/streaming#chat_streaming/streaming-choices
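+ // e.g. an illustrative final chunk:
+ //   {"object": "chat.completion.chunk", "choices": [],
+ //    "usage": {"prompt_tokens": 5, "completion_tokens": 7, "total_tokens": 12}, ...}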
+ deltas.push_back({
+ {"choices", json::array()},
+ {"created", t},
+ {"id", oaicompat_cmpl_id},
+ {"model", oaicompat_model},
+ {"system_fingerprint", build_info},
+ {"object", "chat.completion.chunk"},
+ {"usage", json {
+ {"completion_tokens", n_decoded},
+ {"prompt_tokens", n_prompt_tokens},
+ {"total_tokens", n_decoded + n_prompt_tokens},
+ }},
+ });
+ }
if (timings.prompt_n >= 0) {
deltas.back().push_back({"timings", timings.to_json()});
res->verbose = slot.params.verbose;
res->stream = slot.params.stream;
+ res->include_usage = slot.params.include_usage;
res->oaicompat = slot.params.oaicompat;
res->oaicompat_model = slot.params.oaicompat_model;
res->oaicompat_cmpl_id = slot.params.oaicompat_cmpl_id;
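
With these changes, a streaming client can opt into the final usage chunk via the OpenAI `stream_options` convention. A minimal request body (illustrative; assumes the server's OAI-compatible `/v1/chat/completions` endpoint):

    {
      "messages": [{"role": "user", "content": "Hello"}],
      "stream": true,
      "stream_options": {"include_usage": true}
    }

When `include_usage` is true, the last SSE chunk carries an empty `choices` array plus a `usage` object built from `n_prompt_tokens` and `n_decoded`; when it is false (the default), no usage chunk is emitted.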