`cache_prompt`: Re-use the KV cache from a previous request if possible. This way the common prefix does not have to be re-processed, only the suffix that differs between the requests. Because (depending on the backend) the logits are **not** guaranteed to be bit-for-bit identical for different batch sizes (prompt processing vs. token generation), enabling this option can cause nondeterministic results. Default: `true`
+`return_tokens`: Return the raw generated token ids in the `tokens` field (see the example request below). Otherwise `tokens` remains empty. Default: `false`
+
`samplers`: The order the samplers should be applied in. An array of strings representing sampler type names. If a sampler is not set, it will not be used. If a sampler is specified more than once, it will be applied multiple times. Default: `["dry", "top_k", "typ_p", "top_p", "min_p", "xtc", "temperature"]` - these are all the available values.
`timings_per_token`: Include prompt processing and text generation speed information in each response. Default: `false`
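For example, a request that opts into raw token ids could look like the sketch below (assumptions: a server listening on `localhost:8080`, a placeholder prompt, and an illustrative sampler chain):

```python
# Minimal sketch of a /completion request exercising the options above.
# Assumes a llama-server instance listening on localhost:8080; the prompt and
# the sampler chain are placeholders.
import requests

resp = requests.post("http://localhost:8080/completion", json={
    "prompt": "Building a website can be done in 10 simple steps:",
    "n_predict": 32,
    "cache_prompt": True,   # re-use the KV cache of the shared prefix when possible
    "return_tokens": True,  # also return the raw token ids in "tokens"
    "samplers": ["top_k", "top_p", "min_p", "temperature"],
    "timings_per_token": False,
})
resp.raise_for_status()
body = resp.json()
print(body["content"])  # generated text
print(body["tokens"])   # raw token ids (empty unless "return_tokens" is true)
```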
**Response format**
-- Note: In streaming mode (`stream`), only `content` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
+- Note: In streaming mode (`stream`), only `content`, `tokens` and `stop` will be returned until end of completion. Responses are sent using the [Server-sent events](https://html.spec.whatwg.org/multipage/server-sent-events.html) standard. Note: the browser's `EventSource` interface cannot be used due to its lack of `POST` request support.
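Because `EventSource` cannot be used, a client has to issue the `POST` itself and split the stream on `data: ` lines. A minimal sketch (assuming a server listening on `localhost:8080`):

```python
# Minimal sketch of consuming the /completion event stream.
# Assumes a llama-server instance listening on localhost:8080.
import json
import requests

with requests.post("http://localhost:8080/completion",
                   json={"prompt": "Hello", "n_predict": 16, "stream": True},
                   stream=True) as resp:
    resp.raise_for_status()
    content = ""
    for line in resp.iter_lines():
        if not line.startswith(b"data: "):
            continue  # skip empty keep-alive lines
        chunk = json.loads(line[len(b"data: "):])
        content += chunk["content"]  # partial text carried by this event
        if chunk["stop"]:            # the final chunk sets "stop": true
            break
print(content)
```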
- `completion_probabilities`: An array of token probabilities for each completion. The array's length is `n_predict`. Each item in the array has the following structure:
```json
{
- "content": "<the token selected by the model>",
+ "content": "<the token generated by the model>",
+ "tokens": [ generated token ids if requested ],
"probs": [
{
"prob": float,
Notice that each `probs` is an array of length `n_probs`.
- `content`: Completion result as a string (excluding `stopping_word` if any). In case of streaming mode, will contain the next token as a string.
+- `tokens`: Same as `content` but represented as raw token ids. Only populated if `"return_tokens": true` or `"stream": true` in the request.
- `stop`: Boolean for use with `stream` to check whether the generation has stopped (Note: This is not related to stopping words array `stop` from input options)
- `generation_settings`: The provided options above excluding `prompt` but including `n_ctx`, `model`. These options may differ from the original ones in some way (e.g. bad values filtered out, strings converted to tokens, etc.).
- `model`: The path to the model loaded with `-m`
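As an illustration of how `content` and `tokens` relate, the sketch below feeds the returned ids to the server's `/detokenize` endpoint (assuming a server listening on `localhost:8080` and a request made with `"return_tokens": true`):

```python
# Sketch: map the returned token ids back to text via the /detokenize endpoint.
# Assumes the request sets "return_tokens": true and that a llama-server
# instance is listening on localhost:8080.
import requests

base = "http://localhost:8080"
body = requests.post(f"{base}/completion", json={
    "prompt": "I believe the meaning of life is",
    "n_predict": 8,
    "return_tokens": True,
}).json()

print(body["content"])   # completion as a string (stopping word stripped)
print(body["tokens"])    # the same completion as raw token ids

detok = requests.post(f"{base}/detokenize", json={"tokens": body["tokens"]}).json()
print(detok["content"])  # should correspond to body["content"]
```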
};
struct slot_params {
- bool stream = true;
- bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
+ bool stream = true;
+ bool cache_prompt = true; // remember the prompt to avoid reprocessing all prompt
+ bool return_tokens = false;
int32_t n_keep = 0; // number of tokens to keep from initial prompt
int32_t n_discard = 0; // number of tokens after n_keep that may be discarded when shifting context, 0 defaults to half
params.stream = json_value(data, "stream", false);
params.cache_prompt = json_value(data, "cache_prompt", true);
+ params.return_tokens = json_value(data, "return_tokens", false);
params.n_predict = json_value(data, "n_predict", json_value(data, "max_tokens", defaults.n_predict));
params.n_indent = json_value(data, "n_indent", defaults.n_indent);
params.n_keep = json_value(data, "n_keep", defaults.n_keep);
struct server_task_result_cmpl_final : server_task_result {
int index = 0;
- std::string content;
+
+ std::string content;
+ llama_tokens tokens;
+
bool stream;
result_timings timings;
std::string prompt;
json res = json {
{"index", index},
{"content", stream ? "" : content}, // in stream mode, content is already in last partial chunk
+ {"tokens", stream ? llama_tokens {} : tokens},
{"id_slot", id_slot},
{"stop", true},
{"model", oaicompat_model},
json choices = json::array({json{
{"finish_reason", finish_reason},
{"index", 0},
- {"message", json{
+ {"message", json {
{"content", content},
- {"role", "assistant"}
+ {"role", "assistant"}
}
}}});
struct server_task_result_cmpl_partial : server_task_result {
int index = 0;
- std::string content;
+
+ std::string content;
+ llama_tokens tokens;
int32_t n_decoded;
int32_t n_prompt_tokens;
json res = json {
{"index", index},
{"content", content},
+ {"tokens", tokens},
{"stop", false},
{"id_slot", id_slot},
{"tokens_predicted", n_decoded},
json second_ret = json{
{"choices", json::array({json{{"finish_reason", nullptr},
{"index", 0},
- {"delta", json{
+ {"delta", json {
{"content", content}}}
}})},
{"created", t},
{"finish_reason", nullptr},
{"index", 0},
{"delta",
- json{
+ json {
{"content", content},
}},
}});
size_t last_nl_pos = 0;
- std::string generated_text;
+ std::string generated_text;
+ llama_tokens generated_tokens;
+
llama_tokens cache_tokens;
+
std::vector<completion_token_output> generated_token_probs;
bool has_next_token = true;
n_sent_token_probs = 0;
task_type = SERVER_TASK_TYPE_COMPLETION;
+ generated_tokens.clear();
generated_token_probs.clear();
}
const std::string token_str = common_token_to_piece(ctx, result.tok, params_base.special);
slot.sampled = result.tok;
- // search stop word and delete it
slot.generated_text += token_str;
+ if (slot.params.return_tokens) {
+ slot.generated_tokens.push_back(result.tok);
+ }
slot.has_next_token = true;
// check if there is incomplete UTF-8 character at the end
break;
}
+ // search stop word and delete it
if (!incomplete) {
size_t pos = std::min(slot.n_sent_text, slot.generated_text.size());
res->id = slot.id_task;
res->index = slot.index;
res->content = tkn.text_to_send;
+ res->tokens = { tkn.tok };
res->n_decoded = slot.n_decoded;
res->n_prompt_tokens = slot.n_prompt_tokens;
res->index = slot.index;
res->content = slot.generated_text;
+ res->tokens = slot.generated_tokens;
res->timings = slot.get_timings();
res->prompt = common_detokenize(ctx, slot.prompt_tokens, true);
global server
server = ServerPreset.tinyllama2()
-@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
- ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False),
- ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False),
+@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated,return_tokens", [
+ ("I believe the meaning of life is", 8, "(going|bed)+", 18, 8, False, False),
+ ("Write a joke about AI from a very long prompt which will not be truncated", 256, "(princesses|everyone|kids|Anna|forest)+", 46, 64, False, True),
])
-def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool):
+def test_completion(prompt: str, n_predict: int, re_content: str, n_prompt: int, n_predicted: int, truncated: bool, return_tokens: bool):
global server
server.start()
res = server.make_request("POST", "/completion", data={
"n_predict": n_predict,
"prompt": prompt,
+ "return_tokens": return_tokens,
})
assert res.status_code == 200
assert res.body["timings"]["prompt_n"] == n_prompt
assert res.body["truncated"] == truncated
assert type(res.body["has_new_line"]) == bool
assert match_regex(re_content, res.body["content"])
+ if return_tokens:
+ assert len(res.body["tokens"]) > 0
+ assert all(type(tok) == int for tok in res.body["tokens"])
+ else:
+ assert res.body["tokens"] == []
@pytest.mark.parametrize("prompt,n_predict,re_content,n_prompt,n_predicted,truncated", [
assert data["generation_settings"]["seed"] == server.seed
assert match_regex(re_content, content)
else:
+ assert len(data["tokens"]) > 0
+ assert all(type(tok) == int for tok in data["tokens"])
content += data["content"]