    int index = 0;
    std::vector<float> embedding;
+    int32_t n_tokens;
+
    virtual int get_index() override {
        return index;
    }
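+    // (illustrative) with the new field, to_json() yields something like:
+    //   {"index": 0, "embedding": [...], "tokens_evaluated": 7}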
    virtual json to_json() override {
        return json {
-            {"index", index},
-            {"embedding", embedding},
+            {"index", index},
+            {"embedding", embedding},
+            {"tokens_evaluated", n_tokens},
        };
    }
};
    int index = 0;
    float score = -1e6;
+    int32_t n_tokens;
+
    virtual int get_index() override {
        return index;
    }
    virtual json to_json() override {
        return json {
-            {"index", index},
-            {"score", score},
+            {"index", index},
+            {"score", score},
+            {"tokens_evaluated", n_tokens},
        };
    }
};
auto res = std::make_unique<server_task_result_embd>();
res->id = slot.id_task;
res->index = slot.index;
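+ // number of prompt tokens evaluated for this request; surfaced as "tokens_evaluated" in to_json()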
+ res->n_tokens = slot.n_prompt_tokens;
const int n_embd = llama_n_embd(model);
auto res = std::make_unique<server_task_result_rerank>();
res->id = slot.id_task;
res->index = slot.index;
+ res->n_tokens = slot.n_prompt_tokens;
for (int i = 0; i < batch.n_tokens; ++i) {
    if (!batch.logits[i] || batch.seq_id[i][0] != slot.id) {
vi = res.body['data'][i]['embedding']
for x, y in zip(v0, vi):
    assert abs(x - y) < EPSILON
+
+
+@pytest.mark.parametrize(
+    "content,n_tokens",
+    [
+        ("I believe the meaning of life is", 7),
+        ("This is a test", 4),
+    ]
+)
+def test_embedding_usage_single(content, n_tokens):
+    global server
+    server.start()
+    res = server.make_request("POST", "/embeddings", data={"input": content})
+    assert res.status_code == 200
+    assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
+    assert res.body['usage']['prompt_tokens'] == n_tokens
+
+
+def test_embedding_usage_multiple():
+    global server
+    server.start()
+    res = server.make_request("POST", "/embeddings", data={
+        "input": [
+            "I believe the meaning of life is",
+            "I believe the meaning of life is",
+        ],
+    })
+    assert res.status_code == 200
+    assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
+    assert res.body['usage']['prompt_tokens'] == 2 * 7
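+    # (illustrative) for two identical 7-token inputs the aggregated usage object
+    # should look like: {"prompt_tokens": 14, "total_tokens": 14}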
    })
    assert res.status_code == 400
    assert "error" in res.body
+
+
+@pytest.mark.parametrize(
+    "query,doc1,doc2,n_tokens",
+    [
+        ("Machine learning is", "A machine", "Learning is", 19),
+        ("Which city?", "Machine learning is ", "Paris, capitale de la", 26),
+    ]
+)
+def test_rerank_usage(query, doc1, doc2, n_tokens):
+    global server
+    server.start()
+
+    res = server.make_request("POST", "/rerank", data={
+        "query": query,
+        "documents": [
+            doc1,
+            doc2,
+        ]
+    })
+    assert res.status_code == 200
+    assert res.body['usage']['prompt_tokens'] == res.body['usage']['total_tokens']
+    assert res.body['usage']['prompt_tokens'] == n_tokens
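+    # note: n_tokens covers both (query, document) pairs, since each rerank result
+    # reports its own tokens_evaluated and the server sums them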
static json format_embeddings_response_oaicompat(const json & request, const json & embeddings) {
    json data = json::array();
+    int32_t n_tokens = 0;
    int i = 0;
    for (const auto & elem : embeddings) {
        data.push_back(json{
            {"index", i++},
            {"object", "embedding"}
        });
+
+        n_tokens += json_value(elem, "tokens_evaluated", 0);
    }
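+    // n_tokens now holds the sum of tokens_evaluated over all returned embeddings;
+    // it is reported below as both prompt_tokens and total_tokens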
    json res = json {
        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
        {"object", "list"},
-        {"usage", json { // TODO: fill
-            {"prompt_tokens", 0},
-            {"total_tokens", 0}
+        {"usage", json {
+            {"prompt_tokens", n_tokens},
+            {"total_tokens", n_tokens}
        }},
        {"data", data}
    };
static json format_response_rerank(const json & request, const json & ranks) {
    json data = json::array();
+    int32_t n_tokens = 0;
    int i = 0;
    for (const auto & rank : ranks) {
        data.push_back(json{
            {"index", i++},
            {"relevance_score", json_value(rank, "score", 0.0)},
        });
+
+        n_tokens += json_value(rank, "tokens_evaluated", 0);
    }
    json res = json {
        {"model", json_value(request, "model", std::string(DEFAULT_OAICOMPAT_MODEL))},
        {"object", "list"},
-        {"usage", json { // TODO: fill
-            {"prompt_tokens", 0},
-            {"total_tokens", 0}
+        {"usage", json {
+            {"prompt_tokens", n_tokens},
+            {"total_tokens", n_tokens}
        }},
        {"results", data}
    };