if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
run: |
cd examples/server/tests
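+ # keep the default console encoding but replace unencodable characters, so printing non-ASCII token pieces does not crash the Windows runner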
+ $env:PYTHONIOENCODING = ":replace"
behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
- name: Slow tests
*Options:*
- `content`: Set the text to tokenize.
+ `content`: (Required) The text to tokenize.
- `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
+ `add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted. Default: `false`
+
+ `with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs. Default: `false`
+
+**Response:**
+
+Returns a JSON object with a `tokens` field containing the tokenization result. Depending on the `with_pieces` parameter, the `tokens` array contains either plain token IDs or objects with `id` and `piece` fields. The `piece` field is a string if the piece is valid UTF-8, or a list of byte values otherwise.
+
+If `with_pieces` is `false`:
+```json
+{
+ "tokens": [123, 456, 789]
+}
+```
+
+If `with_pieces` is `true`:
+```json
+{
+ "tokens": [
+ {"id": 123, "piece": "Hello"},
+ {"id": 456, "piece": " world"},
+ {"id": 789, "piece": "!"}
+ ]
+}
+```
+
+With the input `'á'` (UTF-8 hex: `C3 A1`) on tinyllama/stories260k, where the character is split across two byte-level tokens, each `piece` is returned as a byte array:
+```json
+{
+ "tokens": [
+ {"id": 198, "piece": [195]}, // hex C3
+ {"id": 164, "piece": [161]} // hex A1
+ ]
+}
+```
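+
+For illustration, a minimal client sketch that reassembles the original text from the returned pieces (assuming a server listening on `http://localhost:8080` and the `requests` package; both are assumptions, not part of this change's code):
+
+```python
+# Hypothetical client sketch: reassemble text from /tokenize pieces.
+# Assumes a llama.cpp server on localhost:8080 and the `requests` package.
+import requests
+
+resp = requests.post(
+    "http://localhost:8080/tokenize",
+    json={"content": "What is the capital of Germany? 媽", "with_pieces": True},
+)
+resp.raise_for_status()
+
+buf = bytearray()
+for token in resp.json()["tokens"]:
+    piece = token["piece"]
+    # a piece is either a valid UTF-8 string or a list of raw byte values
+    buf += piece.encode("utf-8") if isinstance(piece, str) else bytes(piece)
+
+print(buf.decode("utf-8"))  # byte pieces from adjacent tokens recombine here
+```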
### POST `/detokenize`: Convert tokens to text
const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
const json body = json::parse(req.body);
- std::vector<llama_token> tokens;
+ json tokens_response = json::array();
if (body.count("content") != 0) {
const bool add_special = json_value(body, "add_special", false);
- tokens = ctx_server.tokenize(body.at("content"), add_special);
+ const bool with_pieces = json_value(body, "with_pieces", false);
+ std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
+
+ if (with_pieces) {
+ for (const auto& token : tokens) {
+ std::string piece = llama_token_to_piece(ctx_server.ctx, token);
+ json piece_json;
+
+ // Check if the piece is valid UTF-8
+ if (is_valid_utf8(piece)) {
+ piece_json = piece;
+ } else {
+ // If not valid UTF-8, store as array of byte values
+ piece_json = json::array();
+ for (unsigned char c : piece) {
+ piece_json.push_back(static_cast<int>(c));
+ }
+ }
+
+ tokens_response.push_back({
+ {"id", token},
+ {"piece", piece_json}
+ });
+ }
+ } else {
+ tokens_response = tokens;
+ }
}
- const json data = format_tokenizer_response(tokens);
+
+ const json data = format_tokenizer_response(tokens_response);
res_ok(res, data);
};
Given first token is removed
Then tokens can be detokenized
+ Scenario: Tokenize with pieces
+ When tokenizing with pieces:
+ """
+ What is the capital of Germany?
+ 媽
+ """
+ Then tokens are given with pieces
+
Scenario: Models available
Given available models
Then 1 models are supported
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
import asyncio
import json
import os
context.tokenize_add_special = True
+@step("tokenizing with pieces")
+@async_run_until_complete
+async def step_tokenize_with_pieces(context):
+ context.tokenized_text = context_text(context)
+ async with aiohttp.ClientSession() as session:
+ tokenize_args = {"content": context.tokenized_text, "with_pieces": True}
+ if getattr(context, "tokenize_add_special", None) is not None:
+ tokenize_args["add_special"] = context.tokenize_add_special
+
+ async with session.post(
+ f"{context.base_url}/tokenize", json=tokenize_args
+ ) as response:
+ assert response.status == 200
+ tokenize_json = await response.json()
+ context.tokens_with_pieces = tokenize_json["tokens"]
+
+
+@step("tokens are given with pieces")
+@async_run_until_complete
+async def step_tokens_given_with_pieces(context):
+ # Verify that the response is non-empty and that each token carries both an ID and a piece
+ assert context.tokens_with_pieces
+ assert all(
+ "id" in token and "piece" in token for token in context.tokens_with_pieces
+ )
+
+
@step('tokenizing')
@async_run_until_complete
async def step_tokenize(context):
return res;
}
-static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
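+// Byte-pattern check only: malformed lead/continuation sequences are rejected,
+// but overlong encodings and surrogate ranges are not.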
+static bool is_valid_utf8(const std::string & str) {
+ const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
+ const unsigned char* end = bytes + str.length();
+
+ while (bytes < end) {
+ if (*bytes <= 0x7F) {
+ // 1-byte sequence (0xxxxxxx)
+ bytes++;
+ } else if ((*bytes & 0xE0) == 0xC0) {
+ // 2-byte sequence (110xxxxx 10xxxxxx)
+ if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
+ return false;
+ bytes += 2;
+ } else if ((*bytes & 0xF0) == 0xE0) {
+ // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
+ if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
+ return false;
+ bytes += 3;
+ } else if ((*bytes & 0xF8) == 0xF0) {
+ // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+ if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
+ (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
+ return false;
+ bytes += 4;
+ } else {
+ // Invalid UTF-8 lead byte
+ return false;
+ }
+ }
+
+ return true;
+}
+
+static json format_tokenizer_response(const json & tokens) {
return json {
{"tokens", tokens}
};
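
For intuition, the same decision sketched in Python: a piece is returned as a JSON string exactly when strict UTF-8 decoding succeeds. Note that Python's decoder is slightly stricter than the byte-pattern check above, since it also rejects overlong encodings and surrogate code points:

```python
def is_valid_utf8(data: bytes) -> bool:
    # sketch of the server-side check: pieces that decode cleanly are sent
    # as strings, anything else as a list of byte values
    try:
        data.decode("utf-8")
        return True
    except UnicodeDecodeError:
        return False

assert is_valid_utf8("Hello".encode("utf-8"))
assert not is_valid_utf8(bytes([0xC3]))  # lone lead byte, as in the 'á' example
```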