server : Add option to return token pieces in /tokenize endpoint (#9108)

author Mathijs Henquet <redacted>

Thu, 12 Sep 2024 20:30:11 +0000 (22:30 +0200)

committer GitHub <redacted>

Thu, 12 Sep 2024 20:30:11 +0000 (22:30 +0200)
author Mathijs Henquet <redacted>
Thu, 12 Sep 2024 20:30:11 +0000 (22:30 +0200)
committer GitHub <redacted>
Thu, 12 Sep 2024 20:30:11 +0000 (22:30 +0200)
diff --git a/.github/workflows/server.yml b/.github/workflows/server.yml

index 99feb28f2a54554a2d22028ae98f1c0dcee9b2ed..29f8fd444311952f84e90791e216a560511d8643 100644 (file)
--- a/.github/workflows/server.yml
+++ b/.github/workflows/server.yml
@@ -173,6 +173,7 @@ jobs:
          if: ${{ !matrix.disabled_on_pr || !github.event.pull_request }}
          run: |
            cd examples/server/tests
+          $env:PYTHONIOENCODING = ":replace"
            behave.exe --summary --stop --no-capture --exclude 'issues|wrong_usages|passkey' --tags llama.cpp
  
        - name: Slow tests
diff --git a/examples/server/README.md b/examples/server/README.md

index 79196e9c19451cf3e901391aee2f8d377c2a78db..44a73ca0a10c20ce31022e4c7a29ed62121b6331 100644 (file)
--- a/examples/server/README.md
+++ b/examples/server/README.md
@@ -407,9 +407,44 @@ Notice that each `probs` is an array of length `n_probs`.
  
      *Options:*
  
-    `content`: Set the text to tokenize.
+    `content`: (Required) The text to tokenize.
  
-    `add_special`: Boolean indicating if special tokens, i.e. `BOS`, should be inserted.  Default: `false`
+    `add_special`: (Optional) Boolean indicating if special tokens, i.e. `BOS`, should be inserted.  Default: `false`
+
+    `with_pieces`: (Optional) Boolean indicating whether to return token pieces along with IDs.  Default: `false`
+
+**Response:**
+
+Returns a JSON object with a `tokens` field containing the tokenization result. The `tokens` array contains either just token IDs or objects with `id` and `piece` fields, depending on the `with_pieces` parameter. The piece field is a string if the piece is valid unicode or a list of bytes otherwise.
+
+
+If `with_pieces` is `false`:
+```json
+{
+  "tokens": [123, 456, 789]
+}
+```
+
+If `with_pieces` is `true`:
+```json
+{
+  "tokens": [
+    {"id": 123, "piece": "Hello"},
+    {"id": 456, "piece": " world"},
+    {"id": 789, "piece": "!"}
+  ]
+}
+```
+
+With input 'á' (utf8 hex: C3 A1) on tinyllama/stories260k
+```json
+{
+  "tokens": [
+    {"id": 198, "piece": [195]}, // hex C3
+    {"id": 164, "piece": [161]} // hex A1
+  ]
+}
+```
  
  ### POST `/detokenize`: Convert tokens to text
  
diff --git a/examples/server/server.cpp b/examples/server/server.cpp

index 5b263f646979bba453406ea3c900bb9def30be42..5e4dffadf39c5ca7faff0f456bbefa46fe019e88 100644 (file)
--- a/examples/server/server.cpp
+++ b/examples/server/server.cpp
@@ -3013,12 +3013,39 @@ int main(int argc, char ** argv) {
      const auto handle_tokenize = [&ctx_server, &res_ok](const httplib::Request & req, httplib::Response & res) {
          const json body = json::parse(req.body);
  
-        std::vector<llama_token> tokens;
+        json tokens_response = json::array();
          if (body.count("content") != 0) {
              const bool add_special = json_value(body, "add_special", false);
-            tokens = ctx_server.tokenize(body.at("content"), add_special);
+            const bool with_pieces = json_value(body, "with_pieces", false);
+            std::vector<llama_token> tokens = ctx_server.tokenize(body.at("content"), add_special);
+
+            if (with_pieces) {
+                for (const auto& token : tokens) {
+                    std::string piece = llama_token_to_piece(ctx_server.ctx, token);
+                    json piece_json;
+
+                    // Check if the piece is valid UTF-8
+                    if (is_valid_utf8(piece)) {
+                        piece_json = piece;
+                    } else {
+                        // If not valid UTF-8, store as array of byte values
+                        piece_json = json::array();
+                        for (unsigned char c : piece) {
+                            piece_json.push_back(static_cast<int>(c));
+                        }
+                    }
+
+                    tokens_response.push_back({
+                        {"id", token},
+                        {"piece", piece_json}
+                    });
+                }
+            } else {
+                tokens_response = tokens;
+            }
          }
-        const json data = format_tokenizer_response(tokens);
+
+        const json data = format_tokenizer_response(tokens_response);
          res_ok(res, data);
      };
  
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature

index b55971454afc3674e0a65c6eff348ee9736a3604..15e24c624af37f52419c2ca712c7a8ca71864274 100644 (file)
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -105,6 +105,14 @@ Feature: llama.cpp server
      Given first token is removed
      Then  tokens can be detokenized
  
+  Scenario: Tokenize with pieces
+    When  tokenizing with pieces:
+    """
+    What is the capital of Germany?
+    媽
+    """
+    Then  tokens are given with pieces
+
    Scenario: Models available
      Given available models
      Then  1 models are supported
diff --git a/examples/server/tests/features/steps/steps.py b/examples/server/tests/features/steps/steps.py

index 65b71a8e85db12430bdfad860bca4a401073c5b0..11587dd64075a46c9e700b986a1c54633e2eefe3 100644 (file)
--- a/examples/server/tests/features/steps/steps.py
+++ b/examples/server/tests/features/steps/steps.py
@@ -1,3 +1,6 @@
+#!/usr/bin/env python3
+# -*- coding: utf-8 -*-
+
  import asyncio
  import json
  import os
@@ -697,6 +700,32 @@ def step_tokenize_set_add_special(context):
      context.tokenize_add_special = True
  
  
+@step("tokenizing with pieces")
+@async_run_until_complete
+async def step_tokenize_with_pieces(context):
+    context.tokenized_text = context_text(context)
+    async with aiohttp.ClientSession() as session:
+        tokenize_args = {"content": context.tokenized_text, "with_pieces": True}
+        if getattr(context, "tokenize_add_special", None) is not None:
+            tokenize_args["add_special"] = context.tokenize_add_special
+
+        async with session.post(
+            f"{context.base_url}/tokenize", json=tokenize_args
+        ) as response:
+            assert response.status == 200
+            tokenize_json = await response.json()
+            context.tokens_with_pieces = tokenize_json["tokens"]
+
+
+@step("tokens are given with pieces")
+@async_run_until_complete
+async def step_tokenize_with_pieces(context):
+    # Verify that the response contains both token IDs and pieces
+    assert all(
+        "id" in token and "piece" in token for token in context.tokens_with_pieces
+    )
+
+
  @step('tokenizing')
  @async_run_until_complete
  async def step_tokenize(context):
diff --git a/examples/server/utils.hpp b/examples/server/utils.hpp

index edfce65b634e061ce6faa41e07b232e54891edf2..adb1a1cb968523d1d2d81392a40878addda41d58 100644 (file)
--- a/examples/server/utils.hpp
+++ b/examples/server/utils.hpp
@@ -616,7 +616,40 @@ static json format_embeddings_response_oaicompat(const json & request, const jso
      return res;
  }
  
-static json format_tokenizer_response(const std::vector<llama_token> & tokens) {
+static bool is_valid_utf8(const std::string & str) {
+    const unsigned char* bytes = reinterpret_cast<const unsigned char*>(str.data());
+    const unsigned char* end = bytes + str.length();
+
+    while (bytes < end) {
+        if (*bytes <= 0x7F) {
+            // 1-byte sequence (0xxxxxxx)
+            bytes++;
+        } else if ((*bytes & 0xE0) == 0xC0) {
+            // 2-byte sequence (110xxxxx 10xxxxxx)
+            if (end - bytes < 2 || (bytes[1] & 0xC0) != 0x80)
+                return false;
+            bytes += 2;
+        } else if ((*bytes & 0xF0) == 0xE0) {
+            // 3-byte sequence (1110xxxx 10xxxxxx 10xxxxxx)
+            if (end - bytes < 3 || (bytes[1] & 0xC0) != 0x80 || (bytes[2] & 0xC0) != 0x80)
+                return false;
+            bytes += 3;
+        } else if ((*bytes & 0xF8) == 0xF0) {
+            // 4-byte sequence (11110xxx 10xxxxxx 10xxxxxx 10xxxxxx)
+            if (end - bytes < 4 || (bytes[1] & 0xC0) != 0x80 ||
+                (bytes[2] & 0xC0) != 0x80 || (bytes[3] & 0xC0) != 0x80)
+                return false;
+            bytes += 4;
+        } else {
+            // Invalid UTF-8 lead byte
+            return false;
+        }
+    }
+
+    return true;
+}
+
+static json format_tokenizer_response(const json & tokens) {
      return json {
          {"tokens", tokens}
      };
author	Mathijs Henquet <redacted>
	Thu, 12 Sep 2024 20:30:11 +0000 (22:30 +0200)
committer	GitHub <redacted>
	Thu, 12 Sep 2024 20:30:11 +0000 (22:30 +0200)
.github/workflows/server.yml		patch \| blob \| history
examples/server/README.md		patch \| blob \| history
examples/server/server.cpp		patch \| blob \| history
examples/server/tests/features/server.feature		patch \| blob \| history
examples/server/tests/features/steps/steps.py		patch \| blob \| history
examples/server/utils.hpp		patch \| blob \| history