std::vector<llama_token> tokens;
if (body.count("content") != 0) {
- tokens = ctx_server.tokenize(body["content"], false);
+ const bool add_special = json_value(body, "add_special", false);
+ tokens = ctx_server.tokenize(body["content"], add_special);
}
const json data = format_tokenizer_response(tokens);
return res.set_content(data.dump(), "application/json; charset=utf-8");
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a model file test-model.gguf
And a model alias tinyllama-2
+ And BOS token is 1
And 42 as server seed
# KV Cache corresponds to the total amount of tokens
# that can be stored across all independent sequences: #4130
"""
What is the capital of France ?
"""
- Then tokens can be detokenize
+ Then tokens can be detokenized
+ And tokens do not begin with BOS
+
+ Scenario: Tokenize w/ BOS
+ Given adding special tokens
+ When tokenizing:
+ """
+ What is the capital of Germany?
+ """
+ Then tokens begin with BOS
+ Given first token is removed
+ Then tokens can be detokenized
Scenario: Models available
Given available models
context.seed.append(seed)
+@step('BOS token is {bos:d}')
+def step_bos_token(context, bos):
+    # Record the model's BOS token id so later steps can assert whether
+    # the /tokenize output begins with it.
+    context.bos = bos
+
+
@step('a prefix prompt')
def step_prompt_prefix(context):
+    # Store the step's attached text as the prompt prefix
+    # (context_text presumably reads the scenario's docstring, as in
+    # step_tokenize — confirm against its definition).
    context.prompt_prefix = context_text(context)
    assert_embeddings(context.tasks_result.pop().pop())
+@step('adding special tokens')
+def step_tokenize_set_add_special(context):
+    # Opt in to sending "add_special": true on the next /tokenize request,
+    # asking the server to prepend special tokens (e.g. BOS).
+    context.tokenize_add_special = True
+
+
@step('tokenizing')
@async_run_until_complete
async def step_tokenize(context):
    context.tokenized_text = context_text(context)
    async with aiohttp.ClientSession() as session:
+        # Build the /tokenize payload. "add_special" is sent only when a
+        # prior step set it, so scenarios that omit it exercise the
+        # server-side default.
+        tokenize_args = {
+            "content": context.tokenized_text,
+        }
+        if getattr(context, 'tokenize_add_special', None) is not None:
+            tokenize_args['add_special'] = context.tokenize_add_special
        async with session.post(f'{context.base_url}/tokenize',
-                                json={
-                                    "content": context.tokenized_text,
-                                }) as response:
+                                json=tokenize_args) as response:
            assert response.status == 200
            tokenize_json = await response.json()
            context.tokens = tokenize_json['tokens']
-@step('tokens can be detokenize')
+@step('tokens can be detokenized')
@async_run_until_complete
async def step_detokenize(context):
assert len(context.tokens) > 0
assert context.tokenized_text == detokenize_json['content'].strip()
+@step('tokens begin with BOS')
+def step_strings_for_tokenization(context):
+ assert context.tokens[0] == context.bos
+
+
+@step('tokens do not begin with BOS')
+def step_strings_for_tokenization(context):
+ assert context.tokens[0] != context.bos
+
+
+@step('first token is removed')
+def step_strings_for_tokenization(context):
+ context.tokens = context.tokens[1:]
+
+
@step('an OPTIONS request is sent from {origin}')
@async_run_until_complete
async def step_options_request(context, origin):