std::vector<llama_token> tokens;
if (body.count("content") != 0) {
- tokens = ctx_server.tokenize(body["content"], false);
+ const bool add_special = json_value(body, "add_special", false);
+ tokens = ctx_server.tokenize(body["content"], add_special);
}
const json data = format_tokenizer_response(tokens);
return res.set_content(data.dump(), "application/json; charset=utf-8");
And a model file tinyllamas/stories260K.gguf from HF repo ggml-org/models
And a model file test-model.gguf
And a model alias tinyllama-2
+ And BOS token is 1
And 42 as server seed
# KV Cache corresponds to the total amount of tokens
# that can be stored across all independent sequences: #4130
"""
What is the capital of France ?
"""
- Then tokens can be detokenize
+ Then tokens can be detokenized
+ And tokens do not begin with BOS
+
+ Scenario: Tokenize w/ BOS
+ Given adding special tokens
+ When tokenizing:
+ """
+ What is the capital of Germany?
+ """
+ Then tokens begin with BOS
+ Given first token is removed
+ Then tokens can be detokenized
Scenario: Models available
Given available models
context.seed.append(seed)
+@step('BOS token is {bos:d}')
+def step_bos_token(context, bos):
+    # Record the model's BOS token id so later steps can assert whether
+    # the /tokenize output begins with it.
+    context.bos = bos
+
+
@step('a prefix prompt')
def step_prompt_prefix(context):
+    # Store the step's attached text as the prompt prefix
+    # (context_text presumably reads the scenario's docstring, as in
+    # step_tokenize — confirm against its definition).
    context.prompt_prefix = context_text(context)
    assert_embeddings(context.tasks_result.pop().pop())
+@step('adding special tokens')
+def step_tokenize_set_add_special(context):
+    # Opt in to sending "add_special": true on the next /tokenize request,
+    # asking the server to prepend special tokens (e.g. BOS).
+    context.tokenize_add_special = True
+
+
@step('tokenizing')
@async_run_until_complete
async def step_tokenize(context):
    context.tokenized_text = context_text(context)
    async with aiohttp.ClientSession() as session:
+        # Build the /tokenize payload. "add_special" is sent only when a
+        # prior step set it, so scenarios that omit it exercise the
+        # server-side default.
+        tokenize_args = {
+            "content": context.tokenized_text,
+        }
+        if getattr(context, 'tokenize_add_special', None) is not None:
+            tokenize_args['add_special'] = context.tokenize_add_special
        async with session.post(f'{context.base_url}/tokenize',
-                                json={
-                                    "content": context.tokenized_text,
-                                }) as response:
+                                json=tokenize_args) as response:
            assert response.status == 200
            tokenize_json = await response.json()
            context.tokens = tokenize_json['tokens']
-@step('tokens can be detokenize')
+@step('tokens can be detokenized')
@async_run_until_complete
async def step_detokenize(context):
assert len(context.tokens) > 0
assert context.tokenized_text == detokenize_json['content'].strip()
+@step('tokens begin with BOS')
+def step_strings_for_tokenization(context):
+ assert context.tokens[0] == context.bos
+
+
+@step('tokens do not begin with BOS')
+def step_strings_for_tokenization(context):
+ assert context.tokens[0] != context.bos
+
+
+@step('first token is removed')
+def step_strings_for_tokenization(context):
+ context.tokens = context.tokens[1:]
+
+
@step('an OPTIONS request is sent from {origin}')
@async_run_until_complete
async def step_options_request(context, origin):