scores[token_id] = -1000.0
toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+ tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+ if tokenizer_config_file.is_file():
+ with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+ tokenizer_config_json = json.load(f)
+ added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+ for token_id, foken_data in added_tokens_decoder.items():
+ token_id = int(token_id)
+ token = foken_data["content"].encode("utf-8")
+ if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+ assert(tokens[token_id] == token)
+ tokens[token_id] = token
+ scores[token_id] = -1000.0
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+ if foken_data.get("special"):
+ toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+ tokenizer_file = self.dir_model / 'tokenizer.json'
+ if tokenizer_file.is_file():
+ with open(tokenizer_file, "r", encoding="utf-8") as f:
+ tokenizer_json = json.load(f)
+ added_tokens = tokenizer_json.get("added_tokens", [])
+ for foken_data in added_tokens:
+ token_id = int(foken_data["id"])
+ token = foken_data["content"].encode("utf-8")
+ if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+ assert(tokens[token_id] == token)
+ tokens[token_id] = token
+ scores[token_id] = -1000.0
+ toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+ if foken_data.get("special"):
+ toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
self.gguf_writer.add_tokenizer_model("llama")
self.gguf_writer.add_tokenizer_pre("default")
self.gguf_writer.add_token_list(tokens)
Examples: Prompts
| prompt | n_predict | re_content | n_prompt | n_predicted | truncated |
- | I believe the meaning of life is | 8 | (read\|going)+ | 18 | 8 | not |
- | Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids\|Anna\|forest)+ | 46 | 64 | not |
+ | I believe the meaning of life is | 8 | (read\|going\|pretty)+ | 18 | 8 | not |
+ | Write a joke about AI from a very long prompt which will not be truncated | 256 | (princesses\|everyone\|kids\|Anna\|forest)+ | 45 | 64 | not |
Scenario: Completion prompt truncated
Given a prompt:
Examples: Prompts
| model | system_prompt | user_prompt | max_tokens | re_content | n_prompt | n_predicted | enable_streaming | truncated |
- | llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 77 | 8 | disabled | not |
- | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird\|Annabyear)+ | -1 | 64 | enabled | |
+ | llama-2 | Book | What is the best book | 8 | (Here\|what)+ | 76 | 8 | disabled | not |
+ | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128 | (thanks\|happy\|bird\|fireplace)+ | -1 | 64 | enabled | |
Scenario Outline: OAI Compatibility w/ response format
| response_format | n_predicted | re_content |
| {"type": "json_object", "schema": {"const": "42"}} | 5 | "42" |
| {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10 | \[ -300 \] |
- | {"type": "json_object"} | 10 | \{ " Jacky. |
+ | {"type": "json_object"} | 10 | \{ " Saragine. |
Scenario: Tokenize / Detokenize
# Since we have cache, this should only process the last tokens
Given a user prompt "What is the capital of Germany?"
And a completion request with no api error
- Then 24 tokens are predicted matching (Thank|special)
+ Then 24 tokens are predicted matching (Thank|special|Lily)
And 7 prompt tokens are processed
# Loading the original cache into slot 0,
# we should only be processing 1 prompt token and get the same output
Given a user prompt "What is the capital of Germany?"
And using slot id 1
And a completion request with no api error
- Then 24 tokens are predicted matching (Thank|special)
+ Then 24 tokens are predicted matching (Thank|special|Lily)
And 1 prompt tokens are processed
Scenario: Erase Slot
(t.first == "<|eot_id|>" ||
t.first == "<|im_end|>" ||
t.first == "<|end|>" ||
- t.first == "<end_of_turn>"
+ t.first == "<end_of_turn>" ||
+ t.first == "<|endoftext|>"
)
) {
vocab.special_eot_id = t.second;
output.push_back(vocab.special_bos_id);
}
+ static const bool rtrim = true; //TODO: as param
+ bool is_prev_special = false;
+ bool special_token_rtrim = false;
+
for (const auto & fragment : fragment_buffer) {
if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
// without adding this leading whitespace, we do not get the same results as the original tokenizer
// and passing 'add space prefix' as bool argument
//
auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
- if (&fragment == &fragment_buffer.front()) {
- if (vocab.add_space_prefix) {
- raw_text = " " + raw_text; // prefix with space if the first token is not special
+
+ if (special_token_rtrim) {
+ size_t num_whitespaces = 0;
+ while (isspace(raw_text[num_whitespaces])) {
+ num_whitespaces++;
+ }
+ if (num_whitespaces == raw_text.size()) {
+ continue; // skip if all whitespaces
+ }
+ raw_text = raw_text.substr(num_whitespaces);
+ }
+
+ if (vocab.add_space_prefix) {
+ if (!output.size() || is_prev_special) { // prefix with space if first token
+ raw_text = " " + raw_text;
}
}
tokenizer.tokenize(raw_text, output);
} else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
output.push_back(fragment.token);
+ is_prev_special = true;
+ // phi-3 special tokens without rtrim, works fine for llama-spm too
+ special_token_rtrim = rtrim
+ && fragment.token != vocab.special_bos_id
+ && fragment.token != vocab.special_unk_id
+ && fragment.token != vocab.special_eos_id;
}
}
'Ⅵ-a', # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
'\uFEFF//', # unicode_ranges_control, 0xFEFF (BOM)
'Cửa Việt', # llama-3, ignore_merges = true
- '<s>a', # TODO: Phi-3 fail
+ '<s>a', # Phi-3 fail
+ '<unk><|endoftext|><s>', # Phi-3 fail (trailing comma required: without it this literal is concatenated with the next one)
'a\na', # TODO: Bert fail
]
def generator_random_special_tokens(special_tokens: list[str], iterations=100) -> Iterator[str]:
    """Yield random texts assembled from special tokens and a few plain fragments.

    The sampling pool is ``special_tokens`` (deduplicated) plus a fixed set of
    whitespace, punctuation, word/digit and ``<s>``/``</s>`` fragments. The
    pool is sorted and the generator is re-seeded with the iteration index, so
    iteration ``m`` always produces the same text for the same pool.

    Args:
        special_tokens: Special-token strings to sample from. Not modified.
        iterations: Number of texts to generate.

    Yields:
        The concatenation of 500 fragments drawn with replacement from the pool.
    """
    pool = set(special_tokens)
    pool.update([" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"])
    # Set iteration order is not stable between interpreter runs; sorting fixes
    # the order so a given seed always selects the same fragments.
    candidates = sorted(pool)
    rand = random.Random()
    for m in range(iterations):
        rand.seed(m)  # per-iteration seed: any single case can be reproduced by its index
        yield "".join(rand.choices(candidates, k=500))
+
+
def generator_vocab_words(vocab: list[str]) -> Iterator[str]:
    """Exhaustive check: emit each vocabulary entry once, in the given order."""
    for word in vocab:
        yield word
vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
+ test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer.all_special_tokens, 10_000))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
- test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 10_000))
+ test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000))
# test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_bytes(10_000)) # FAIL
model.free()
if __name__ == "__main__":
    # The command-line entry point (a bare main() call) is bypassed here:
    # instead, main() is invoked once per tokenizer listed below.
    path_tokenizers = "./models/tokenizers/"
    path_vocab_format = "./models/ggml-vocab-%s.gguf"

    # Only these tokenizers are exercised. The directory under
    # path_tokenizers could be listed instead to cover all of them.
    tokenizers = [
        "llama-spm",  # SPM
        "phi-3",      # SPM
    ]

    for tokenizer in tokenizers:
        print("\n" + "=" * 50 + "\n" + tokenizer + "\n")  # noqa
        vocab_file = path_vocab_format % tokenizer
        # path_tokenizers already ends with "/"; strip it before joining so the
        # resulting path has a single separator.
        dir_tokenizer = path_tokenizers.rstrip("/") + "/" + tokenizer
        main([vocab_file, dir_tokenizer, "--verbose"])