git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
Tokenizer SPM fixes for phi-3 and llama-spm (#7375)
author      jaime-m-p <redacted>
Mon, 20 May 2024 18:15:57 +0000 (20:15 +0200)
committer   GitHub <redacted>
Mon, 20 May 2024 18:15:57 +0000 (20:15 +0200)
* Update brute force test: special tokens
* Fix added tokens
  - Try to read 'added_tokens.json'.
  - Try to read 'tokenizer_config.json'.
  - Try to read 'tokenizer.json'.
* Fix special tokens rtrim

Co-authored-by: Georgi Gerganov <redacted>
* server : fix test regexes

convert-hf-to-gguf.py
examples/server/tests/features/server.feature
examples/server/tests/features/slotsave.feature
llama.cpp
tests/test-tokenizer-random.py

index d534b5163bbfd057c14d1f515ffcae9fa080f8a8..8937a4981f446b20f458811f4d2d1936a8d081de 100755 (executable)
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1740,6 +1740,38 @@ class Phi3MiniModel(Model):
                     scores[token_id] = -1000.0
                     toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
 
+        tokenizer_config_file = self.dir_model / 'tokenizer_config.json'
+        if tokenizer_config_file.is_file():
+            with open(tokenizer_config_file, "r", encoding="utf-8") as f:
+                tokenizer_config_json = json.load(f)
+                added_tokens_decoder = tokenizer_config_json.get("added_tokens_decoder", {})
+                for token_id, foken_data in added_tokens_decoder.items():
+                    token_id = int(token_id)
+                    token = foken_data["content"].encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
+        tokenizer_file = self.dir_model / 'tokenizer.json'
+        if tokenizer_file.is_file():
+            with open(tokenizer_file, "r", encoding="utf-8") as f:
+                tokenizer_json = json.load(f)
+                added_tokens = tokenizer_json.get("added_tokens", [])
+                for foken_data in added_tokens:
+                    token_id = int(foken_data["id"])
+                    token = foken_data["content"].encode("utf-8")
+                    if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
+                        assert(tokens[token_id] == token)
+                    tokens[token_id] = token
+                    scores[token_id] = -1000.0
+                    toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
+                    if foken_data.get("special"):
+                        toktypes[token_id] = SentencePieceTokenTypes.CONTROL
+
         self.gguf_writer.add_tokenizer_model("llama")
         self.gguf_writer.add_tokenizer_pre("default")
         self.gguf_writer.add_token_list(tokens)
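
The two loops added above consume standard Hugging Face tokenizer metadata: tokenizer_config.json carries an added_tokens_decoder map keyed by token id, and tokenizer.json carries an added_tokens list; each entry provides a content string and a special flag. A minimal standalone sketch of the same override logic (the JSON shapes in the comments and the Phi-3-style ids are illustrative, and the token-type codes are local to the sketch):

    import json
    from pathlib import Path

    CONTROL, USER_DEFINED = 3, 4   # token-type codes, mirroring the enum names used above

    def apply_added_tokens(dir_model: Path, tokens: list, scores: list, toktypes: list) -> None:
        # tokenizer_config.json: {"added_tokens_decoder": {"32000": {"content": "<|endoftext|>", "special": true}, ...}}
        cfg_file = dir_model / "tokenizer_config.json"
        if cfg_file.is_file():
            cfg = json.loads(cfg_file.read_text(encoding="utf-8"))
            for token_id, data in cfg.get("added_tokens_decoder", {}).items():
                _override(int(token_id), data, tokens, scores, toktypes)

        # tokenizer.json: {"added_tokens": [{"id": 32000, "content": "<|endoftext|>", "special": true}, ...]}
        tok_file = dir_model / "tokenizer.json"
        if tok_file.is_file():
            tok = json.loads(tok_file.read_text(encoding="utf-8"))
            for data in tok.get("added_tokens", []):
                _override(int(data["id"]), data, tokens, scores, toktypes)

    def _override(token_id: int, data: dict, tokens: list, scores: list, toktypes: list) -> None:
        # overwrite the placeholder slot reserved when the SentencePiece model was read
        tokens[token_id]   = data["content"].encode("utf-8")
        scores[token_id]   = -1000.0
        toktypes[token_id] = CONTROL if data.get("special") else USER_DEFINED
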
index d21c09135243ae42f0bfa519b0a9c11ab761b2a0..048cfad06bdb5579f47bdcd05afc6b44769b8dd3 100644 (file)
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -37,8 +37,8 @@ Feature: llama.cpp server
 
     Examples: Prompts
       | prompt                                                                    | n_predict | re_content                                  | n_prompt | n_predicted | truncated |
-      | I believe the meaning of life is                                          | 8         | (read\|going)+                              | 18       | 8           | not       |
-      | Write a joke about AI from a very long prompt which will not be truncated | 256       | (princesses\|everyone\|kids\|Anna\|forest)+ | 46       | 64          | not       |
+      | I believe the meaning of life is                                          | 8         | (read\|going\|pretty)+                      | 18       | 8           | not       |
+      | Write a joke about AI from a very long prompt which will not be truncated | 256       | (princesses\|everyone\|kids\|Anna\|forest)+ | 45       | 64          | not       |
 
   Scenario: Completion prompt truncated
     Given a prompt:
@@ -67,8 +67,8 @@ Feature: llama.cpp server
 
     Examples: Prompts
       | model        | system_prompt               | user_prompt                          | max_tokens | re_content                        | n_prompt | n_predicted | enable_streaming | truncated |
-      | llama-2      | Book                        | What is the best book                | 8          | (Here\|what)+                     | 77       | 8           | disabled         | not       |
-      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128        | (thanks\|happy\|bird\|Annabyear)+ | -1       | 64          | enabled          |           |
+      | llama-2      | Book                        | What is the best book                | 8          | (Here\|what)+                     | 76       | 8           | disabled         | not       |
+      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128        | (thanks\|happy\|bird\|fireplace)+ | -1       | 64          | enabled          |           |
 
 
   Scenario Outline: OAI Compatibility w/ response format
@@ -84,7 +84,7 @@ Feature: llama.cpp server
       | response_format                                                     | n_predicted | re_content             |
       | {"type": "json_object", "schema": {"const": "42"}}                  | 5           | "42"                   |
       | {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10          | \[ -300 \]             |
-      | {"type": "json_object"}                                             | 10          | \{ " Jacky.            |
+      | {"type": "json_object"}                                             | 10          | \{ " Saragine.         |
 
 
   Scenario: Tokenize / Detokenize
index 1c281c0741afe4e119bdb41acd6767b3e44b40e6..ba4ecb6f53ee2a6bae18db09feee5751ec313c24 100644 (file)
--- a/examples/server/tests/features/slotsave.feature
+++ b/examples/server/tests/features/slotsave.feature
@@ -26,7 +26,7 @@ Feature: llama.cpp server slot management
     # Since we have cache, this should only process the last tokens
     Given a user prompt "What is the capital of Germany?"
     And   a completion request with no api error
-    Then  24 tokens are predicted matching (Thank|special)
+    Then  24 tokens are predicted matching (Thank|special|Lily)
     And   7 prompt tokens are processed
     # Loading the original cache into slot 0,
     # we should only be processing 1 prompt token and get the same output
@@ -41,7 +41,7 @@ Feature: llama.cpp server slot management
     Given a user prompt "What is the capital of Germany?"
     And   using slot id 1
     And   a completion request with no api error
-    Then  24 tokens are predicted matching (Thank|special)
+    Then  24 tokens are predicted matching (Thank|special|Lily)
     And   1 prompt tokens are processed
 
   Scenario: Erase Slot
index 863961f157e81d805813c844121e7907374c0bb1..e2ebe1752810502fffffcedbff286c8a18dbfc52 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -4553,7 +4553,8 @@ static void llm_load_vocab(
                         (t.first == "<|eot_id|>" ||
                          t.first == "<|im_end|>" ||
                          t.first == "<|end|>" ||
-                         t.first == "<end_of_turn>"
+                         t.first == "<end_of_turn>" ||
+                         t.first == "<|endoftext|>"
                         )
                    ) {
                     vocab.special_eot_id = t.second;
@@ -12502,6 +12503,10 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                     output.push_back(vocab.special_bos_id);
                 }
 
+                static const bool rtrim = true;  //TODO: as param
+                bool is_prev_special = false;
+                bool special_token_rtrim = false;
+
                 for (const auto & fragment : fragment_buffer) {
                     if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                         // without adding this leading whitespace, we do not get the same results as the original tokenizer
@@ -12511,9 +12516,21 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         //  and passing 'add space prefix' as bool argument
                         //
                         auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
-                        if (&fragment == &fragment_buffer.front()) {
-                            if (vocab.add_space_prefix) {
-                                raw_text = " " + raw_text; // prefix with space if the first token is not special
+
+                        if (special_token_rtrim) {
+                            size_t num_whitespaces = 0;
+                            while (isspace(raw_text[num_whitespaces])) {
+                                num_whitespaces++;
+                            }
+                            if (num_whitespaces == raw_text.size()) {
+                                continue; // skip if all whitespaces
+                            }
+                            raw_text = raw_text.substr(num_whitespaces);
+                        }
+
+                        if (vocab.add_space_prefix) {
+                            if (!output.size() || is_prev_special) {  // prefix with space if first token
+                                raw_text = " " + raw_text;
                             }
                         }
 
@@ -12525,6 +12542,12 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                         tokenizer.tokenize(raw_text, output);
                     } else { // if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_TOKEN)
                         output.push_back(fragment.token);
+                        is_prev_special = true;
+                        // phi-3 special tokens without rtrim, works fine for llama-spm too
+                        special_token_rtrim = rtrim
+                            && fragment.token != vocab.special_bos_id
+                            && fragment.token != vocab.special_unk_id
+                            && fragment.token != vocab.special_eos_id;
                     }
                 }
 
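The net effect of the rtrim block above: whitespace that immediately follows a special token (other than BOS/UNK/EOS) is stripped before the SentencePiece tokenizer runs, a fragment that is pure whitespace is dropped entirely, and the add_space_prefix rule now also fires right after a special token instead of only on the very first fragment. A rough Python model of that fragment loop, using token strings instead of ids (a sketch, not a line-by-line port of the C++ above):

    def tokenize_fragments(fragments, add_space_prefix=True, rtrim=True):
        # fragments: ("raw_text", str) and ("token", str) pairs, in the order produced
        # by the special-token partitioning step
        output = []
        is_prev_special = False
        special_token_rtrim = False
        for kind, value in fragments:
            if kind == "raw_text":
                raw_text = value
                if special_token_rtrim:
                    stripped = raw_text.lstrip()
                    if not stripped:
                        continue                    # fragment was all whitespace: drop it
                    raw_text = stripped
                if add_space_prefix and (not output or is_prev_special):
                    raw_text = " " + raw_text       # space prefix on the first piece or right after a special token
                output.append(("piece", raw_text))  # the real code hands raw_text to the SPM tokenizer here
            else:                                   # pre-matched special token, passed through verbatim
                output.append(("token", value))
                is_prev_special = True
                # BOS/UNK/EOS keep their trailing whitespace; all other specials trigger rtrim
                special_token_rtrim = rtrim and value not in ("<s>", "<unk>", "</s>")
        return output

For example, tokenize_fragments([("token", "<|end|>"), ("raw_text", "\n")]) keeps only the special token: the newline fragment is all whitespace and is skipped, which mirrors how the Phi-3 tokenizer defines its special tokens with trailing-whitespace stripping enabled.
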
index d5a6f185fbcd5011bf0de509e2c8a340e4bd4d44..1166ac1e43bdac8c9ca5f50008cd49af8ad76329 100644 (file)
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -153,11 +153,23 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
         'Ⅵ-a',       # unicode_ranges_digit, {0x00002150, 0x0000218F} // Number Forms
         '\uFEFF//',   # unicode_ranges_control, 0xFEFF (BOM)
         'Cửa Việt',   # llama-3, ignore_merges = true
-        '<s>a',       # TODO: Phi-3 fail
+        '<s>a',       # Phi-3 fail
+        '<unk><|endoftext|><s>'  # Phi-3 fail
         'a\na',       # TODO: Bert fail
     ]
 
 
+def generator_random_special_tokens(special_tokens:list[str], iterations=100) -> Iterator[str]:
+    special_tokens = set(special_tokens)
+    special_tokens.update([" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"])
+    special_tokens = list(sorted(special_tokens))
+    rand = random.Random()
+    for m in range(iterations):
+        rand.seed(m)
+        words = rand.choices(special_tokens, k=500)
+        yield "".join(words)
+
+
 def generator_vocab_words(vocab: list[str]) -> Iterator[str]:
     """Brute force check all vocab words"""
     yield from vocab
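
For a quick look at what the new generator feeds into the comparison, the snippet below prints one sample; it reuses generator_random_special_tokens as defined above, and the token list passed in is just an example:

    sample = next(generator_random_special_tokens(["<|end|>", "<|user|>", "<|assistant|>"], iterations=1))
    print(len(sample), sample[:80])   # one long string of 500 randomly chosen special tokens and fillers
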
@@ -289,14 +301,31 @@ def main(argv: list[str] = None):
     vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
+    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer.all_special_tokens, 10_000))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
     test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
-    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 10_000))
+    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_words(vocab, 5_000))
     # test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_bytes(10_000)) # FAIL
 
     model.free()
 
 
 if __name__ == "__main__":
-    main()
+    # main()
+
+    path_tokenizers = "./models/tokenizers/"
+    path_vocab_format = "./models/ggml-vocab-%s.gguf"
+
+    # import os
+    # tokenizers = os.listdir(path_tokenizers)
+    tokenizers = [
+        "llama-spm",   # SPM
+        "phi-3",       # SPM
+    ]
+
+    for tokenizer in tokenizers:
+        print("\n" + "=" * 50 + "\n" + tokenizer + "\n")  # noqa
+        vocab_file = path_vocab_format % tokenizer
+        dir_tokenizer = path_tokenizers + "/" + tokenizer
+        main([vocab_file, dir_tokenizer, "--verbose"])
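
test_compare_tokenizer itself is unchanged by this commit; as called above it takes two str -> list[int] tokenizer callables (llama.cpp and the reference AutoTokenizer) plus a string generator. A simplified sketch of that comparison loop, as an approximation rather than the function's actual implementation:

    def compare_tokenizers(func_tokenize1, func_tokenize2, generator) -> bool:
        for text in generator:
            ids1 = func_tokenize1(text)
            ids2 = func_tokenize2(text)
            if ids1 != ids2:
                # first index where the two id streams disagree (or the shorter length
                # if one sequence is a prefix of the other)
                i = next((k for k, (a, b) in enumerate(zip(ids1, ids2)) if a != b),
                         min(len(ids1), len(ids2)))
                print(f"mismatch at token {i}: {ids1[i:i+3]} vs {ids2[i:i+3]}")
                print(f"  text: {text[:60]!r}")
                return False
        return True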