Tokenizer SPM fixes for phi-3 and llama-spm (bugfix) (#7425)

author jaime-m-p <redacted>

Tue, 21 May 2024 12:39:48 +0000 (14:39 +0200)

committer GitHub <redacted>

Tue, 21 May 2024 12:39:48 +0000 (14:39 +0200)
author jaime-m-p <redacted>
Tue, 21 May 2024 12:39:48 +0000 (14:39 +0200)
committer GitHub <redacted>
Tue, 21 May 2024 12:39:48 +0000 (14:39 +0200)
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py

index 8937a4981f446b20f458811f4d2d1936a8d081de..1acf45bf2f48e427afdb789cac5a5e5d1e75ac10 100755 (executable)
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -1749,7 +1749,7 @@ class Phi3MiniModel(Model):
                      token_id = int(token_id)
                      token = foken_data["content"].encode("utf-8")
                      if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
-                        assert(tokens[token_id] == token)
+                        assert tokens[token_id] == token
                      tokens[token_id] = token
                      scores[token_id] = -1000.0
                      toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
@@ -1765,7 +1765,7 @@ class Phi3MiniModel(Model):
                      token_id = int(foken_data["id"])
                      token = foken_data["content"].encode("utf-8")
                      if toktypes[token_id] != SentencePieceTokenTypes.UNKNOWN:
-                        assert(tokens[token_id] == token)
+                        assert tokens[token_id] == token
                      tokens[token_id] = token
                      scores[token_id] = -1000.0
                      toktypes[token_id] = SentencePieceTokenTypes.USER_DEFINED
diff --git a/examples/server/tests/features/server.feature b/examples/server/tests/features/server.feature

index 048cfad06bdb5579f47bdcd05afc6b44769b8dd3..d21c09135243ae42f0bfa519b0a9c11ab761b2a0 100644 (file)
--- a/examples/server/tests/features/server.feature
+++ b/examples/server/tests/features/server.feature
@@ -37,8 +37,8 @@ Feature: llama.cpp server
  
      Examples: Prompts
        | prompt                                                                    | n_predict | re_content                                  | n_prompt | n_predicted | truncated |
-      | I believe the meaning of life is                                          | 8         | (read\|going\|pretty)+                      | 18       | 8           | not       |
-      | Write a joke about AI from a very long prompt which will not be truncated | 256       | (princesses\|everyone\|kids\|Anna\|forest)+ | 45       | 64          | not       |
+      | I believe the meaning of life is                                          | 8         | (read\|going)+                              | 18       | 8           | not       |
+      | Write a joke about AI from a very long prompt which will not be truncated | 256       | (princesses\|everyone\|kids\|Anna\|forest)+ | 46       | 64          | not       |
  
    Scenario: Completion prompt truncated
      Given a prompt:
@@ -67,8 +67,8 @@ Feature: llama.cpp server
  
      Examples: Prompts
        | model        | system_prompt               | user_prompt                          | max_tokens | re_content                        | n_prompt | n_predicted | enable_streaming | truncated |
-      | llama-2      | Book                        | What is the best book                | 8          | (Here\|what)+                     | 76       | 8           | disabled         | not       |
-      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128        | (thanks\|happy\|bird\|fireplace)+ | -1       | 64          | enabled          |           |
+      | llama-2      | Book                        | What is the best book                | 8          | (Here\|what)+                     | 77       | 8           | disabled         | not       |
+      | codellama70b | You are a coding assistant. | Write the fibonacci function in c++. | 128        | (thanks\|happy\|bird\|Annabyear)+ | -1       | 64          | enabled          |           |
  
  
    Scenario Outline: OAI Compatibility w/ response format
@@ -84,7 +84,7 @@ Feature: llama.cpp server
        | response_format                                                     | n_predicted | re_content             |
        | {"type": "json_object", "schema": {"const": "42"}}                  | 5           | "42"                   |
        | {"type": "json_object", "schema": {"items": [{"type": "integer"}]}} | 10          | \[ -300 \]             |
-      | {"type": "json_object"}                                             | 10          | \{ " Saragine.         |
+      | {"type": "json_object"}                                             | 10          | \{ " Jacky.            |
  
  
    Scenario: Tokenize / Detokenize
diff --git a/examples/server/tests/features/slotsave.feature b/examples/server/tests/features/slotsave.feature

index ba4ecb6f53ee2a6bae18db09feee5751ec313c24..1c281c0741afe4e119bdb41acd6767b3e44b40e6 100644 (file)
--- a/examples/server/tests/features/slotsave.feature
+++ b/examples/server/tests/features/slotsave.feature
@@ -26,7 +26,7 @@ Feature: llama.cpp server slot management
      # Since we have cache, this should only process the last tokens
      Given a user prompt "What is the capital of Germany?"
      And   a completion request with no api error
-    Then  24 tokens are predicted matching (Thank|special|Lily)
+    Then  24 tokens are predicted matching (Thank|special)
      And   7 prompt tokens are processed
      # Loading the original cache into slot 0,
      # we should only be processing 1 prompt token and get the same output
@@ -41,7 +41,7 @@ Feature: llama.cpp server slot management
      Given a user prompt "What is the capital of Germany?"
      And   using slot id 1
      And   a completion request with no api error
-    Then  24 tokens are predicted matching (Thank|special|Lily)
+    Then  24 tokens are predicted matching (Thank|special)
      And   1 prompt tokens are processed
  
    Scenario: Erase Slot
diff --git a/llama.cpp b/llama.cpp

index e2ebe1752810502fffffcedbff286c8a18dbfc52..d26fe559a2051623ed7f4c5ba180890dab08f3c8 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -12498,15 +12498,16 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
                  // tokenizer.encode('', add_special_tokens=True)  returns [1]
                  // tokenizer.encode('', add_special_tokens=False) returns []
  
+                static const bool rtrim = true;  //TODO: as param
+                bool is_prev_special = false;
+                bool special_token_rtrim = false;
+
                  if (add_special && vocab.special_add_bos != 0) {
                      GGML_ASSERT(vocab.special_bos_id != -1);
                      output.push_back(vocab.special_bos_id);
+                    is_prev_special = true;
                  }
  
-                static const bool rtrim = true;  //TODO: as param
-                bool is_prev_special = false;
-                bool special_token_rtrim = false;
-
                  for (const auto & fragment : fragment_buffer) {
                      if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                          // without adding this leading whitespace, we do not get the same results as the original tokenizer
diff --git a/tests/test-tokenizer-random.py b/tests/test-tokenizer-random.py

index 1166ac1e43bdac8c9ca5f50008cd49af8ad76329..7e1b656e5f5fc3834505ebae64d5cb4af70cfa94 100644 (file)
--- a/tests/test-tokenizer-random.py
+++ b/tests/test-tokenizer-random.py
@@ -154,19 +154,22 @@ def generator_custom_text_edge_cases() -> Iterator[str]:
          '\uFEFF//',   # unicode_ranges_control, 0xFEFF (BOM)
          'Cửa Việt',   # llama-3, ignore_merges = true
          '<s>a',       # Phi-3 fail
-        '<unk><|endoftext|><s>'  # Phi-3 fail
+        '<unk><|endoftext|><s>',  # Phi-3 fail
          'a\na',       # TODO: Bert fail
      ]
  
  
-def generator_random_special_tokens(special_tokens:list[str], iterations=100) -> Iterator[str]:
-    special_tokens = set(special_tokens)
+def generator_random_special_tokens(tokenizer, iterations=100) -> Iterator[str]:
+    special_tokens = set(tokenizer.all_special_tokens)
      special_tokens.update([" ", "\n", "\t", "-", "!", "one", "1", "<s>", "</s>"])
      special_tokens = list(sorted(special_tokens))
      rand = random.Random()
      for m in range(iterations):
          rand.seed(m)
          words = rand.choices(special_tokens, k=500)
+        if tokenizer.add_bos_token:  # skip spam warning of double BOS
+            while words and words[0] == tokenizer.bos_token:
+                words.pop(0)
          yield "".join(words)
  
  
@@ -290,18 +293,19 @@ def main(argv: list[str] = None):
      model = LibLlamaModel(LibLlama(), args.vocab_file, mparams=dict(vocab_only=True), cparams=dict(n_ctx=4096))
      tokenizer = AutoTokenizer.from_pretrained(args.dir_tokenizer)
  
-    def func_tokenize2(text: str):
-        return tokenizer.encode(text, add_special_tokens=False)
-
-    parse_special = all(len(func_tokenize2(t)) == 1 for t in tokenizer.all_special_tokens)
+    tokenizer.add_bos_token = getattr(tokenizer, "add_bos_token", True)
+    tokenizer.add_eos_token = getattr(tokenizer, "add_eos_token", False)
  
      def func_tokenize1(text: str):
-        return model.tokenize(text, add_special=False, parse_special=parse_special)
+        return model.tokenize(text, add_special=True, parse_special=True)
+
+    def func_tokenize2(text: str):
+        return tokenizer.encode(text, add_special_tokens=True)
  
      vocab = list(sorted(tokenizer.batch_decode(list(tokenizer.get_vocab().values()), skip_special_tokens=True)))
      test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text())
      test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_custom_text_edge_cases())
-    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer.all_special_tokens, 10_000))
+    test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_special_tokens(tokenizer, 10_000))
      test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_vocab_words(vocab))
      test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_chars(10_000))
      test_compare_tokenizer(func_tokenize1, func_tokenize2, generator_random_vocab_chars(vocab, 10_000))
author	jaime-m-p <redacted>
	Tue, 21 May 2024 12:39:48 +0000 (14:39 +0200)
committer	GitHub <redacted>
	Tue, 21 May 2024 12:39:48 +0000 (14:39 +0200)
convert-hf-to-gguf.py		patch | blob | history
examples/server/tests/features/server.feature		patch | blob | history
examples/server/tests/features/slotsave.feature		patch | blob | history
llama.cpp		patch | blob | history
tests/test-tokenizer-random.py		patch | blob | history