llama : lookup word in vocab before doing BPE merges (#7193)

author Haoxiang Fei <redacted>

Sat, 11 May 2024 08:12:06 +0000 (16:12 +0800)

committer GitHub <redacted>

Sat, 11 May 2024 08:12:06 +0000 (11:12 +0300)
author Haoxiang Fei <redacted>
Sat, 11 May 2024 08:12:06 +0000 (16:12 +0800)
committer GitHub <redacted>
Sat, 11 May 2024 08:12:06 +0000 (11:12 +0300)
diff --git a/llama.cpp b/llama.cpp

index cdff28cdaa7737b2bda610400d0d8b1ae20f5b27..e91ad7285da9945b484661022b6b086bb5162102 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -12253,13 +12253,14 @@ struct llm_tokenizer_bpe {
  
      void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
          int final_prev_index = -1;
+        bool ignore_merges = false;
  
          std::vector<std::string> word_collection;
          switch (vocab.type) {
              case LLAMA_VOCAB_TYPE_BPE:
                  switch (vocab.type_pre) {
                      case LLAMA_VOCAB_PRE_TYPE_LLAMA3:
-                    case LLAMA_VOCAB_PRE_TYPE_DBRX:
+                        ignore_merges = true;
                          word_collection = unicode_regex_split(text, {
                              // original regex from tokenizer.json
                              //"(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
@@ -12268,6 +12269,12 @@ struct llm_tokenizer_bpe {
                              "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
                          });
                          break;
+                    case LLAMA_VOCAB_PRE_TYPE_DBRX:
+                        word_collection = unicode_regex_split(text, {
+                            // same as llama3
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                        });
+                        break;
                      case LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM:
                          word_collection = unicode_regex_split(text, {
                              "[\r\n]",
@@ -12351,6 +12358,11 @@ struct llm_tokenizer_bpe {
              int index = 0;
              size_t offset = 0;
  
+            if (ignore_merges && vocab.token_to_id.find(word) != vocab.token_to_id.end()) {
+                symbols.emplace_back(llm_symbol{-1, -1, word.c_str(), word.size()});
+                offset = word.size();
+            }
+
              while (offset < word.size()) {
                  llm_symbol sym;
                  size_t char_len = std::min(word.size() - offset, (size_t) ::utf8_len(word[offset]));
diff --git a/models/ggml-vocab-llama-bpe.gguf.inp b/models/ggml-vocab-llama-bpe.gguf.inp

index 0a89107c60d7f6728d9a472948adff5cc6c9ec70..9380bf355202ab4f16b7ad0e5464ba504f098de7 100644 (file)
--- a/models/ggml-vocab-llama-bpe.gguf.inp
+++ b/models/ggml-vocab-llama-bpe.gguf.inp
@@ -104,3 +104,5 @@ __ggml_vocab_test__
       
  🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
  __ggml_vocab_test__
+ Việt
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-llama-bpe.gguf.out b/models/ggml-vocab-llama-bpe.gguf.out

index 1f00e3812e227221b0425e84b27ef0425cb4bab0..1f3607fb6a3785883d5d29b2900b6a361ff5f6d7 100644 (file)
--- a/models/ggml-vocab-llama-bpe.gguf.out
+++ b/models/ggml-vocab-llama-bpe.gguf.out
@@ -41,3 +41,4 @@
   8765 8765 1644
   8765 8765 8765
   198 4815 15073 66597 8004 1602 2355 79772 11187 9468 248 222 320 8416 8 27623 114 102470 9468 234 104 31643 320 36773 100166 98634 8 26602 227 11410 99 247 9468 99 247 220 18 220 1644 220 8765 220 8765 18 220 8765 1644 220 8765 8765 220 8765 8765 18 220 8765 8765 1644 220 18 13 18 220 18 497 18 220 18 1131 18 220 21549 222 98629 241 45358 233 21549 237 45358 224 21549 244 21549 115 21549 253 45358 223 21549 253 21549 95 98629 227 76460 223 949 37046 101067 19000 23182 102301 9263 18136 16 36827 21909 56560 54337 19175 102118 13373 64571 34694 3114 112203 80112 3436 106451 14196 14196 74694 3089 3089 29249 17523 3001 27708 7801 358 3077 1027 364 83 820 568 596 1070 11 364 793 499 2771 30 364 44 539 2771 358 3358 1304 433 11 364 35 499 1093 1063 15600 30 1226 6 43712 264 64966 43
+ 101798
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt

index d409a1d6b42ece8cd803995c99f4260e15909204..766a017524237025b3bddbca48403c2815b93455 100644 (file)
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -92,7 +92,7 @@ target_link_libraries(test-tokenizer-1-bpe PRIVATE common)
  install(TARGETS test-tokenizer-1-bpe RUNTIME)
  
  # TODO: disabled due to slowness
-#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf)
+#llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-llama-bpe ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-llama-bpe.gguf --ignore-merges)
  #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-falcon    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
  #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-aquila    ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
  #llama_test(test-tokenizer-1-bpe NAME test-tokenizer-1-mpt       ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp

index a0e2caf9427eb7c59487c4e1440db5a1f953cb82..209a04ad6f77ad1840d4ad5d87d5ade8a5705b48 100644 (file)
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -13,15 +13,27 @@
  #include <vector>
  
  int main(int argc, char **argv) {
-    if (argc < 2) {
-        fprintf(stderr, "Usage: %s <vocab-file>\n", argv[0]);
+    if (argc < 2 || argc > 3) {
+        fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
          return 1;
      }
  
      const std::string fname = argv[1];
+    bool ignore_merges = false;
+    if (argc == 3) {
+        if (std::strcmp(argv[2], "--ignore-merges") != 0) {
+            fprintf(stderr, "Usage: %s <vocab-file> [--ignore-merges]\n", argv[0]);
+            return 1;
+        }
+        ignore_merges = true;
+    }
  
      fprintf(stderr, "%s : reading vocab from: '%s'\n", __func__, fname.c_str());
  
+    if (ignore_merges) {
+        fprintf(stderr, "%s : ignoring merges for tokens inside vocab\n", __func__);
+    }
+
      llama_model * model;
      llama_context * ctx;
  
@@ -65,7 +77,19 @@ int main(int argc, char **argv) {
          std::string str = llama_detokenize_bpe(ctx, std::vector<int>(1, i));
          try {
              auto cps = unicode_cpts_from_utf8(str);
-            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
+            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false, true);
+            if (ignore_merges && tokens.size() > 1) {
+                fprintf(stderr,
+                        "%s : error: token %d detokenizes to '%s'(%zu) but "
+                        "tokenization of this to multiple tokens: [",
+                        __func__, i, str.c_str(), str.length());
+                fprintf(stderr, "%d", tokens[0]);
+                for (size_t i = 1; i < tokens.size(); i++) {
+                    fprintf(stderr, ", %d", tokens[i]);
+                }
+                fprintf(stderr, "]\n");
+                return 2;
+            }
              std::string check = llama_detokenize_bpe(ctx, tokens);
              if (check != str) {
                  fprintf(stderr, "%s : error: token %d detokenizes to '%s'(%zu) but tokenization of this detokenizes to '%s'(%zu)\n",
author	Haoxiang Fei <redacted>
	Sat, 11 May 2024 08:12:06 +0000 (16:12 +0800)
committer	GitHub <redacted>
	Sat, 11 May 2024 08:12:06 +0000 (11:12 +0300)
llama.cpp		patch | blob | history
models/ggml-vocab-llama-bpe.gguf.inp		patch | blob | history
models/ggml-vocab-llama-bpe.gguf.out		patch | blob | history
tests/CMakeLists.txt		patch | blob | history
tests/test-tokenizer-1-bpe.cpp		patch | blob | history