llama : add BPE pre-tokenization for Qwen2 (#7114)

author Ren Xuancheng <redacted>

Wed, 8 May 2024 12:06:43 +0000 (20:06 +0800)

committer GitHub <redacted>

Wed, 8 May 2024 12:06:43 +0000 (15:06 +0300)
author Ren Xuancheng <redacted>
Wed, 8 May 2024 12:06:43 +0000 (20:06 +0800)
committer GitHub <redacted>
Wed, 8 May 2024 12:06:43 +0000 (15:06 +0300)
diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py

index b51010983ab465c83364fd5768d96eeb17e8ab3d..a26f45a5fb1d096eff5dc50a3cd48fb1c98a994c 100755 (executable)
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -67,6 +67,7 @@ models = [
      {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
      {"name": "refact",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
      {"name": "command-r",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
+    {"name": "qwen2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
      {"name": "olmo",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
      {"name": "dbrx",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
  ]
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py

index 8b89575d5a2fc8baf9f2aae63800cdab5b8888b3..454e9fcdd784e5825de16287296d47fb88ea6bcd 100755 (executable)
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -314,6 +314,9 @@ class Model(ABC):
          if chkhsh == "9c2227e4dd922002fb81bde4fc02b0483ca4f12911410dee2255e4987644e3f8":
              # ref: https://huggingface.co/CohereForAI/c4ai-command-r-v01
              res = "command-r"
+        if chkhsh == "e636dc30a262dcc0d8c323492e32ae2b70728f4df7dfe9737d9f920a282b8aea":
+            # ref: https://huggingface.co/Qwen/Qwen1.5-7B
+            res = "qwen2"
          if chkhsh == "b6dc8df998e1cfbdc4eac8243701a65afe638679230920b50d6f17d81c098166":
              # ref: https://huggingface.co/allenai/OLMo-1.7-7B-hf
              res = "olmo"
diff --git a/llama.cpp b/llama.cpp

index 331c9d47243b33542d8d8a0df98516598fbd0a06..9c72d118f2db2d35d6d247cc36f2a70813da3f84 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -4391,6 +4391,9 @@ static void llm_load_vocab(
              } else if (
                  tokenizer_pre == "command-r") {
                  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_COMMAND_R;
+            } else if (
+                tokenizer_pre == "qwen2") {
+                vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_QWEN2;
              } else if (
                  tokenizer_pre == "olmo") {
                  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_OLMO;
@@ -12263,6 +12266,13 @@ struct llm_tokenizer_bpe {
                              "'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
                          });
                          break;
+                    case LLAMA_VOCAB_PRE_TYPE_QWEN2:
+                        word_collection = unicode_regex_split(text, {
+                            // original regex from tokenizer.json
+                            // "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
+                            "(?:'[sS]|'[tT]|'[rR][eE]|'[vV][eE]|'[mM]|'[lL][lL]|'[dD])|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+",
+                        });
+                        break;
                      default:
                          // default regex for BPE tokenization pre-processing
                          word_collection = unicode_regex_split(text, {
diff --git a/llama.h b/llama.h

index 388839f3995e11169d160f27cc6c1f22e1d3c0e9..0b2e708d06dea73dd5db53cbf57933533d412fc0 100644 (file)
--- a/llama.h
+++ b/llama.h
@@ -81,8 +81,9 @@ extern "C" {
          LLAMA_VOCAB_PRE_TYPE_GPT2           = 7,
          LLAMA_VOCAB_PRE_TYPE_REFACT         = 8,
          LLAMA_VOCAB_PRE_TYPE_COMMAND_R      = 9,
-        LLAMA_VOCAB_PRE_TYPE_OLMO           = 10,
-        LLAMA_VOCAB_PRE_TYPE_DBRX           = 11,
+        LLAMA_VOCAB_PRE_TYPE_QWEN2          = 10,
+        LLAMA_VOCAB_PRE_TYPE_OLMO           = 11,
+        LLAMA_VOCAB_PRE_TYPE_DBRX           = 12,
      };
  
      // note: these values should be synchronized with ggml_rope
diff --git a/models/ggml-vocab-qwen2.gguf b/models/ggml-vocab-qwen2.gguf

new file mode 100644 (file)

index 0000000..541e475

Binary files /dev/null and b/models/ggml-vocab-qwen2.gguf differ
diff --git a/models/ggml-vocab-qwen2.gguf.inp b/models/ggml-vocab-qwen2.gguf.inp

new file mode 100644 (file)

index 0000000..0a89107
--- /dev/null
+++ b/models/ggml-vocab-qwen2.gguf.inp
@@ -0,0 +1,106 @@
+ied 4 ½ months
+__ggml_vocab_test__
+Führer
+__ggml_vocab_test__
+
+__ggml_vocab_test__
+ 
+__ggml_vocab_test__
+  
+__ggml_vocab_test__
+   
+__ggml_vocab_test__
+       
+__ggml_vocab_test__
+
+
+__ggml_vocab_test__
+
+
+
+__ggml_vocab_test__
+
+
+
+
+__ggml_vocab_test__
+       
+
+__ggml_vocab_test__
+Hello world
+__ggml_vocab_test__
+ Hello world
+__ggml_vocab_test__
+Hello World
+__ggml_vocab_test__
+ Hello World
+__ggml_vocab_test__
+ Hello World!
+__ggml_vocab_test__
+Hello, world!
+__ggml_vocab_test__
+ Hello, world!
+__ggml_vocab_test__
+ this is 🦙.cpp
+__ggml_vocab_test__
+w048 7tuijk dsdfhu
+__ggml_vocab_test__
+нещо на Български
+__ggml_vocab_test__
+កាន់តែពិសេសអាចខលចេញ
+__ggml_vocab_test__
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ (only emoji that has its own token)
+__ggml_vocab_test__
+Hello
+__ggml_vocab_test__
+ Hello
+__ggml_vocab_test__
+  Hello
+__ggml_vocab_test__
+   Hello
+__ggml_vocab_test__
+    Hello
+__ggml_vocab_test__
+    Hello
+    Hello
+__ggml_vocab_test__
+ (
+__ggml_vocab_test__
+
+ =
+__ggml_vocab_test__
+' era
+__ggml_vocab_test__
+Hello, y'all! How are you 😁 ?我想在apple工作1314151天～
+__ggml_vocab_test__
+3
+__ggml_vocab_test__
+33
+__ggml_vocab_test__
+333
+__ggml_vocab_test__
+3333
+__ggml_vocab_test__
+33333
+__ggml_vocab_test__
+333333
+__ggml_vocab_test__
+3333333
+__ggml_vocab_test__
+33333333
+__ggml_vocab_test__
+333333333
+__ggml_vocab_test__
+
+ 
+
+ 
+
+
+                               
+  
+   
+    
+     
+🚀 (normal) 😶‍🌫️ (multiple emojis concatenated) ✅ 🦙🦙 3 33 333 3333 33333 333333 3333333 33333333 3.3 3..3 3...3 កាន់តែពិសេសអាច😁 ?我想在apple工作1314151天～ ------======= нещо на Български ''''''```````""""......!!!!!!?????? I've been 'told he's there, 'RE you sure? 'M not sure I'll make it, 'D you like some tea? We'Ve a'lL
+__ggml_vocab_test__
diff --git a/models/ggml-vocab-qwen2.gguf.out b/models/ggml-vocab-qwen2.gguf.out

new file mode 100644 (file)

index 0000000..401a510
--- /dev/null
+++ b/models/ggml-vocab-qwen2.gguf.out
@@ -0,0 +1,43 @@
+ 1122 220 19 220 26062 3951
+ 37 50753 261
+
+ 220
+ 256
+ 262
+ 197
+ 198
+ 271
+ 1406
+ 1572
+ 9707 1879
+ 21927 1879
+ 9707 4337
+ 21927 4337
+ 21927 4337 0
+ 9707 11 1879 0
+ 21927 11 1879 0
+ 419 374 11162 99 247 13 10821
+ 86 15 19 23 220 22 83 1963 41808 11472 2940 16739
+ 78762 14144 1456 13073 63471 33594 3038 133178 79012
+ 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 147805 148301 147270 44258 223 146848
+ 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 320 3243 42365 429 702 1181 1828 3950 8
+ 9707
+ 21927
+ 220 21927
+ 256 21927
+ 262 21927
+ 262 21927 198 262 21927
+ 320
+ 198 284
+ 6 11385
+ 9707 11 379 64848 0 2585 525 498 26525 223 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216
+ 18
+ 18 18
+ 18 18 18
+ 18 18 18 18
+ 18 18 18 18 18
+ 18 18 18 18 18 18
+ 18 18 18 18 18 18 18
+ 18 18 18 18 18 18 18 18
+ 18 18 18 18 18 18 18 18 18
+ 198 4710 14731 65497 7847 1572 2303 78672 10947 145836 320 8252 8 26525 114 378 235 149921 30543 320 35673 99066 97534 8 25521 227 11162 99 247 149955 220 18 220 18 18 220 18 18 18 220 18 18 18 18 220 18 18 18 18 18 220 18 18 18 18 18 18 220 18 18 18 18 18 18 18 220 18 18 18 18 18 18 18 18 220 18 13 18 220 18 496 18 220 18 1112 18 220 146394 97529 241 44258 233 146568 44258 224 147603 20879 115 146280 44258 223 146280 147272 97529 227 144534 937 104100 18493 22377 99257 16 18 16 19 16 20 16 35727 21216 55460 53237 18658 14144 1456 13073 63471 33594 3038 133178 79012 3355 4605 4605 13874 13874 73594 3014 3014 28149 17085 2928 26610 7646 358 3003 1012 364 83 813 566 594 1052 11 364 787 498 2704 30 364 44 537 2704 358 3278 1281 432 11 364 35 498 1075 1045 15243 30 1205 6 42612 264 63866 43
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt

index 208f0cf7b54bc547ff94af825ca78e116ed15204..d409a1d6b42ece8cd803995c99f4260e15909204 100644 (file)
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -84,6 +84,7 @@ llama_test(test-tokenizer-0 NAME test-tokenizer-0-starcoder         ARGS ${CMAKE
  llama_test(test-tokenizer-0 NAME test-tokenizer-0-gpt-2             ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-gpt-2.gguf)
  llama_test(test-tokenizer-0 NAME test-tokenizer-0-refact            ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-refact.gguf)
  llama_test(test-tokenizer-0 NAME test-tokenizer-0-command-r         ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-command-r.gguf)
+llama_test(test-tokenizer-0 NAME test-tokenizer-0-qwen2             ARGS ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-qwen2.gguf)
  
  # build test-tokenizer-1-bpe target once and add many tests
  add_executable(test-tokenizer-1-bpe test-tokenizer-1-bpe.cpp)
author	Ren Xuancheng <redacted>
	Wed, 8 May 2024 12:06:43 +0000 (20:06 +0800)
committer	GitHub <redacted>
	Wed, 8 May 2024 12:06:43 +0000 (15:06 +0300)
convert-hf-to-gguf-update.py		patch \| blob \| history
convert-hf-to-gguf.py		patch \| blob \| history
llama.cpp		patch \| blob \| history
llama.h		patch \| blob \| history
models/ggml-vocab-qwen2.gguf	[new file with mode: 0644]	patch \| blob
models/ggml-vocab-qwen2.gguf.inp	[new file with mode: 0644]	patch \| blob
models/ggml-vocab-qwen2.gguf.out	[new file with mode: 0644]	patch \| blob
tests/CMakeLists.txt		patch \| blob \| history