Add test for MPT tokenization (#3728)
author    goerch <redacted>
Sun, 22 Oct 2023 19:21:42 +0000 (21:21 +0200)
committer GitHub <redacted>
Sun, 22 Oct 2023 19:21:42 +0000 (21:21 +0200)
* Add test for MPT tokenization

* Revert code motion

* Remove unnecessary restriction in test case

* Clarify logic in conversion

convert-mpt-hf-to-gguf.py
llama.cpp
models/ggml-vocab-mpt.gguf [new file with mode: 0644]
tests/CMakeLists.txt

diff --git a/convert-mpt-hf-to-gguf.py b/convert-mpt-hf-to-gguf.py
index 21b9fd5071ba61e90c786d009af515937e536c37..2d2fa2329dd1bc4858f704227e9a134db7b8ca19 100755 (executable)
--- a/convert-mpt-hf-to-gguf.py
+++ b/convert-mpt-hf-to-gguf.py
@@ -128,15 +128,22 @@ vocab_size = hparams["vocab_size"]
 # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
 tokenizer = AutoTokenizer.from_pretrained(dir_model)
 
+added_vocab = tokenizer.get_added_vocab()
 reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
 
 for i in range(vocab_size):
-    tokens.append(reverse_vocab[i] if i in reverse_vocab else f"[PAD{i}]")
-    scores.append(0.0) # dummy
-    toktypes.append(gguf.TokenType.NORMAL)
+    if i not in reverse_vocab:
+        tokens.append(f"[PAD{i}]")
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    elif reverse_vocab[i] in added_vocab:
+        # NOTE: wouldn't we like to distinguish CONTROL tokens here?
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.USER_DEFINED)
+    else:
+        tokens.append(reverse_vocab[i])
+        toktypes.append(gguf.TokenType.NORMAL)
 
 gguf_writer.add_token_list(tokens)
-gguf_writer.add_token_scores(scores)
 gguf_writer.add_token_types(toktypes)
 
 special_vocab = gguf.SpecialVocab(dir_model, load_merges = True, n_vocab = len(tokens))
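
As an aside, here is a minimal standalone sketch of the token-type classification introduced by this hunk, assuming a local Hugging Face MPT checkout (the dir_model value is a placeholder) and the gguf Python package; in the real script vocab_size comes from hparams["vocab_size"], which for MPT is larger than the tokenizer vocabulary and therefore produces the [PAD{i}] entries:

    # Sketch only (not part of the commit): mirrors the classification logic above.
    from transformers import AutoTokenizer
    import gguf

    dir_model  = "path/to/mpt-7b"   # assumption: local HF model directory
    vocab_size = 50432              # assumption: hparams["vocab_size"] for MPT

    tokenizer     = AutoTokenizer.from_pretrained(dir_model)
    added_vocab   = tokenizer.get_added_vocab()
    reverse_vocab = {id: tok for tok, id in tokenizer.vocab.items()}

    tokens, toktypes = [], []
    for i in range(vocab_size):
        if i not in reverse_vocab:
            # id has no token at all: fill the gap with a placeholder
            tokens.append(f"[PAD{i}]")
            toktypes.append(gguf.TokenType.USER_DEFINED)
        elif reverse_vocab[i] in added_vocab:
            # token added on top of the base vocab (special/added tokens)
            tokens.append(reverse_vocab[i])
            toktypes.append(gguf.TokenType.USER_DEFINED)
        else:
            tokens.append(reverse_vocab[i])
            toktypes.append(gguf.TokenType.NORMAL)

The hunk also drops the dummy per-token scores and the matching add_token_scores call, presumably because a BPE vocabulary carries no meaningful scores; the NOTE left in the code flags that added tokens are currently all written as USER_DEFINED rather than being split into CONTROL and USER_DEFINED.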
diff --git a/llama.cpp b/llama.cpp
index 8d52eaf62ef8771fda0df0371566a43191403e79..c63e6251c76769f4301314523b1abd1ae1402d9d 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -975,14 +975,15 @@ static void llama_nop(struct ggml_tensor * tensor) { // don't offload by default
     (void) tensor;
 }
 
-static std::string llama_token_to_str(const struct llama_context * ctx, llama_token token) {
+static std::string llama_token_to_piece(const struct llama_context * ctx, llama_token token) {
     std::vector<char> result(8, 0);
     const int n_tokens = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
     if (n_tokens < 0) {
         result.resize(-n_tokens);
         int check = llama_token_to_piece(llama_get_model(ctx), token, result.data(), result.size());
         GGML_ASSERT(check == -n_tokens);
-    } else {
+    }
+    else {
         result.resize(n_tokens);
     }
 
@@ -1202,10 +1203,10 @@ struct llama_vocab {
     id special_eot_id    = 32010;
 
     int find_bpe_rank(std::string token_left, std::string token_right) const {
-        replace_all(token_left,  " ",  "\u0120");
-        replace_all(token_left,  "\n", "\u010A");
-        replace_all(token_right, " ",  "\u0120");
-        replace_all(token_right, "\n", "\u010A");
+        GGML_ASSERT(token_left.find(" ") == std::string::npos);
+        GGML_ASSERT(token_left.find("\n") == std::string::npos);
+        GGML_ASSERT(token_right.find(" ") == std::string::npos);
+        GGML_ASSERT(token_right.find("\n") == std::string::npos);
 
         auto it = bpe_ranks.find(std::make_pair(token_left, token_right));
         if (it == bpe_ranks.end()) {
@@ -7499,7 +7500,7 @@ void llama_sample_grammar(struct llama_context * ctx, llama_token_data_array * c
 
     for (size_t i = 0; i < candidates->size; ++i) {
         const llama_token id    = candidates->data[i].id;
-        const std::string piece = llama_token_to_str(ctx, id);
+        const std::string piece = llama_token_to_piece(ctx, id);
         if (id == eos) {
             if (!allow_eos) {
                 candidates->data[i].logit = -INFINITY;
@@ -7711,7 +7712,7 @@ void llama_grammar_accept_token(struct llama_context * ctx, struct llama_grammar
         GGML_ASSERT(false);
     }
 
-    const std::string piece = llama_token_to_str(ctx, token);
+    const std::string piece = llama_token_to_piece(ctx, token);
 
     // Note terminating 0 in decoded string
     const auto   decoded     = decode_utf8(piece.c_str(), grammar->partial_utf8);
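
Two related cleanups in the llama.cpp diff above: the internal helper is renamed from llama_token_to_str to llama_token_to_piece, matching the public C API it wraps, and find_bpe_rank no longer remaps raw spaces and newlines on every lookup but instead asserts that its inputs contain none, i.e. that callers pass tokens already in byte-level (Ġ/Ċ) form. Below is a small sketch of the GPT-2-style byte-to-unicode mapping this invariant relies on, where a space becomes "\u0120" (Ġ) and a newline becomes "\u010A" (Ċ); this is an illustration, not llama.cpp code:

    # Sketch only: GPT-2-style byte-level mapping; by the time tokens reach
    # find_bpe_rank they are expected to already be in this encoded form,
    # which is what the new GGML_ASSERTs check.
    def bytes_to_unicode():
        # printable bytes map to themselves, the rest are shifted past 255
        bs = list(range(ord("!"), ord("~") + 1)) + \
             list(range(ord("¡"), ord("¬") + 1)) + \
             list(range(ord("®"), ord("ÿ") + 1))
        cs = bs[:]
        n = 0
        for b in range(256):
            if b not in bs:
                bs.append(b)
                cs.append(256 + n)
                n += 1
        return dict(zip(bs, map(chr, cs)))

    byte_encoder = bytes_to_unicode()

    def byte_level_encode(text):
        return "".join(byte_encoder[b] for b in text.encode("utf-8"))

    assert byte_level_encode(" ")  == "\u0120"   # Ġ
    assert byte_level_encode("\n") == "\u010A"   # Ċ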
diff --git a/models/ggml-vocab-mpt.gguf b/models/ggml-vocab-mpt.gguf
new file mode 100644 (file)
index 0000000..6affa34
Binary files /dev/null and b/models/ggml-vocab-mpt.gguf differ
diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
index 61407e5733e30dd5cf1865255c4ffa8bb58ba625..1c73de0a3e92eb8e201680038a402c8f4606b512 100644 (file)
--- a/tests/CMakeLists.txt
+++ b/tests/CMakeLists.txt
@@ -31,6 +31,7 @@ llama_test_executable (test-tokenizer-1-llama test-tokenizer-1-llama.cpp ${CMAKE
 llama_build_executable(test-tokenizer-1-bpe.cpp)
 llama_test_executable (test-tokenizer-1-falcon test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-falcon.gguf)
 llama_test_executable(test-tokenizer-1-aquila test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-aquila.gguf)
+llama_test_executable(test-tokenizer-1-mpt test-tokenizer-1-bpe.cpp ${CMAKE_CURRENT_SOURCE_DIR}/../models/ggml-vocab-mpt.gguf)
 llama_build_and_test_executable(test-grammar-parser.cpp)
 llama_build_and_test_executable(test-llama-grammar.cpp)
 llama_build_and_test_executable(test-grad0.cpp) # SLOW
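
The new test reuses the existing test-tokenizer-1-bpe driver with the MPT vocabulary file added in this commit; after building with tests enabled it should be runnable on its own via something like ctest -R test-tokenizer-1-mpt (test name assumed to match the first argument of llama_test_executable above).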