tests : multi-thread the tokenizer tests (#5474)

author Georgi Gerganov <redacted>

Tue, 13 Feb 2024 13:14:22 +0000 (15:14 +0200)

committer GitHub <redacted>

Tue, 13 Feb 2024 13:14:22 +0000 (15:14 +0200)
author Georgi Gerganov <redacted>
Tue, 13 Feb 2024 13:14:22 +0000 (15:14 +0200)
committer GitHub <redacted>
Tue, 13 Feb 2024 13:14:22 +0000 (15:14 +0200)
diff --git a/llama.cpp b/llama.cpp

index eb6c46f3672f973f472ab66370c3810edb4aacbf..381a030683cb54a9154a52146d8c39f8957651c9 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -7782,7 +7782,7 @@ struct llm_bigram_spm {
  };
  
  struct llm_tokenizer_spm {
-    llm_tokenizer_spm(const llama_vocab & vocab): vocab(vocab) {}
+    llm_tokenizer_spm(const llama_vocab & vocab) : vocab(vocab) {}
  
      void tokenize(const std::string & text, std::vector<llama_vocab::id> & output) {
          // split string into utf8 chars
@@ -7857,6 +7857,7 @@ private:
  
          if (p == rev_merge.end()) {
              // output any symbols that did not form tokens as bytes.
+            output.reserve(output.size() + symbol.n);
              for (int j = 0; j < (int)symbol.n; ++j) {
                  llama_vocab::id token_id = llama_byte_to_token(vocab, symbol.text[j]);
                  output.push_back(token_id);
@@ -8419,17 +8420,18 @@ struct fragment_buffer_variant {
          token(_token),
          raw_text(_dummy),
          offset(0),
-        length(0){}
+        length(0) {}
+
      fragment_buffer_variant(const std::string & _raw_text, int64_t _offset, int64_t _length)
      :
          type(FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT),
-        token((llama_vocab::id)-1),
+        token((llama_vocab::id) - 1),
          raw_text(_raw_text),
          offset(_offset),
          length(_length){
-            GGML_ASSERT( _offset >= 0 );
-            GGML_ASSERT( _length >= 1 );
-            GGML_ASSERT( offset + length <= raw_text.length() );
+            GGML_ASSERT(_offset >= 0);
+            GGML_ASSERT(_length >= 1);
+            GGML_ASSERT(offset + length <= raw_text.length());
          }
  
      const FRAGMENT_BUFFER_VARIANT_TYPE type;
@@ -8553,14 +8555,14 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
      }
  
      std::forward_list<fragment_buffer_variant> fragment_buffer;
-    fragment_buffer.emplace_front( raw_text, 0, raw_text.length() );
+    fragment_buffer.emplace_front(raw_text, 0, raw_text.length());
  
-    if (special) tokenizer_st_partition( vocab, fragment_buffer );
+    if (special) tokenizer_st_partition(vocab, fragment_buffer);
  
      switch (vocab.type) {
          case LLAMA_VOCAB_TYPE_SPM:
              {
-                for (const auto & fragment: fragment_buffer) {
+                for (const auto & fragment : fragment_buffer) {
                      if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                          // without adding this leading whitespace, we do not get the same results as the original tokenizer
  
@@ -8588,7 +8590,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
              } break;
          case LLAMA_VOCAB_TYPE_BPE:
              {
-                for (const auto & fragment: fragment_buffer) {
+                for (const auto & fragment : fragment_buffer) {
                      if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                          auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
  
@@ -8604,7 +8606,7 @@ static std::vector<llama_vocab::id> llama_tokenize_internal(const llama_vocab &
              } break;
          case LLAMA_VOCAB_TYPE_WPM:
              {
-                for (const auto & fragment: fragment_buffer) {
+                for (const auto & fragment : fragment_buffer) {
                      if (fragment.type == FRAGMENT_BUFFER_VARIANT_TYPE_RAW_TEXT) {
                          auto raw_text = fragment.raw_text.substr(fragment.offset, fragment.length);
  
diff --git a/tests/test-tokenizer-1-bpe.cpp b/tests/test-tokenizer-1-bpe.cpp

index 386530f23f92cce2d85c74cb1bf01f7af61dc776..3bb6295613fa6e8e66d9f7b88080185bdcd1713d 100644 (file)
--- a/tests/test-tokenizer-1-bpe.cpp
+++ b/tests/test-tokenizer-1-bpe.cpp
@@ -4,13 +4,13 @@
  #include "console.h"
  
  #include <cassert>
+#include <codecvt>
  #include <cstdio>
  #include <cstring>
+#include <locale>
  #include <string>
-#include <codecvt>
-#include <map>
+#include <thread>
  #include <vector>
-#include <locale>
  
  int main(int argc, char **argv) {
      if (argc < 2) {
@@ -74,45 +74,46 @@ int main(int argc, char **argv) {
              }
          }
          catch (const std::invalid_argument &) {
-            fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
+            //fprintf(stderr, "%s : info: utf8 conversion %d '%s'\n", __func__, i, str.c_str());
          }
      }
  
-    for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
-        // NOTE: these exceptions seem to be necessary, because the GPT2 tokenizer doesn't want to interfere with some ASCII control characters
-        if ((cp < 0x03 || cp > 0x05) && cp != 0x0b && cp != 0x11 && (cp < 0x13 || cp > 0x17) && cp != 0x19 && (cp < 0x1c || cp > 0x1e) && (cp < 0xd800 || cp > 0xdfff)) {
-            std::string str = " " + codepoint_to_utf8(cp);
-            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-            std::string check = llama_detokenize_bpe(ctx, tokens);
-            if (str != check) {
-                fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
-                    __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
-                return 3;
-            }
-        }
-    }
-    // Restrict to assigned unicode planes
-    // for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
-    for (uint32_t cp = 0x10000; cp < 0x00040000; ++cp) {
-        std::string str = codepoint_to_utf8(cp);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-        std::string check = llama_detokenize_bpe(ctx, tokens);
-        if (str != check) {
-            fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
-                __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
-            return 4;
+    // unicode
+    {
+        const int nthread = std::thread::hardware_concurrency();
+
+        std::vector<std::thread> threads(nthread);
+
+        for (int i = 0; i < nthread; ++i) {
+            threads[i] = std::thread([i, nthread, ctx]() {
+                for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
+                    if (!( // NOLINT
+                                (cp < 0x03       || cp >  0x05)   && cp != 0x0b && cp != 0x11 &&
+                                (cp < 0x13       || cp >  0x17)   && cp != 0x19 &&
+                                (cp < 0x1c       || cp >  0x1e)   &&
+                                (cp < 0xd800     || cp >  0xdfff) &&
+                                (cp < 0x00040000 || cp >= 0x000e0000)
+                        )) {
+                        continue;
+                    }
+
+                    std::string str = codepoint_to_utf8(cp);
+                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
+                    std::string check = llama_detokenize_bpe(ctx, tokens);
+                    if (cp != 9601 && str != check) {
+                        fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
+                                cp, check.c_str(), check.length(), str.c_str(), str.length());
+                        std::exit(3);
+                    }
+                }
+            });
          }
-    }
-    for (uint32_t cp = 0x000e0000; cp < 0x0010ffff; ++cp) {
-        std::string str = codepoint_to_utf8(cp);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-        std::string check = llama_detokenize_bpe(ctx, tokens);
-        if (str != check) {
-            fprintf(stderr, "%s : error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
-                __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
-            return 4;
+
+        for (auto & t : threads) {
+            t.join();
          }
      }
+
      llama_free_model(model);
      llama_free(ctx);
  
diff --git a/tests/test-tokenizer-1-llama.cpp b/tests/test-tokenizer-1-llama.cpp

index 4b58fe4954cf3fa4e762000fca743d667400566b..b0d814a417fb6d28fc83468cb68981304090ca91 100644 (file)
--- a/tests/test-tokenizer-1-llama.cpp
+++ b/tests/test-tokenizer-1-llama.cpp
@@ -4,13 +4,13 @@
  #include "console.h"
  
  #include <cassert>
+#include <codecvt>
  #include <cstdio>
  #include <cstring>
+#include <locale>
  #include <string>
-#include <codecvt>
-#include <map>
+#include <thread>
  #include <vector>
-#include <locale>
  
  int main(int argc, char **argv) {
      if (argc < 2) {
@@ -72,26 +72,33 @@ int main(int argc, char **argv) {
          }
      }
  
-    for (uint32_t cp = 0x0000; cp < 0xffff; ++cp) {
-        if (cp < 0xd800 || cp > 0xdfff) {
-            std::string str = codepoint_to_utf8(cp);
-            std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-            std::string check = llama_detokenize_spm(ctx, tokens);
-            if (cp != 9601 && str != check) {
-                fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
-                    __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
-                return 3;
-            }
+    // unicode
+    {
+        const int nthread = std::thread::hardware_concurrency();
+
+        std::vector<std::thread> threads(nthread);
+
+        for (int i = 0; i < nthread; ++i) {
+            threads[i] = std::thread([i, nthread, ctx]() {
+                for (uint32_t cp = i; cp < 0x0010ffff; cp += nthread) {
+                    if (cp >= 0xd800 && cp <= 0xdfff) {
+                        continue;
+                    }
+
+                    std::string str = codepoint_to_utf8(cp);
+                    std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
+                    std::string check = llama_detokenize_spm(ctx, tokens);
+                    if (cp != 9601 && str != check) {
+                        fprintf(stderr, "error: codepoint %x detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
+                                cp, check.c_str(), check.length(), str.c_str(), str.length());
+                        std::exit(3);
+                    }
+                }
+            });
          }
-    }
-    for (uint32_t cp = 0x10000; cp < 0x0010ffff; ++cp) {
-        std::string str = codepoint_to_utf8(cp);
-        std::vector<llama_token> tokens = llama_tokenize(ctx, str, false);
-        std::string check = llama_detokenize_spm(ctx, tokens);
-        if (str != check) {
-            fprintf(stderr, "%s : error: codepoint %d detokenizes to '%s'(%zu) instead of '%s'(%zu)\n",
-                __func__, cp, check.c_str(), check.length(), str.c_str(), str.length());
-            return 4;
+
+        for (auto & t : threads) {
+            t.join();
          }
      }
  
diff --git a/unicode.h b/unicode.h

index 844eff3dad1b3fde6223b18aedd6655e3238e76f..263260702e640a2062ba0d0bcfae9aa6d236536a 100644 (file)
--- a/unicode.h
+++ b/unicode.h
@@ -264,26 +264,29 @@ static uint32_t codepoint_from_utf8(const std::string & utf8, size_t & offset) {
          offset += 1;
          return result;
      }
-    else if (!(utf8[offset + 0] & 0x40)) {
+    if (!(utf8[offset + 0] & 0x40)) {
          throw std::invalid_argument("invalid character");
      }
-    else if (!(utf8[offset + 0] & 0x20)) {
-        if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80))
+    if (!(utf8[offset + 0] & 0x20)) {
+        if (offset + 1 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80)) {
              throw std::invalid_argument("invalid character");
+        }
          auto result = ((utf8[offset + 0] & 0x1f) << 6) | (utf8[offset + 1] & 0x3f);
          offset += 2;
          return result;
      }
-    else if (!(utf8[offset + 0] & 0x10)) {
-        if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80))
+    if (!(utf8[offset + 0] & 0x10)) {
+        if (offset + 2 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80)) {
              throw std::invalid_argument("invalid character");
+        }
          auto result = ((utf8[offset + 0] & 0x0f) << 12) | ((utf8[offset + 1] & 0x3f) << 6) | (utf8[offset + 2] & 0x3f);
          offset += 3;
          return result;
      }
-    else if (!(utf8[offset + 0] & 0x08)) {
-        if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80))
+    if (!(utf8[offset + 0] & 0x08)) {
+        if (offset + 3 >= utf8.size() || ! ((utf8[offset + 1] & 0xc0) == 0x80) || ! ((utf8[offset + 2] & 0xc0) == 0x80) || !((utf8[offset + 3] & 0xc0) == 0x80)) {
              throw std::invalid_argument("invalid character");
+        }
          auto result = ((utf8[offset + 0] & 0x07) << 18) | ((utf8[offset + 1] & 0x3f) << 12) | ((utf8[offset + 2] & 0x3f) << 6) | (utf8[offset + 3] & 0x3f);
          offset += 4;
          return result;
@@ -331,21 +334,22 @@ static uint32_t codepoint_from_utf16(const std::vector<uint16_t> & utf16, size_t
          offset += 1;
          return result;
      }
-    else {
-        if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00))
-            throw std::invalid_argument("invalid character");
-        auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
-        offset += 2;
-        return result;
+
+    if (offset + 1 >= utf16.size() || !((utf16[1] & 0xdc00) == 0xdc00)) {
+        throw std::invalid_argument("invalid character");
      }
-    throw std::invalid_argument("invalid string");
+
+    auto result = 0x10000 + (((utf16[0] & 0x03ff) << 10) | (utf16[1] & 0x03ff));
+    offset += 2;
+    return result;
  }
  
  static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> & utf16) {
      std::vector<uint32_t> result;
      size_t offset = 0;
-    while (offset < utf16.size())
+    while (offset < utf16.size()) {
          result.push_back(codepoint_from_utf16(utf16, offset));
+    }
      return result;
  }
  
@@ -361,44 +365,52 @@ static std::vector<uint32_t> codepoints_from_utf16(const std::vector<uint16_t> &
  static std::unordered_map<uint32_t, int> codepoint_type_map() {
      std::unordered_map<uint32_t, int> codepoint_types;
      for (auto p : digit_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+        for (auto i = p.first; i <= p.second; ++ i) {
              codepoint_types[i] = CODEPOINT_TYPE_DIGIT;
+        }
      }
-    for(auto p : letter_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+    for (auto p : letter_ranges) {
+        for (auto i = p.first; i <= p.second; ++ i) {
              codepoint_types[i] = CODEPOINT_TYPE_LETTER;
+        }
      }
-    for(auto p : whitespace_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+    for (auto p : whitespace_ranges) {
+        for (auto i = p.first; i <= p.second; ++ i) {
              codepoint_types[i] = CODEPOINT_TYPE_WHITESPACE;
+        }
      }
-    for(auto p : accent_mark_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+    for (auto p : accent_mark_ranges) {
+        for (auto i = p.first; i <= p.second; ++ i) {
              codepoint_types[i] = CODEPOINT_TYPE_ACCENT_MARK;
+        }
      }
-    for(auto p : punctuation_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+    for (auto p : punctuation_ranges) {
+        for (auto i = p.first; i <= p.second; ++ i) {
              codepoint_types[i] = CODEPOINT_TYPE_PUNCTUATION;
+        }
      }
-    for (auto p : symbol_ranges) {
-        for (auto i = p.first; i <= p.second; ++i)
+    for  (auto p : symbol_ranges) {
+        for (auto i = p.first; i <= p.second; ++i) {
              codepoint_types[i] = CODEPOINT_TYPE_SYMBOL;
+        }
      }
-    for(auto p : control_ranges) {
-        for(auto i = p.first; i <= p.second; ++ i)
+    for (auto p : control_ranges) {
+        for (auto i = p.first; i <= p.second; ++ i) {
              codepoint_types[i] = CODEPOINT_TYPE_CONTROL;
+        }
      }
      return codepoint_types;
  }
  
  static int codepoint_type(uint32_t cp) {
      static std::unordered_map<uint32_t, int> codepoint_types = codepoint_type_map();
-    return codepoint_types[cp];
+    return codepoint_types.find(cp) == codepoint_types.end() ? CODEPOINT_TYPE_UNIDENTIFIED : codepoint_types.at(cp);
  }
  
  static int codepoint_type(const std::string & utf8) {
-    if (utf8.length() == 0)
+    if (utf8.length() == 0) {
          return CODEPOINT_TYPE_UNIDENTIFIED;
+    }
      size_t offset = 0;
      return codepoint_type(codepoint_from_utf8(utf8, offset));
  }
author	Georgi Gerganov <redacted>
	Tue, 13 Feb 2024 13:14:22 +0000 (15:14 +0200)
committer	GitHub <redacted>
	Tue, 13 Feb 2024 13:14:22 +0000 (15:14 +0200)
llama.cpp		patch | blob | history
tests/test-tokenizer-1-bpe.cpp		patch | blob | history
tests/test-tokenizer-1-llama.cpp		patch | blob | history
unicode.h		patch | blob | history