From: klosax
Date: Sun, 21 May 2023 08:21:51 +0000 (+0200)
Subject: common : support utf-8 + fix gpt_tokenize + fix MPT model import (#179)
X-Git-Tag: upstream/0.0.1642~1450
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=36a005c930cf8fb1f68f0421c5a848a0c8825f5a;p=pkg%2Fggml%2Fsources%2Fggml

common : support utf-8 + fix gpt_tokenize + fix MPT model import (#179)

* Update convert-h5-to-ggml.py

* Import tokens correctly

* gpt_tokenize: Convert input to utf-8 + bug fix

* common : minor style fixes

---------

Co-authored-by: Georgi Gerganov
---

diff --git a/examples/common.cpp b/examples/common.cpp
index 76da30d9..eaaaa606 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -212,12 +212,38 @@ void gpt_vocab::add_special_token(const std::string & token) {
     special_tokens.push_back(token);
 }
 
+static void append_utf8(char32_t ch, std::string & out) {
+    if (ch <= 0x7F) {
+        out.push_back(static_cast<char>(ch));
+    } else if (ch <= 0x7FF) {
+        out.push_back(static_cast<char>(0xC0 | ((ch >> 6) & 0x1F)));
+        out.push_back(static_cast<char>(0x80 | (ch & 0x3F)));
+    } else if (ch <= 0xFFFF) {
+        out.push_back(static_cast<char>(0xE0 | ((ch >> 12) & 0x0F)));
+        out.push_back(static_cast<char>(0x80 | ((ch >> 6) & 0x3F)));
+        out.push_back(static_cast<char>(0x80 | (ch & 0x3F)));
+    } else if (ch <= 0x10FFFF) {
+        out.push_back(static_cast<char>(0xF0 | ((ch >> 18) & 0x07)));
+        out.push_back(static_cast<char>(0x80 | ((ch >> 12) & 0x3F)));
+        out.push_back(static_cast<char>(0x80 | ((ch >> 6) & 0x3F)));
+        out.push_back(static_cast<char>(0x80 | (ch & 0x3F)));
+    } else {
+        printf("Invalid Unicode code point\n");
+    }
+}
+
 std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
     std::vector<std::string> words;
 
+    // Convert input to utf-8
+    std::string utf8conv;
+    for (int w = 0; w < text.size(); w++) {
+        append_utf8( uint8_t(text[w]), utf8conv);
+    }
+
     // first split the text into words
     {
-        std::string str = text;
+        std::string str = utf8conv;
 
         std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
 
         // Generate the subpattern from the special_tokens vector if it's not empty
@@ -260,7 +286,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
                     tokens.push_back(it->second);
                     i = j;
                     j = n;
-                    break;
+                    continue;
                 }
                 --j;
             }
diff --git a/examples/mpt/convert-h5-to-ggml.py b/examples/mpt/convert-h5-to-ggml.py
index 9bff9d3f..b61ec874 100644
--- a/examples/mpt/convert-h5-to-ggml.py
+++ b/examples/mpt/convert-h5-to-ggml.py
@@ -5,6 +5,31 @@ import numpy as np
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import sentencepiece.sentencepiece_model_pb2 as model
 
+# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+def bytes_to_unicode():
+
+    """
+    Returns a list of utf-8 bytes and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+
+    cs = [chr(n) for n in cs]
+
+    return dict(zip(bs, cs))
+
 if len(sys.argv) < 3:
     print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
     print("  ftype == 0 -> float32")
@@ -62,15 +87,37 @@ fout.write(struct.pack("f", hparams["attn_config"]["alibi_bias_max"]))
 fout.write(struct.pack("f", hparams["attn_config"]["clip_qkv"] or 0.0))
 fout.write(struct.pack("i", ftype))
 
+vocab_size = hparams["vocab_size"]
+
+encoder = tokenizer.vocab
+# Add added_tokens (special tokens) to the encoder
+encoder.update(tokenizer.get_added_vocab())
+
+byte_encoder = bytes_to_unicode()
+byte_decoder = {v:k for k, v in byte_encoder.items()}
+
+counter = 0
+# sort by value
+for key in sorted(encoder, key=encoder.get):
+    # workaround for key error when c = whitespace
+    text=""
+    for c in key:
+        if c == " ":
+            text += " "
+        else:
+            text += chr(byte_decoder[c] )
+    text = bytearray( text, encoding="utf-8" )
+    fout.write(struct.pack("i", len(text)))
+    fout.write(text)
+    counter += 1
 
-# TODO: temporary hack to not deal with implementing the tokenizer
-dot_token = tokenizer.encode(".")[0]
-for i in range(hparams["vocab_size"]):
-    text = tokenizer.decode([dot_token, i]).encode("utf-8")
-    # remove the first byte (it's always '.')
-    text = text[1:]
+# Repeat last token until vocab_size
+while counter < vocab_size:
     fout.write(struct.pack("i", len(text)))
     fout.write(text)
+    counter += 1
+
+# assert counter == config.vocab_size
 
 for name in list_vars.keys():
     data = list_vars[name].squeeze().numpy()
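
Note: the new vocab export loop relies on bytes_to_unicode() being exactly invertible. Below is a minimal sanity check, not part of the patch, assuming the bytes_to_unicode() definition above is available in the same Python session; the "Ġhello" key and " hello" string are illustrative only.

    byte_encoder = bytes_to_unicode()
    byte_decoder = {v: k for k, v in byte_encoder.items()}

    # byte 0x20 (space) is remapped to U+0120 ("Ġ"), so the GPT-2 style
    # vocab key for " hello" is the string "Ġhello"
    key = "".join(byte_encoder[b] for b in " hello".encode("utf-8"))
    assert key == "Ġhello"

    # mapping each character back through byte_decoder recovers the raw
    # utf-8 bytes, which is what the conversion loop writes to the ggml file
    raw = bytearray(byte_decoder[c] for c in key)
    assert raw.decode("utf-8") == " hello"

The explicit `c == " "` branch in the export loop exists because byte_decoder has no entry for a literal space (space maps to "Ġ", not to itself), and keys coming from tokenizer.get_added_vocab() can contain literal spaces.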