special_tokens.push_back(token);
}
+// Append the UTF-8 encoding of the code point ch to out.
+static void append_utf8(char32_t ch, std::string & out) {
+    if (ch <= 0x7F) {
+        out.push_back(static_cast<unsigned char>(ch));
+    } else if (ch <= 0x7FF) {
+        out.push_back(static_cast<unsigned char>(0xC0 | ((ch >> 6) & 0x1F)));
+        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
+    } else if (ch <= 0xFFFF) {
+        out.push_back(static_cast<unsigned char>(0xE0 | ((ch >> 12) & 0x0F)));
+        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
+        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
+    } else if (ch <= 0x10FFFF) {
+        out.push_back(static_cast<unsigned char>(0xF0 | ((ch >> 18) & 0x07)));
+        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 12) & 0x3F)));
+        out.push_back(static_cast<unsigned char>(0x80 | ((ch >> 6) & 0x3F)));
+        out.push_back(static_cast<unsigned char>(0x80 | (ch & 0x3F)));
+    } else {
+        fprintf(stderr, "Invalid Unicode code point\n");
+    }
+}
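+// Example (illustration only): append_utf8(0x20AC, s) appends the three bytes
+// 0xE2 0x82 0xAC, i.e. the UTF-8 encoding of U+20AC ('€').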
+
std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
std::vector<std::string> words;
+    // Re-encode the input: each raw byte is treated as a Unicode code point
+    // (ISO-8859-1) and appended to utf8conv in UTF-8
+    std::string utf8conv;
+    for (size_t w = 0; w < text.size(); w++) {
+        append_utf8(uint8_t(text[w]), utf8conv);
+    }
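+    // For example, the raw input byte 0xE2 is appended as the two bytes 0xC3 0xA2;
+    // convert-h5-to-ggml.py applies the same byte-to-code-point mapping when it
+    // writes the vocab, so the strings match at lookup time.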
+
// first split the text into words
{
- std::string str = text;
+ std::string str = utf8conv;
std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
// Generate the subpattern from the special_tokens vector if it's not empty
tokens.push_back(it->second);
i = j;
j = n;
- break;
+ continue;
}
--j;
}
from transformers import AutoModelForCausalLM, AutoTokenizer
import sentencepiece.sentencepiece_model_pb2 as model
+# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+def bytes_to_unicode():
+    """
+    Returns a dict mapping utf-8 bytes to unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings,
+    while avoiding whitespace/control characters that the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+
+    cs = [chr(n) for n in cs]
+
+    return dict(zip(bs, cs))
+
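+# For example, bytes_to_unicode()[ord(" ")] == "Ġ" (U+0120): printable bytes map to
+# the same code point, while whitespace/control bytes are remapped above 0xFF.
+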
if len(sys.argv) < 3:
print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
print(" ftype == 0 -> float32")
fout.write(struct.pack("f", hparams["attn_config"]["clip_qkv"] or 0.0))
fout.write(struct.pack("i", ftype))
+vocab_size = hparams["vocab_size"]
+
+encoder = tokenizer.vocab
+# Add added_tokens (special tokens) to the encoder
+encoder.update(tokenizer.get_added_vocab())
+
+byte_encoder = bytes_to_unicode()
+byte_decoder = {v:k for k, v in byte_encoder.items()}
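+# byte_decoder inverts that mapping, e.g. byte_decoder["Ġ"] == 0x20, so a vocab
+# entry like "Ġhello" decodes back to the raw bytes b" hello"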
+
+counter = 0
+# write the tokens in order of their ids (sort the keys by value)
+for key in sorted(encoder, key=encoder.get):
+    # map each vocab character back to its raw byte value; plain whitespace is not
+    # a key in byte_decoder (workaround for a KeyError), so pass it through as-is
+    text = ""
+    for c in key:
+        if c == " ":
+            text += " "
+        else:
+            text += chr(byte_decoder[c])
+    text = bytearray(text, encoding="utf-8")
+    fout.write(struct.pack("i", len(text)))
+    fout.write(text)
+    counter += 1
-# TODO: temporary hack to not deal with implementing the tokenizer
-dot_token = tokenizer.encode(".")[0]
-for i in range(hparams["vocab_size"]):
-    text = tokenizer.decode([dot_token, i]).encode("utf-8")
-    # remove the first byte (it's always '.')
-    text = text[1:]
+# Repeat the last token until vocab_size entries have been written
+# (hparams["vocab_size"] can be larger than the tokenizer's vocabulary)
+while counter < vocab_size:
    fout.write(struct.pack("i", len(text)))
    fout.write(text)
+    counter += 1
+
+# assert counter == vocab_size
for name in list_vars.keys():
data = list_vars[name].squeeze().numpy()