From: klosax
Date: Sun, 21 May 2023 08:21:51 +0000 (+0200)
Subject: common : support utf-8 + fix gpt_tokenize + fix MPT model import (#179)
X-Git-Tag: upstream/0.0.1642~1450
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=36a005c930cf8fb1f68f0421c5a848a0c8825f5a;p=pkg%2Fggml%2Fsources%2Fggml

common : support utf-8 + fix gpt_tokenize + fix MPT model import (#179)

* Update convert-h5-to-ggml.py

* Import tokens correctly

* gpt_tokenize: Convert input to utf-8 + bug fix

* common : minor style fixes

---------

Co-authored-by: Georgi Gerganov
---

diff --git a/examples/common.cpp b/examples/common.cpp
index 76da30d9..eaaaa606 100644
--- a/examples/common.cpp
+++ b/examples/common.cpp
@@ -212,12 +212,38 @@ void gpt_vocab::add_special_token(const std::string & token) {
     special_tokens.push_back(token);
 }
 
+static void append_utf8(char32_t ch, std::string & out) {
+    if (ch <= 0x7F) {
+        out.push_back(static_cast<char>(ch));
+    } else if (ch <= 0x7FF) {
+        out.push_back(static_cast<char>(0xC0 | ((ch >> 6) & 0x1F)));
+        out.push_back(static_cast<char>(0x80 | (ch & 0x3F)));
+    } else if (ch <= 0xFFFF) {
+        out.push_back(static_cast<char>(0xE0 | ((ch >> 12) & 0x0F)));
+        out.push_back(static_cast<char>(0x80 | ((ch >> 6) & 0x3F)));
+        out.push_back(static_cast<char>(0x80 | (ch & 0x3F)));
+    } else if (ch <= 0x10FFFF) {
+        out.push_back(static_cast<char>(0xF0 | ((ch >> 18) & 0x07)));
+        out.push_back(static_cast<char>(0x80 | ((ch >> 12) & 0x3F)));
+        out.push_back(static_cast<char>(0x80 | ((ch >> 6) & 0x3F)));
+        out.push_back(static_cast<char>(0x80 | (ch & 0x3F)));
+    } else {
+        printf("Invalid Unicode code point\n");
+    }
+}
+
 std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::string & text) {
     std::vector<std::string> words;
 
+    // Convert input to utf-8
+    std::string utf8conv;
+    for (int w = 0; w < text.size(); w++) {
+        append_utf8( uint8_t(text[w]), utf8conv);
+    }
+
     // first split the text into words
     {
-        std::string str = text;
+        std::string str = utf8conv;
 
         std::string pat = R"('s|'t|'re|'ve|'m|'ll|'d| ?[[:alpha:]]+| ?[[:digit:]]+| ?[^\s[:alpha:][:digit:]]+|\s+(?!\S)|\s+)";
 
         // Generate the subpattern from the special_tokens vector if it's not empty
@@ -260,7 +286,7 @@ std::vector<gpt_vocab::id> gpt_tokenize(const gpt_vocab & vocab, const std::stri
                     tokens.push_back(it->second);
                     i = j;
                     j = n;
-                    break;
+                    continue;
                 }
                 --j;
             }
diff --git a/examples/mpt/convert-h5-to-ggml.py b/examples/mpt/convert-h5-to-ggml.py
index 9bff9d3f..b61ec874 100644
--- a/examples/mpt/convert-h5-to-ggml.py
+++ b/examples/mpt/convert-h5-to-ggml.py
@@ -5,6 +5,31 @@ import numpy as np
 from transformers import AutoModelForCausalLM, AutoTokenizer
 import sentencepiece.sentencepiece_model_pb2 as model
 
+# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+def bytes_to_unicode():
+
+    """
+    Returns a list of utf-8 bytes and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    And avoids mapping to whitespace/control characters the bpe code barfs on.
+    """
+    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+    cs = bs[:]
+    n = 0
+    for b in range(2**8):
+        if b not in bs:
+            bs.append(b)
+            cs.append(2**8+n)
+            n += 1
+
+    cs = [chr(n) for n in cs]
+
+    return dict(zip(bs, cs))
+
 if len(sys.argv) < 3:
     print("Usage: convert-h5-to-ggml.py dir-model [use-f32]\n")
     print("  ftype == 0 -> float32")
@@ -62,15 +87,37 @@ fout.write(struct.pack("f", hparams["attn_config"]["alibi_bias_max"]))
 fout.write(struct.pack("f", hparams["attn_config"]["clip_qkv"] or 0.0))
 fout.write(struct.pack("i", ftype))
 
+vocab_size = hparams["vocab_size"]
+
+encoder = tokenizer.vocab
+# Add added_tokens (special tokens) to the encoder
+encoder.update(tokenizer.get_added_vocab())
+
+byte_encoder = bytes_to_unicode()
+byte_decoder = {v:k for k, v in byte_encoder.items()}
+
+counter = 0
+# sort by value
+for key in sorted(encoder, key=encoder.get):
+    # workaround for key error when c = whitespace
+    text=""
+    for c in key:
+        if c == " ":
+            text += " "
+        else:
+            text += chr(byte_decoder[c] )
+    text = bytearray( text, encoding="utf-8" )
+    fout.write(struct.pack("i", len(text)))
+    fout.write(text)
+    counter += 1
 
-# TODO: temporary hack to not deal with implementing the tokenizer
-dot_token = tokenizer.encode(".")[0]
-for i in range(hparams["vocab_size"]):
-    text = tokenizer.decode([dot_token, i]).encode("utf-8")
-    # remove the first byte (it's always '.')
-    text = text[1:]
+# Repeat last token until vocab_size
+while counter < vocab_size:
     fout.write(struct.pack("i", len(text)))
     fout.write(text)
+    counter += 1
+
+# assert counter == config.vocab_size
 
 for name in list_vars.keys():
     data = list_vars[name].squeeze().numpy()
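
Note: the new vocab export loop relies on bytes_to_unicode() being exactly invertible. Below is a minimal sanity check, not part of the patch, assuming the bytes_to_unicode() definition above is available in the same Python session; the "Ġhello" key and " hello" string are illustrative only.

    byte_encoder = bytes_to_unicode()
    byte_decoder = {v: k for k, v in byte_encoder.items()}

    # byte 0x20 (space) is remapped to U+0120 ("Ġ"), so the GPT-2 style
    # vocab key for " hello" is the string "Ġhello"
    key = "".join(byte_encoder[b] for b in " hello".encode("utf-8"))
    assert key == "Ġhello"

    # mapping each character back through byte_decoder recovers the raw
    # utf-8 bytes, which is what the conversion loop writes to the ggml file
    raw = bytearray(byte_decoder[c] for c in key)
    assert raw.decode("utf-8") == " hello"

The explicit `c == " "` branch in the export loop exists because byte_decoder has no entry for a literal space (space maps to "Ġ", not to itself), and keys coming from tokenizer.get_added_vocab() can contain literal spaces.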