Special tokens or other new tokens could be added to the tokenizer, so it's probably best not to assume the vocabulary is exactly 32000 tokens.
fout.write(struct.pack("i", ftype))
# Is this correct??
- for i in range(32000):
+ for i in range(tokenizer.vocab_size()):
    if tokenizer.is_unknown(i):
        # "<unk>" token (translated as ??)
        text = " \u2047 ".encode("utf-8")
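As a quick sanity check, you can query the size straight from the tokenizer model instead of hard-coding it. A minimal sketch, assuming the tokenizer is loaded with the sentencepiece Python bindings (which is what the `vocab_size()` / `is_unknown()` calls suggest) and that the model file is named `tokenizer.model` (hypothetical path):

```python
# Minimal sketch, not the converter itself: read the vocabulary size from the
# tokenizer model rather than hard-coding 32000. Assumes the standard
# sentencepiece Python bindings and a local "tokenizer.model" path.
import sentencepiece as spm

tokenizer = spm.SentencePieceProcessor()
tokenizer.load("tokenizer.model")

# vocab_size() already reflects any special/user-defined tokens baked into the
# model, so a loop bounded by it stays correct if the vocabulary grows.
print("vocab size:", tokenizer.vocab_size())

for i in range(tokenizer.vocab_size()):
    if tokenizer.is_unknown(i):
        print(i, "is the <unk> token")
```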