* convert: avoid token collision when stripping ## prefix
* convert: use token types for BERT special tokens check
* Update convert_hf_to_gguf.py
Co-authored-by: Sigbjørn Skjæret <redacted>
---------
Co-authored-by: Sigbjørn Skjæret <redacted>
self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
# convert to phantom space vocab
- def phantom(tok):
- if tok.startswith("[") and tok.endswith("]"):
+ def phantom(tok, toktype):
+ if toktype == gguf.TokenType.CONTROL:
return tok
if tok.startswith("##"):
return tok[2:]
return "\u2581" + tok
- tokens = list(map(phantom, tokens))
+ assert len(tokens) == len(toktypes)
+ tokens = list(map(phantom, tokens, toktypes))
# add vocab to gguf
self.gguf_writer.add_tokenizer_model("bert")