* fix(gguf-py): special tokens are no longer skipped when add_<token>_token is set to false
* fix(gguf-py): added missing cls and mask token ids to the gguf metadata
UNK_ID = "tokenizer.ggml.unknown_token_id"
SEP_ID = "tokenizer.ggml.seperator_token_id"
PAD_ID = "tokenizer.ggml.padding_token_id"
+ CLS_ID = "tokenizer.ggml.cls_token_id"
+ MASK_ID = "tokenizer.ggml.mask_token_id"
ADD_BOS = "tokenizer.ggml.add_bos_token"
ADD_EOS = "tokenizer.ggml.add_eos_token"
ADD_PREFIX = "tokenizer.ggml.add_space_prefix"
KEY_TOKENIZER_UNK_ID = Keys.Tokenizer.UNK_ID
KEY_TOKENIZER_SEP_ID = Keys.Tokenizer.SEP_ID
KEY_TOKENIZER_PAD_ID = Keys.Tokenizer.PAD_ID
+KEY_TOKENIZER_CLS_ID = Keys.Tokenizer.CLS_ID
+KEY_TOKENIZER_MASK_ID = Keys.Tokenizer.MASK_ID
KEY_TOKENIZER_HF_JSON = Keys.Tokenizer.HF_JSON
KEY_TOKENIZER_RWKV = Keys.Tokenizer.RWKV
def add_pad_token_id(self, id: int) -> None:
self.add_uint32(Keys.Tokenizer.PAD_ID, id)
+ def add_cls_token_id(self, id: int) -> None:
+ self.add_uint32(Keys.Tokenizer.CLS_ID, id)
+
+ def add_mask_token_id(self, id: int) -> None:
+ self.add_uint32(Keys.Tokenizer.MASK_ID, id)
+
def add_add_bos_token(self, value: bool) -> None:
self.add_bool(Keys.Tokenizer.ADD_BOS, value)
if special_token_types is not None:
self.special_token_types = special_token_types
else:
- self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad')
+ self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad', 'cls', 'mask')
self._load(Path(path))
def __repr__(self) -> str:
add_entry = tokenizer_config.get(f'add_{typ}_token')
if isinstance(add_entry, bool):
self.add_special_token[typ] = add_entry
- if not added_tokens:
- # We will need this to get the content for the token, so if it's empty
- # may as well just give up.
- continue
entry = tokenizer_config.get(f'{typ}_token')
if isinstance(entry, str):
tc_content = entry