fix(gguf-py): special tokens are no longer skipped when add_<token>_token is set...

author Michaël de Vries <redacted>

Thu, 15 Feb 2024 13:14:37 +0000 (14:14 +0100)

committer GitHub <redacted>

Thu, 15 Feb 2024 13:14:37 +0000 (14:14 +0100)
author Michaël de Vries <redacted>
Thu, 15 Feb 2024 13:14:37 +0000 (14:14 +0100)
committer GitHub <redacted>
Thu, 15 Feb 2024 13:14:37 +0000 (14:14 +0100)
diff --git a/gguf-py/gguf/constants.py b/gguf-py/gguf/constants.py

index 5fba0171439bbead707a8ff30d625953b2f696bb..9986ce9deb44b2efcde39c3974927a1466cce688 100644 (file)
--- a/gguf-py/gguf/constants.py
+++ b/gguf-py/gguf/constants.py
@@ -73,6 +73,8 @@ class Keys:
          UNK_ID           = "tokenizer.ggml.unknown_token_id"
          SEP_ID           = "tokenizer.ggml.seperator_token_id"
          PAD_ID           = "tokenizer.ggml.padding_token_id"
+        CLS_ID           = "tokenizer.ggml.cls_token_id"
+        MASK_ID          = "tokenizer.ggml.mask_token_id"
          ADD_BOS          = "tokenizer.ggml.add_bos_token"
          ADD_EOS          = "tokenizer.ggml.add_eos_token"
          ADD_PREFIX       = "tokenizer.ggml.add_space_prefix"
@@ -685,5 +687,7 @@ KEY_TOKENIZER_EOS_ID     = Keys.Tokenizer.EOS_ID
  KEY_TOKENIZER_UNK_ID     = Keys.Tokenizer.UNK_ID
  KEY_TOKENIZER_SEP_ID     = Keys.Tokenizer.SEP_ID
  KEY_TOKENIZER_PAD_ID     = Keys.Tokenizer.PAD_ID
+KEY_TOKENIZER_CLS_ID     = Keys.Tokenizer.CLS_ID
+KEY_TOKENIZER_MASK_ID    = Keys.Tokenizer.MASK_ID
  KEY_TOKENIZER_HF_JSON    = Keys.Tokenizer.HF_JSON
  KEY_TOKENIZER_RWKV       = Keys.Tokenizer.RWKV
diff --git a/gguf-py/gguf/gguf_writer.py b/gguf-py/gguf/gguf_writer.py

index d87bd8e88696c973e792c5ac90278746b6d3a08f..26724bf9438f82468b86f98af5fe6c2230decc04 100644 (file)
--- a/gguf-py/gguf/gguf_writer.py
+++ b/gguf-py/gguf/gguf_writer.py
@@ -414,6 +414,12 @@ class GGUFWriter:
      def add_pad_token_id(self, id: int) -> None:
          self.add_uint32(Keys.Tokenizer.PAD_ID, id)
  
+    def add_cls_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.CLS_ID, id)
+
+    def add_mask_token_id(self, id: int) -> None:
+        self.add_uint32(Keys.Tokenizer.MASK_ID, id)
+
      def add_add_bos_token(self, value: bool) -> None:
          self.add_bool(Keys.Tokenizer.ADD_BOS, value)
  
diff --git a/gguf-py/gguf/vocab.py b/gguf-py/gguf/vocab.py

index cd19429754c618ed96682a1404aecd7159d19f0e..a23136b1886641f38b8499b010b5ba6c508bae94 100644 (file)
--- a/gguf-py/gguf/vocab.py
+++ b/gguf-py/gguf/vocab.py
@@ -29,7 +29,7 @@ class SpecialVocab:
          if special_token_types is not None:
              self.special_token_types = special_token_types
          else:
-            self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad')
+            self.special_token_types = ('bos', 'eos', 'unk', 'sep', 'pad', 'cls', 'mask')
          self._load(Path(path))
  
      def __repr__(self) -> str:
@@ -152,10 +152,6 @@ class SpecialVocab:
              add_entry = tokenizer_config.get(f'add_{typ}_token')
              if isinstance(add_entry, bool):
                  self.add_special_token[typ] = add_entry
-            if not added_tokens:
-                # We will need this to get the content for the token, so if it's empty
-                # may as well just give up.
-                continue
              entry = tokenizer_config.get(f'{typ}_token')
              if isinstance(entry, str):
                  tc_content = entry
author	Michaël de Vries <redacted>
	Thu, 15 Feb 2024 13:14:37 +0000 (14:14 +0100)
committer	GitHub <redacted>
	Thu, 15 Feb 2024 13:14:37 +0000 (14:14 +0100)
gguf-py/gguf/constants.py		patch | blob | history
gguf-py/gguf/gguf_writer.py		patch | blob | history
gguf-py/gguf/vocab.py		patch | blob | history