git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
convert : fix encoding of WPM vocab for BERT models (#18500)
author: o7si <redacted>
Thu, 1 Jan 2026 17:27:07 +0000 (01:27 +0800)
committer: GitHub <redacted>
Thu, 1 Jan 2026 17:27:07 +0000 (18:27 +0100)
* convert: avoid token collision when stripping ## prefix

* convert: use token types for BERT special tokens check

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <redacted>
---------

Co-authored-by: Sigbjørn Skjæret <redacted>
convert_hf_to_gguf.py

index a1080b15f03efef9dc9434b30c46e87b1c222fa1..2c961b8f5961ca6537df9f789527c905f8e5d1ea 100755 (executable)
@@ -5287,13 +5287,14 @@ class BertModel(TextModel):
         self.gguf_writer.add_token_type_count(self.hparams.get("type_vocab_size", 1))
 
         # convert to phantom space vocab
-        def phantom(tok):
-            if tok.startswith("[") and tok.endswith("]"):
+        def phantom(tok, toktype):
+            if toktype == gguf.TokenType.CONTROL:
                 return tok
             if tok.startswith("##"):
                 return tok[2:]
             return "\u2581" + tok
-        tokens = list(map(phantom, tokens))
+        assert len(tokens) == len(toktypes)
+        tokens = list(map(phantom, tokens, toktypes))
 
         # add vocab to gguf
         self.gguf_writer.add_tokenizer_model("bert")