Improve handling of special tokens in GGML to GGUF converter (#2725)

author Kerfuffle <redacted>

Tue, 22 Aug 2023 23:39:39 +0000 (17:39 -0600)

committer GitHub <redacted>

Tue, 22 Aug 2023 23:39:39 +0000 (17:39 -0600)
author Kerfuffle <redacted>
Tue, 22 Aug 2023 23:39:39 +0000 (17:39 -0600)
committer GitHub <redacted>
Tue, 22 Aug 2023 23:39:39 +0000 (17:39 -0600)
diff --git a/convert-llama-ggmlv3-to-gguf.py b/convert-llama-ggmlv3-to-gguf.py

index fa4a044ca383e159783869190dd34da3b9a1bab7..5b038fc0a3baf320d7bd4c5ba3832fae701d7219 100644 (file)
--- a/convert-llama-ggmlv3-to-gguf.py
+++ b/convert-llama-ggmlv3-to-gguf.py
@@ -1,10 +1,12 @@
-import sys, struct, math, argparse
+import sys, struct, math, argparse, warnings
  from pathlib import Path
  
  import numpy as np
  
  import gguf
  
+warnings.filterwarnings('error')
+
  # Note: Does not support GGML_QKK_64
  QK_K = 256
  # Items here are (block size, type size)
@@ -215,15 +217,10 @@ class GGMLToGGUF:
          if self.vocab_override is not None:
              vo = self.vocab_override
              print('* Adding vocab item(s)')
-            for (idx, vitem) in enumerate(vo.all_tokens()):
-                if len(vitem) == 3:
-                    tokens.append(vitem[0])
-                    scores.append(vitem[1])
-                    toktypes.append(vitem[2])
-                else:
-                    # Maybe try to guess the token type here?
-                    tokens.append(vitem[0])
-                    scores.append(vitem[1])
+            for (idx, (vbytes, score, ttype)) in enumerate(vo.all_tokens()):
+                tokens.append(vbytes)
+                scores.append(score)
+                toktypes.append(ttype)
              assert len(tokens) == hp.n_vocab, f'Override vocab has a different number of items than hyperparameters - override = {len(tokens)} but n_vocab={hp.n_vocab}'
              gguf_writer.add_token_list(tokens)
              gguf_writer.add_token_scores(scores)
@@ -231,9 +228,21 @@ class GGMLToGGUF:
                  gguf_writer.add_token_types(toktypes)
              return
          print(f'* Adding {hp.n_vocab} vocab item(s)')
+        assert len(self.model.vocab.items) >= 3, 'Cannot handle unexpectedly short model vocab'
          for (tokid, (vbytes, vscore)) in enumerate(self.model.vocab.items):
              tt = 1 # Normal
-            if len(vbytes) == 0:
+            # Special handling for UNK, BOS, EOS tokens.
+            if tokid <= 2:
+                if tokid == 0:
+                    vbytes = b'<unk>'
+                    tt = 2
+                elif tokid == 1:
+                    vbytes = b'<s>'
+                    tt = 3
+                else:
+                    vbytes = b'</s>'
+                    tt = 3
+            elif len(vbytes) == 0:
                  tt = 3 # Control
              elif tokid >= 3 and tokid <= 258 and len(vbytes) == 1:
                  vbytes = bytes(f'<0x{vbytes[0]:02X}>', encoding = 'UTF-8')
@@ -246,6 +255,9 @@ class GGMLToGGUF:
          gguf_writer.add_token_list(tokens)
          gguf_writer.add_token_scores(scores)
          gguf_writer.add_token_types(toktypes)
+        gguf_writer.add_unk_token_id(0)
+        gguf_writer.add_bos_token_id(1)
+        gguf_writer.add_eos_token_id(2)
  
      def add_tensors(self, gguf_writer):
          nm = self.name_map
@@ -315,7 +327,11 @@ def main():
      data = np.memmap(cfg.input, mode = 'r')
      model = GGMLV3Model()
      print('* Scanning GGML input file')
-    offset = model.load(data, 0)
+    try:
+        offset = model.load(data, 0)
+    except OverflowError:
+        print(f'!!! Caught overflow loading tensors. The most likely issue is running on Windows but not in WSL. Try running in WSL if possible.', file = sys.stderr)
+        raise
      print(f'* GGML model hyperparameters: {model.hyperparameters}')
      vocab_override = None
      params_override = None
@@ -330,4 +346,5 @@ def main():
      converter.save()
      print(f'* Successful completion. Output saved to: {cfg.output}')
  
-main()
+if __name__ == '__main__':
+    main()
diff --git a/llama.cpp b/llama.cpp

index 6c5da130926fcbbb8d2a97418d84d55f99ca78ca..fd8eaa1800bde19e74a3c45149b756b8f320ee1f 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -703,7 +703,7 @@ struct llama_vocab {
      // default LLaMA special tokens
      id special_bos_id = 1;
      id special_eos_id = 2;
-    id special_unk_id = -1;
+    id special_unk_id = 0;
      id special_sep_id = -1;
      id special_pad_id = -1;
author	Kerfuffle <redacted>
	Tue, 22 Aug 2023 23:39:39 +0000 (17:39 -0600)
committer	GitHub <redacted>
	Tue, 22 Aug 2023 23:39:39 +0000 (17:39 -0600)
convert-llama-ggmlv3-to-gguf.py		patch | blob | history
llama.cpp		patch | blob | history