convert : fix vocab size when not defined in hparams (#3421)

author cebtenzzre <redacted>

Mon, 2 Oct 2023 22:07:24 +0000 (18:07 -0400)

committer GitHub <redacted>

Mon, 2 Oct 2023 22:07:24 +0000 (18:07 -0400)
author cebtenzzre <redacted>
Mon, 2 Oct 2023 22:07:24 +0000 (18:07 -0400)
committer GitHub <redacted>
Mon, 2 Oct 2023 22:07:24 +0000 (18:07 -0400)
diff --git a/convert-falcon-hf-to-gguf.py b/convert-falcon-hf-to-gguf.py

index 958358563ccdcfca15691c38c4e04aa1db45c6b8..3a9300c3771df6a64957aac962383460ec6a8b2f 100755 (executable)
--- a/convert-falcon-hf-to-gguf.py
+++ b/convert-falcon-hf-to-gguf.py
@@ -134,26 +134,19 @@ print("gguf: get tokenizer metadata")
  
  tokens: list[bytearray] = []
  
-tokenizer_json_file = dir_model / 'tokenizer.json'
-if not tokenizer_json_file.is_file():
-    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
-    sys.exit(1)
-
  # gpt2 tokenizer
  gguf_writer.add_tokenizer_model("gpt2")
  
-with open(tokenizer_json_file, "r", encoding="utf-8") as f:
-    tokenizer_json = json.load(f)
-
  print("gguf: get gpt2 tokenizer vocab")
  
-# The number of tokens in tokenizer.json can differ from the expected vocab size.
-# This causes downstream issues with mismatched tensor sizes when running the inference
-vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
-
  # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
  tokenizer = AutoTokenizer.from_pretrained(dir_model)
  
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
+
  reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
  byte_encoder = bytes_to_unicode()
  byte_decoder = {v: k for k, v in byte_encoder.items()}
diff --git a/convert-gptneox-hf-to-gguf.py b/convert-gptneox-hf-to-gguf.py

index 782410e44f2d1d92d832c699d749f830e0334434..60679a2f46dda3ec1a40746549335097d1f2e048 100755 (executable)
--- a/convert-gptneox-hf-to-gguf.py
+++ b/convert-gptneox-hf-to-gguf.py
@@ -131,24 +131,19 @@ print("gguf: get tokenizer metadata")
  
  tokens: list[bytearray] = []
  
-tokenizer_json_file = dir_model / 'tokenizer.json'
-if not tokenizer_json_file.is_file():
-    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
-    sys.exit(1)
-
  # gpt2 tokenizer
  gguf_writer.add_tokenizer_model("gpt2")
  
-with open(tokenizer_json_file, "r", encoding="utf-8") as f:
-    tokenizer_json = json.load(f)
-
  print("gguf: get gpt2 tokenizer vocab")
  
-vocab_size = len(tokenizer_json["model"]["vocab"])
-
  # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
  tokenizer = AutoTokenizer.from_pretrained(dir_model)
  
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
+
  reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
  byte_encoder = bytes_to_unicode()
  byte_decoder = {v: k for k, v in byte_encoder.items()}
diff --git a/convert-starcoder-hf-to-gguf.py b/convert-starcoder-hf-to-gguf.py

index 48e88a777fea1db20a1f42633203d326578de16e..f469beb81a9a3a1d19b9d0845499c75794ff2fc6 100755 (executable)
--- a/convert-starcoder-hf-to-gguf.py
+++ b/convert-starcoder-hf-to-gguf.py
@@ -118,26 +118,19 @@ print("gguf: get tokenizer metadata")
  
  tokens: list[bytearray] = []
  
-tokenizer_json_file = dir_model / 'tokenizer.json'
-if not tokenizer_json_file.is_file():
-    print(f'Error: Missing {tokenizer_json_file}', file = sys.stderr)
-    sys.exit(1)
-
  # gpt2 tokenizer
  gguf_writer.add_tokenizer_model("gpt2")
  
-with open(tokenizer_json_file, "r", encoding="utf-8") as f:
-    tokenizer_json = json.load(f)
-
  print("gguf: get gpt2 tokenizer vocab")
  
-# The number of tokens in tokenizer.json can differ from the expected vocab size.
-# This causes downstream issues with mismatched tensor sizes when running the inference
-vocab_size = hparams["vocab_size"] if "vocab_size" in hparams else len(tokenizer_json["model"]["vocab"])
-
  # ref: https://github.com/cmp-nct/ggllm.cpp/blob/master/falcon_convert.py
  tokenizer = AutoTokenizer.from_pretrained(dir_model)
  
+# The number of tokens in tokenizer.json can differ from the expected vocab size.
+# This causes downstream issues with mismatched tensor sizes when running the inference
+vocab_size = hparams.get("vocab_size", len(tokenizer.vocab))
+assert max(tokenizer.vocab.values()) < vocab_size
+
  reverse_vocab = {id: encoded_tok for encoded_tok, id in tokenizer.vocab.items()}
  byte_encoder = bytes_to_unicode()
  byte_decoder = {v: k for k, v in byte_encoder.items()}
author	cebtenzzre <redacted>
	Mon, 2 Oct 2023 22:07:24 +0000 (18:07 -0400)
committer	GitHub <redacted>
	Mon, 2 Oct 2023 22:07:24 +0000 (18:07 -0400)
convert-falcon-hf-to-gguf.py		patch | blob | history
convert-gptneox-hf-to-gguf.py		patch | blob | history
convert-starcoder-hf-to-gguf.py		patch | blob | history