git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
convert : fix Baichuan2 models by using vocab size in config.json (#3299)
author: Kerfuffle <redacted>
Wed, 4 Oct 2023 14:20:28 +0000 (08:20 -0600)
committer: GitHub <redacted>
Wed, 4 Oct 2023 14:20:28 +0000 (17:20 +0300)
Use local GGUF package when possible in Baichuan converter

convert-baichuan-hf-to-gguf.py

index 8bd34dc440769b3ec5cf837402bd5d3d0d229a8c..513a7516a25f08b8830e3124608ee1748a3bd283 100755 (executable)
@@ -11,11 +11,14 @@ import sys
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 import itertools
-import gguf
 import numpy as np
 import torch
 from sentencepiece import SentencePieceProcessor  # type: ignore[import]
 
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
 
 if TYPE_CHECKING:
     from typing import TypeAlias
@@ -174,8 +177,11 @@ if not tokenizer_model_file.is_file():
 print("gguf: get sentencepiece tokenizer vocab, scores and token types")
 
 tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
+vocab_size = hparams.get('vocab_size')
+if vocab_size is None:
+    vocab_size = tokenizer.vocab_size()
 
-for i in range(tokenizer.vocab_size()):
+for i in range(vocab_size):
     text: bytes
     score: float