From: Kerfuffle <redacted>
Date: Wed, 4 Oct 2023 14:20:28 +0000 (-0600)
Subject: convert : fix Baichuan2 models by using vocab size in config.json (#3299)
X-Git-Tag: upstream/0.0.4488~3164
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=019ba1dcd0c7775a5ac0f7442634a330eb0173cc;p=pkg%2Fggml%2Fsources%2Fllama.cpp

convert : fix Baichuan2 models by using vocab size in config.json (#3299)

Use local GGUF package when possible in Baichuan converter
---

diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py
index 8bd34dc4..513a7516 100755
--- a/convert-baichuan-hf-to-gguf.py
+++ b/convert-baichuan-hf-to-gguf.py
@@ -11,11 +11,14 @@ import sys
 from pathlib import Path
 from typing import TYPE_CHECKING, Any
 import itertools
-import gguf
 import numpy as np
 import torch
 from sentencepiece import SentencePieceProcessor  # type: ignore[import]
 
+if 'NO_LOCAL_GGUF' not in os.environ:
+    sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf'))
+import gguf
+
 
 if TYPE_CHECKING:
     from typing import TypeAlias
@@ -174,8 +177,11 @@ if not tokenizer_model_file.is_file():
 print("gguf: get sentencepiece tokenizer vocab, scores and token types")
 
 tokenizer = SentencePieceProcessor(str(tokenizer_model_file))
+vocab_size = hparams.get('vocab_size')
+if vocab_size is None:
+    vocab_size = tokenizer.vocab_size()
 
-for i in range(tokenizer.vocab_size()):
+for i in range(vocab_size):
     text: bytes
     score: float