From: Kerfuffle Date: Wed, 4 Oct 2023 14:20:28 +0000 (-0600) Subject: convert : fix Baichuan2 models by using vocab size in config.json (#3299) X-Git-Tag: upstream/0.0.4488~3164 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=019ba1dcd0c7775a5ac0f7442634a330eb0173cc;p=pkg%2Fggml%2Fsources%2Fllama.cpp convert : fix Baichuan2 models by using vocab size in config.json (#3299) Use local GGUF package when possible in Baichuan converter --- diff --git a/convert-baichuan-hf-to-gguf.py b/convert-baichuan-hf-to-gguf.py index 8bd34dc4..513a7516 100755 --- a/convert-baichuan-hf-to-gguf.py +++ b/convert-baichuan-hf-to-gguf.py @@ -11,11 +11,14 @@ import sys from pathlib import Path from typing import TYPE_CHECKING, Any import itertools -import gguf import numpy as np import torch from sentencepiece import SentencePieceProcessor # type: ignore[import] +if 'NO_LOCAL_GGUF' not in os.environ: + sys.path.insert(1, str(Path(__file__).parent / 'gguf-py' / 'gguf')) +import gguf + if TYPE_CHECKING: from typing import TypeAlias @@ -174,8 +177,11 @@ if not tokenizer_model_file.is_file(): print("gguf: get sentencepiece tokenizer vocab, scores and token types") tokenizer = SentencePieceProcessor(str(tokenizer_model_file)) +vocab_size = hparams.get('vocab_size') +if vocab_size is None: + vocab_size = tokenizer.vocab_size() -for i in range(tokenizer.vocab_size()): +for i in range(vocab_size): text: bytes score: float