From: Sang-Kil Park
Date: Mon, 29 Jan 2024 09:24:19 +0000 (+0900)
Subject: py : improve BPE tokenizer support (#5189)
X-Git-Tag: upstream/0.0.4488~2485
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=e76627bcce9f77adb6034cb127b7ec93d4287b69;p=pkg%2Fggml%2Fsources%2Fllama.cpp

py : improve BPE tokenizer support (#5189)
---

diff --git a/convert.py b/convert.py
index 06768033..b48afba1 100755
--- a/convert.py
+++ b/convert.py
@@ -334,7 +334,10 @@ class Params:
 class BpeVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
-        self.vocab = self.bpe_tokenizer["model"]["vocab"]
+        try:
+            self.vocab = self.bpe_tokenizer["model"]["vocab"]
+        except:
+            self.vocab = self.bpe_tokenizer
         added_tokens: dict[str, int]
         if fname_added_tokens is not None:
             # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.