git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
py : improve BPE tokenizer support (#5189)
author: Sang-Kil Park <redacted>
Mon, 29 Jan 2024 09:24:19 +0000 (18:24 +0900)
committer: GitHub <redacted>
Mon, 29 Jan 2024 09:24:19 +0000 (11:24 +0200)
convert.py

index 06768033da174bebeb6ade5cb42d0a32402a29c9..b48afba1e15d903b71cd0bdf20ad9d1b63983155 100755 (executable)
@@ -334,7 +334,10 @@ class Params:
 class BpeVocab:
     def __init__(self, fname_tokenizer: Path, fname_added_tokens: Path | None) -> None:
         self.bpe_tokenizer = json.loads(open(str(fname_tokenizer), encoding="utf-8").read())
-        self.vocab = self.bpe_tokenizer["model"]["vocab"]
+        try:
+            self.vocab = self.bpe_tokenizer["model"]["vocab"]
+        except:
+            self.vocab = self.bpe_tokenizer
         added_tokens: dict[str, int]
         if fname_added_tokens is not None:
             # FIXME: Verify that added tokens here _cannot_ overlap with the main vocab.