From: Georgi Gerganov
Date: Wed, 16 Nov 2022 17:21:43 +0000 (+0200)
Subject: models : simplify the conversion script
X-Git-Tag: upstream/1.7.4~1830
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=e70e5c8b53faec6abbf0170f1e9a195f4dfccaab;p=pkg%2Fggml%2Fsources%2Fwhisper.cpp

models : simplify the conversion script

"transformers" dependency is not actually needed
---

diff --git a/models/convert-pt-to-ggml.py b/models/convert-pt-to-ggml.py
index 83fcd37f..ef4759f6 100644
--- a/models/convert-pt-to-ggml.py
+++ b/models/convert-pt-to-ggml.py
@@ -40,8 +40,8 @@ import code
 import torch
 import numpy as np
 
-from transformers import GPTJForCausalLM
-from transformers import GPT2TokenizerFast
+#from transformers import GPTJForCausalLM
+#from transformers import GPT2TokenizerFast
 
 # ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
 LANGUAGES = {
@@ -146,25 +146,25 @@ LANGUAGES = {
     "su": "sundanese",
 }
 
-# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
-def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
-    os.environ["TOKENIZERS_PARALLELISM"] = "false"
-    path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
-    tokenizer = GPT2TokenizerFast.from_pretrained(path)
-
-    specials = [
-        "<|startoftranscript|>",
-        *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
-        "<|translate|>",
-        "<|transcribe|>",
-        "<|startoflm|>",
-        "<|startofprev|>",
-        "<|nocaptions|>",
-        "<|notimestamps|>",
-    ]
-
-    tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
-    return tokenizer
+## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
+#def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
+#    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+#    path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
+#    tokenizer = GPT2TokenizerFast.from_pretrained(path)
+#
+#    specials = [
+#        "<|startoftranscript|>",
+#        *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
+#        "<|translate|>",
+#        "<|transcribe|>",
+#        "<|startoflm|>",
+#        "<|startofprev|>",
+#        "<|nocaptions|>",
+#        "<|notimestamps|>",
+#    ]
+#
+#    tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
+#    return tokenizer
 
 # ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
 def bytes_to_unicode():
@@ -224,12 +224,12 @@ with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as
 #code.interact(local=locals())
 
 multilingual = hparams["n_vocab"] == 51865
-tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
+dir_tokenizer = os.path.join(dir_whisper, "whisper/assets", multilingual and "multilingual" or "gpt2")
+#tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2")
 
 #print(tokenizer)
 #print(tokenizer.name_or_path)
 #print(len(tokenizer.additional_special_tokens))
-dir_tokenizer = tokenizer.name_or_path
 
 # output in the same directory as the model
 fname_out = dir_out + "/ggml-model.bin"
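
Note: the GPT2TokenizerFast built above was only used to locate the tokenizer assets and expose the vocabulary, and the whisper/assets/{multilingual,gpt2} directories already ship that data as plain JSON, which is why the dependency can be dropped. Below is a minimal sketch of the idea, assuming (as in the OpenAI whisper repo of this era) that dir_tokenizer contains a vocab.json mapping token strings to ids, plus an optional added_tokens.json holding the <|...|> special tokens; the helper name load_vocab is illustrative and not part of the script.

import json
import os

def load_vocab(dir_tokenizer: str) -> dict:
    # hypothetical helper: recover the id -> token mapping straight from
    # the tokenizer asset files, with no transformers dependency
    with open(os.path.join(dir_tokenizer, "vocab.json"), "r", encoding="utf-8") as f:
        vocab = json.load(f)  # maps token string -> id

    # special tokens (<|startoftranscript|>, <|translate|>, ...) are kept
    # in added_tokens.json when the assets provide one
    path_added = os.path.join(dir_tokenizer, "added_tokens.json")
    if os.path.exists(path_added):
        with open(path_added, "r", encoding="utf-8") as f:
            vocab.update(json.load(f))

    # invert to id -> token, which is the direction a converter iterating
    # over token ids needs
    return {i: t for t, i in vocab.items()}

Under these assumptions, load_vocab(dir_tokenizer) yields the same id-to-token table that the transformers-based build_tokenizer exposed, using only the standard library.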