From: Georgi Gerganov
Date: Sat, 22 Oct 2022 15:17:08 +0000 (+0300)
Subject: Update README.md and finalize the whisper.wasm example
X-Git-Tag: upstream/1.7.4~1925
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=6b45e37b2b4a536f6e7c7df8f7c15d94623df2e9;p=pkg%2Fggml%2Fsources%2Fwhisper.cpp

Update README.md and finalize the whisper.wasm example
---

diff --git a/CMakeLists.txt b/CMakeLists.txt
index c1e44a48..f485cf52 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -124,6 +124,7 @@ else()
         set(CMAKE_C_FLAGS "${CMAKE_C_FLAGS} /arch:AVX2 /D_CRT_SECURE_NO_WARNINGS=1")
     else()
         if (EMSCRIPTEN)
+            # we require support for WASM SIMD 128-bit
             set(CMAKE_C_FLAGS   "${CMAKE_C_FLAGS}   -pthread -msimd128")
             set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -pthread")
         else()
diff --git a/Makefile b/Makefile
index 931e2d11..8f8cbbed 100644
--- a/Makefile
+++ b/Makefile
@@ -90,7 +90,7 @@ libwhisper.a: ggml.o whisper.o
 	ar rcs libwhisper.a ggml.o whisper.o
 
 clean:
-	rm -f *.o main libwhisper.a
+	rm -f *.o main stream libwhisper.a
 
 #
 # Examples
diff --git a/README.md b/README.md
index 6991278e..10c0e3c3 100644
--- a/README.md
+++ b/README.md
@@ -289,7 +289,7 @@ You can download the converted models using the [download-ggml-model.sh](downloa
 
 https://ggml.ggerganov.com
 
-For more details, see the conversion script [convert-pt-to-ggml.py](convert-pt-to-ggml.py) or the README in [models](models).
+For more details, see the conversion script [models/convert-pt-to-ggml.py](models/convert-pt-to-ggml.py) or the README in [models](models).
 
 ## Bindings
 
diff --git a/convert-pt-to-ggml.py b/convert-pt-to-ggml.py
deleted file mode 100644
index 9e9b2dce..00000000
--- a/convert-pt-to-ggml.py
+++ /dev/null
@@ -1,328 +0,0 @@
-# Convert Whisper transformer model from PyTorch to ggml format
-#
-# Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
-#
-# You need to clone the original repo in ~/path/to/repo/whisper/
-#
-#  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
-#
-# It is used to load various assets needed by the algorithm:
-#
-#  - tokenizer
-#  - mel filters
-#
-# Also, you need to have the original models in ~/.cache/whisper/
-# See the original repo for more details.
-#
-# This script loads the specified model and whisper assets and saves them in ggml format.
-# The output is a single binary file containing the following information:
-#
-#  - hparams
-#  - mel filters
-#  - tokenizer vocab
-#  - model variables
-#
-# For each variable, write the following:
-#
-#  - Number of dimensions (int)
-#  - Name length (int)
-#  - Dimensions (int[n_dims])
-#  - Name (char[name_length])
-#  - Data (float[n_elements])
-#
-
-import io
-import os
-import sys
-import struct
-import json
-import code
-import torch
-import numpy as np
-
-from transformers import GPTJForCausalLM
-from transformers import GPT2TokenizerFast
-
-# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
-LANGUAGES = {
-    "en": "english",
-    "zh": "chinese",
-    "de": "german",
-    "es": "spanish",
-    "ru": "russian",
-    "ko": "korean",
-    "fr": "french",
-    "ja": "japanese",
-    "pt": "portuguese",
-    "tr": "turkish",
-    "pl": "polish",
-    "ca": "catalan",
-    "nl": "dutch",
-    "ar": "arabic",
-    "sv": "swedish",
-    "it": "italian",
-    "id": "indonesian",
-    "hi": "hindi",
-    "fi": "finnish",
-    "vi": "vietnamese",
-    "iw": "hebrew",
-    "uk": "ukrainian",
-    "el": "greek",
-    "ms": "malay",
-    "cs": "czech",
-    "ro": "romanian",
-    "da": "danish",
-    "hu": "hungarian",
-    "ta": "tamil",
-    "no": "norwegian",
-    "th": "thai",
-    "ur": "urdu",
-    "hr": "croatian",
-    "bg": "bulgarian",
-    "lt": "lithuanian",
-    "la": "latin",
-    "mi": "maori",
-    "ml": "malayalam",
-    "cy": "welsh",
-    "sk": "slovak",
-    "te": "telugu",
-    "fa": "persian",
-    "lv": "latvian",
-    "bn": "bengali",
-    "sr": "serbian",
-    "az": "azerbaijani",
-    "sl": "slovenian",
-    "kn": "kannada",
-    "et": "estonian",
-    "mk": "macedonian",
-    "br": "breton",
-    "eu": "basque",
-    "is": "icelandic",
-    "hy": "armenian",
-    "ne": "nepali",
-    "mn": "mongolian",
-    "bs": "bosnian",
-    "kk": "kazakh",
-    "sq": "albanian",
-    "sw": "swahili",
-    "gl": "galician",
-    "mr": "marathi",
-    "pa": "punjabi",
-    "si": "sinhala",
-    "km": "khmer",
-    "sn": "shona",
-    "yo": "yoruba",
-    "so": "somali",
-    "af": "afrikaans",
-    "oc": "occitan",
-    "ka": "georgian",
-    "be": "belarusian",
-    "tg": "tajik",
-    "sd": "sindhi",
-    "gu": "gujarati",
-    "am": "amharic",
-    "yi": "yiddish",
-    "lo": "lao",
-    "uz": "uzbek",
-    "fo": "faroese",
-    "ht": "haitian creole",
-    "ps": "pashto",
-    "tk": "turkmen",
-    "nn": "nynorsk",
-    "mt": "maltese",
-    "sa": "sanskrit",
-    "lb": "luxembourgish",
-    "my": "myanmar",
-    "bo": "tibetan",
-    "tl": "tagalog",
-    "mg": "malagasy",
-    "as": "assamese",
-    "tt": "tatar",
-    "haw": "hawaiian",
-    "ln": "lingala",
-    "ha": "hausa",
-    "ba": "bashkir",
-    "jw": "javanese",
-    "su": "sundanese",
-}
-
-# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
-def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
-    os.environ["TOKENIZERS_PARALLELISM"] = "false"
-    path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
-    tokenizer = GPT2TokenizerFast.from_pretrained(path)
-
-    specials = [
-        "<|startoftranscript|>",
-        *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
-        "<|translate|>",
-        "<|transcribe|>",
-        "<|startoflm|>",
-        "<|startofprev|>",
-        "<|nocaptions|>",
-        "<|notimestamps|>",
-    ]
-
-    tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
-    return tokenizer
-
-# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
-def bytes_to_unicode():
-    """
-    Returns a list of utf-8 bytes and a corresponding list of unicode strings.
-    The reversible bpe codes work on unicode strings.
-    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
-    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
-    This is a significant percentage of your normal, say, 32K bpe vocab.
-    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
-    It also avoids mapping to whitespace/control characters that the bpe code barfs on.
-    """
-    bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
-    cs = bs[:]
-    n = 0
-    for b in range(2**8):
-        if b not in bs:
-            bs.append(b)
-            cs.append(2**8+n)
-            n += 1
-    cs = [chr(n) for n in cs]
-    return dict(zip(bs, cs))
-
-
-if len(sys.argv) < 4:
-    print("Usage: convert-pt-to-ggml.py model.pt path-to-whisper-repo dir-output [use-f32]\n")
-    sys.exit(1)
-
-fname_inp = sys.argv[1]
-dir_whisper = sys.argv[2]
-dir_out = sys.argv[3]
-
-# try to load PyTorch binary data
-try:
-    model_bytes = open(fname_inp, "rb").read()
-    with io.BytesIO(model_bytes) as fp:
-        checkpoint = torch.load(fp, map_location="cpu")
-except Exception:
-    print("Error: failed to load PyTorch model file: %s" % fname_inp)
-    sys.exit(1)
-
-hparams = checkpoint["dims"]
-print("hparams:", hparams)
-
-list_vars = checkpoint["model_state_dict"]
-
-#print(list_vars['encoder.positional_embedding'])
-#print(list_vars['encoder.conv1.weight'])
-#print(list_vars['encoder.conv1.weight'].shape)
-
-# load mel filters
-n_mels = hparams["n_mels"]
-with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as f:
-    filters = torch.from_numpy(f[f"mel_{n_mels}"])
-    #print (filters)
-
-#code.interact(local=locals())
-
-multilingual = hparams["n_vocab"] == 51865
-tokenizer = build_tokenizer(dir_whisper, "multilingual" if multilingual else "gpt2")
-
-#print(tokenizer)
-#print(tokenizer.name_or_path)
-#print(len(tokenizer.additional_special_tokens))
-dir_tokenizer = tokenizer.name_or_path
-
-# output in the same directory as the model
-fname_out = dir_out + "/ggml-model.bin"
-
-with open(dir_tokenizer + "/vocab.json", "r") as f:
-    tokens = json.load(f)
-
-# use 16-bit or 32-bit floats
-use_f16 = True
-if len(sys.argv) > 4:
-    use_f16 = False
-    fname_out = dir_out + "/ggml-model-f32.bin"
-
-fout = open(fname_out, "wb")
-
-fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
-fout.write(struct.pack("i", hparams["n_vocab"]))
-fout.write(struct.pack("i", hparams["n_audio_ctx"]))
-fout.write(struct.pack("i", hparams["n_audio_state"]))
-fout.write(struct.pack("i", hparams["n_audio_head"]))
-fout.write(struct.pack("i", hparams["n_audio_layer"]))
-fout.write(struct.pack("i", hparams["n_text_ctx"]))
-fout.write(struct.pack("i", hparams["n_text_state"]))
-fout.write(struct.pack("i", hparams["n_text_head"]))
-fout.write(struct.pack("i", hparams["n_text_layer"]))
-fout.write(struct.pack("i", hparams["n_mels"]))
-fout.write(struct.pack("i", use_f16))
-
-# write mel filters
-fout.write(struct.pack("i", filters.shape[0]))
-fout.write(struct.pack("i", filters.shape[1]))
-for i in range(filters.shape[0]):
-    for j in range(filters.shape[1]):
-        fout.write(struct.pack("f", filters[i][j]))
-
-byte_encoder = bytes_to_unicode()
-byte_decoder = {v:k for k, v in byte_encoder.items()}
-
-fout.write(struct.pack("i", len(tokens)))
-
-for key in tokens:
-    text = bytearray([byte_decoder[c] for c in key])
-    fout.write(struct.pack("i", len(text)))
-    fout.write(text)
-
-for name in list_vars.keys():
-    data = list_vars[name].squeeze().numpy()
-    print("Processing variable: " + name + " with shape: ", data.shape)
", data.shape) - - # reshape conv bias from [n] to [n, 1] - if name == "encoder.conv1.bias" or \ - name == "encoder.conv2.bias": - data = data.reshape(data.shape[0], 1) - print(" Reshaped variable: " + name + " to shape: ", data.shape) - - n_dims = len(data.shape); - - # looks like the whisper models are in f16 by default - # so we need to convert the small tensors to f32 until we fully support f16 in ggml - # ftype == 0 -> float32, ftype == 1 -> float16 - ftype = 1; - if use_f16: - if n_dims < 2 or \ - name == "encoder.conv1.bias" or \ - name == "encoder.conv2.bias" or \ - name == "encoder.positional_embedding" or \ - name == "decoder.positional_embedding": - ftype = 0 - data = data.astype(np.float32) - print(" Converting to float32") - data = data.astype(np.float32) - ftype = 0 - else: - data = data.astype(np.float32) - ftype = 0 - - #if name.startswith("encoder"): - # if name.endswith("mlp.0.weight") or \ - # name.endswith("mlp.2.weight"): - # print(" Transposing") - # data = data.transpose() - - # header - str = name.encode('utf-8') - fout.write(struct.pack("iii", n_dims, len(str), ftype)) - for i in range(n_dims): - fout.write(struct.pack("i", data.shape[n_dims - 1 - i])) - fout.write(str); - - # data - data.tofile(fout) - -fout.close() - -print("Done. Output file: " + fname_out) -print("") diff --git a/examples/whisper.wasm/README.md b/examples/whisper.wasm/README.md index a1178cd2..645bab8a 100644 --- a/examples/whisper.wasm/README.md +++ b/examples/whisper.wasm/README.md @@ -1,3 +1,27 @@ # whisper.wasm -Live demo: https://whisper.ggerganov.com +Inference of [OpenAI's Whisper ASR model](https://github.com/openai/whisper) inside the browser + +This example uses a WebAssembly (WASM) port of the [whisper.cpp](https://github.com/ggerganov/whisper.cpp) +implementation of the transformer to run the inference inside a web page. The audio data does not leave your computer - +it is processed locally on your machine. The performance is not great but you should be able to achieve x2 or x3 +real-time for the `tiny` and `base` models on a modern CPU and browser (i.e. transcribe a 60 seconds audio in about +~20-30 seconds). + +This WASM port utilizes [WASM SIMD 128-bit intrinsics](https://emcc.zcopy.site/docs/porting/simd/) so you have to make +sure that [your browser supports them](https://webassembly.org/roadmap/). + +The example is capable of running all models up to size `small` inclusive. Beyond that, the memory requirements and +performance are unsatisfactory. The implementation currently support only the `Greedy` sampling strategy. Both +transcription and translation are supported. + +Since the model data is quite big (74MB for the `tiny` model) you need to manually load the model into the web-page. + +The example supports both loading audio from a file and recording audio from the microphone. The maximum length of the +audio is limited to 120 seconds. + +## Live demo + +Link: https://whisper.ggerganov.com + +![image](https://user-images.githubusercontent.com/1991296/197348344-1a7fead8-3dae-4922-8b06-df223a206603.png) diff --git a/examples/whisper.wasm/index-tmpl.html b/examples/whisper.wasm/index-tmpl.html index 1f340e5b..0e0d0ff0 100644 --- a/examples/whisper.wasm/index-tmpl.html +++ b/examples/whisper.wasm/index-tmpl.html @@ -162,7 +162,7 @@ -

+
 
@@ -254,6 +254,10 @@
                 return new type(buffer);
             }
 
+            //
+            // load model
+            //
+
             function loadFile(event, fname) {
                 var file = event.target.files[0] || null;
                 if (file == null) {
@@ -281,6 +285,10 @@
                 reader.readAsArrayBuffer(file);
             }
 
+            //
+            // audio file
+            //
+
             function loadAudio(event) {
                 if (!context) {
                     context = new AudioContext({sampleRate: 16000});
@@ -327,7 +335,7 @@
             }
 
             //
-            // Microphone
+            // microphone
             //
 
             var mediaRecorder = null;
diff --git a/extra/convert-all.sh b/extra/convert-all.sh
index 37a87213..20801af5 100755
--- a/extra/convert-all.sh
+++ b/extra/convert-all.sh
@@ -3,6 +3,6 @@ models=( "tiny.en" "tiny" "base.en" "base" "small.en" "small" "medium.en" "medium" "large" )
 
 for model in "${models[@]}"; do
-    python3 convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/
+    python3 models/convert-pt-to-ggml.py ~/.cache/whisper/$model.pt ../whisper models/
 
     mv -v models/ggml-model.bin models/ggml-$model.bin
 done
diff --git a/models/convert-pt-to-ggml.py b/models/convert-pt-to-ggml.py
new file mode 100644
index 00000000..9e9b2dce
--- /dev/null
+++ b/models/convert-pt-to-ggml.py
@@ -0,0 +1,328 @@
+# Convert Whisper transformer model from PyTorch to ggml format
+#
+# Usage: python convert-pt-to-ggml.py ~/.cache/whisper/medium.pt ~/path/to/repo/whisper/ ./models/whisper-medium
+#
+# You need to clone the original repo in ~/path/to/repo/whisper/
+#
+#  git clone https://github.com/openai/whisper ~/path/to/repo/whisper/
+#
+# It is used to load various assets needed by the algorithm:
+#
+#  - tokenizer
+#  - mel filters
+#
+# Also, you need to have the original models in ~/.cache/whisper/
+# See the original repo for more details.
+#
+# This script loads the specified model and whisper assets and saves them in ggml format.
+# The output is a single binary file containing the following information:
+#
+#  - hparams
+#  - mel filters
+#  - tokenizer vocab
+#  - model variables
+#
+# For each variable, write the following:
+#
+#  - Number of dimensions (int)
+#  - Name length (int)
+#  - Dimensions (int[n_dims])
+#  - Name (char[name_length])
+#  - Data (float[n_elements])
+#
+
+import io
+import os
+import sys
+import struct
+import json
+import code
+import torch
+import numpy as np
+
+from transformers import GPTJForCausalLM
+from transformers import GPT2TokenizerFast
+
+# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
+LANGUAGES = {
+    "en": "english",
+    "zh": "chinese",
+    "de": "german",
+    "es": "spanish",
+    "ru": "russian",
+    "ko": "korean",
+    "fr": "french",
+    "ja": "japanese",
+    "pt": "portuguese",
+    "tr": "turkish",
+    "pl": "polish",
+    "ca": "catalan",
+    "nl": "dutch",
+    "ar": "arabic",
+    "sv": "swedish",
+    "it": "italian",
+    "id": "indonesian",
+    "hi": "hindi",
+    "fi": "finnish",
+    "vi": "vietnamese",
+    "iw": "hebrew",
+    "uk": "ukrainian",
+    "el": "greek",
+    "ms": "malay",
+    "cs": "czech",
+    "ro": "romanian",
+    "da": "danish",
+    "hu": "hungarian",
+    "ta": "tamil",
+    "no": "norwegian",
+    "th": "thai",
+    "ur": "urdu",
+    "hr": "croatian",
+    "bg": "bulgarian",
+    "lt": "lithuanian",
+    "la": "latin",
+    "mi": "maori",
+    "ml": "malayalam",
+    "cy": "welsh",
+    "sk": "slovak",
+    "te": "telugu",
+    "fa": "persian",
+    "lv": "latvian",
+    "bn": "bengali",
+    "sr": "serbian",
+    "az": "azerbaijani",
+    "sl": "slovenian",
+    "kn": "kannada",
+    "et": "estonian",
+    "mk": "macedonian",
+    "br": "breton",
+    "eu": "basque",
+    "is": "icelandic",
+    "hy": "armenian",
+    "ne": "nepali",
+    "mn": "mongolian",
+    "bs": "bosnian",
+    "kk": "kazakh",
+    "sq": "albanian",
+    "sw": "swahili",
+    "gl": "galician",
+    "mr": "marathi",
+    "pa": "punjabi",
+    "si": "sinhala",
+    "km": "khmer",
+    "sn": "shona",
+    "yo": "yoruba",
+    "so": "somali",
+    "af": "afrikaans",
+    "oc": "occitan",
+    "ka": "georgian",
+    "be": "belarusian",
+    "tg": "tajik",
+    "sd": "sindhi",
+    "gu": "gujarati",
+    "am": "amharic",
+    "yi": "yiddish",
+    "lo": "lao",
+    "uz": "uzbek",
+    "fo": "faroese",
+    "ht": "haitian creole",
+    "ps": "pashto",
+    "tk": "turkmen",
+    "nn": "nynorsk",
+    "mt": "maltese",
+    "sa": "sanskrit",
+    "lb": "luxembourgish",
+    "my": "myanmar",
+    "bo": "tibetan",
+    "tl": "tagalog",
+    "mg": "malagasy",
+    "as": "assamese",
+    "tt": "tatar",
+    "haw": "hawaiian",
+    "ln": "lingala",
+    "ha": "hausa",
+    "ba": "bashkir",
+    "jw": "javanese",
+    "su": "sundanese",
+}
+
+# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
+def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):
+    os.environ["TOKENIZERS_PARALLELISM"] = "false"
+    path = os.path.join(path_to_whisper_repo, "whisper/assets", name)
+    tokenizer = GPT2TokenizerFast.from_pretrained(path)
+
+    specials = [
+        "<|startoftranscript|>",
+        *[f"<|{lang}|>" for lang in LANGUAGES.keys()],
+        "<|translate|>",
+        "<|transcribe|>",
+        "<|startoflm|>",
+        "<|startofprev|>",
+        "<|nocaptions|>",
+        "<|notimestamps|>",
+    ]
+
+    tokenizer.add_special_tokens(dict(additional_special_tokens=specials))
+    return tokenizer
+
+# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+def bytes_to_unicode():
+    """
+    Returns a list of utf-8 bytes and a corresponding list of unicode strings.
+    The reversible bpe codes work on unicode strings.
+    This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+    When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+    This is a significant percentage of your normal, say, 32K bpe vocab.
+    To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+    It also avoids mapping to whitespace/control characters that the bpe code barfs on.
+ """ + bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1)) + cs = bs[:] + n = 0 + for b in range(2**8): + if b not in bs: + bs.append(b) + cs.append(2**8+n) + n += 1 + cs = [chr(n) for n in cs] + return dict(zip(bs, cs)) + + +if len(sys.argv) < 4: + print("Usage: convert-pt-to-ggml.py model.pt path-to-whisper-repo dir-output [use-f32]\n") + sys.exit(1) + +fname_inp = sys.argv[1] +dir_whisper = sys.argv[2] +dir_out = sys.argv[3] + +# try to load PyTorch binary data +try: + model_bytes = open(fname_inp, "rb").read() + with io.BytesIO(model_bytes) as fp: + checkpoint = torch.load(fp, map_location="cpu") +except: + print("Error: failed to load PyTorch model file: %s" % fname_inp) + sys.exit(1) + +hparams = checkpoint["dims"] +print("hparams:", hparams) + +list_vars = checkpoint["model_state_dict"] + +#print(list_vars['encoder.positional_embedding']) +#print(list_vars['encoder.conv1.weight']) +#print(list_vars['encoder.conv1.weight'].shape) + +# load mel filters +n_mels = hparams["n_mels"] +with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as f: + filters = torch.from_numpy(f[f"mel_{n_mels}"]) + #print (filters) + +#code.interact(local=locals()) + +multilingual = hparams["n_vocab"] == 51865 +tokenizer = build_tokenizer(dir_whisper, multilingual and "multilingual" or "gpt2") + +#print(tokenizer) +#print(tokenizer.name_or_path) +#print(len(tokenizer.additional_special_tokens)) +dir_tokenizer = tokenizer.name_or_path + +# output in the same directory as the model +fname_out = dir_out + "/ggml-model.bin" + +with open(dir_tokenizer + "/vocab.json", "r") as f: + tokens = json.load(f) + +# use 16-bit or 32-bit floats +use_f16 = True +if len(sys.argv) > 4: + use_f16 = False + fname_out = dir_out + "/ggml-model-f32.bin" + +fout = open(fname_out, "wb") + +fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex +fout.write(struct.pack("i", hparams["n_vocab"])) +fout.write(struct.pack("i", hparams["n_audio_ctx"])) +fout.write(struct.pack("i", hparams["n_audio_state"])) +fout.write(struct.pack("i", hparams["n_audio_head"])) +fout.write(struct.pack("i", hparams["n_audio_layer"])) +fout.write(struct.pack("i", hparams["n_text_ctx"])) +fout.write(struct.pack("i", hparams["n_text_state"])) +fout.write(struct.pack("i", hparams["n_text_head"])) +fout.write(struct.pack("i", hparams["n_text_layer"])) +fout.write(struct.pack("i", hparams["n_mels"])) +fout.write(struct.pack("i", use_f16)) + +# write mel filters +fout.write(struct.pack("i", filters.shape[0])) +fout.write(struct.pack("i", filters.shape[1])) +for i in range(filters.shape[0]): + for j in range(filters.shape[1]): + fout.write(struct.pack("f", filters[i][j])) + +byte_encoder = bytes_to_unicode() +byte_decoder = {v:k for k, v in byte_encoder.items()} + +fout.write(struct.pack("i", len(tokens))) + +for key in tokens: + text = bytearray([byte_decoder[c] for c in key]) + fout.write(struct.pack("i", len(text))) + fout.write(text) + +for name in list_vars.keys(): + data = list_vars[name].squeeze().numpy() + print("Processing variable: " + name + " with shape: ", data.shape) + + # reshape conv bias from [n] to [n, 1] + if name == "encoder.conv1.bias" or \ + name == "encoder.conv2.bias": + data = data.reshape(data.shape[0], 1) + print(" Reshaped variable: " + name + " to shape: ", data.shape) + + n_dims = len(data.shape); + + # looks like the whisper models are in f16 by default + # so we need to convert the small tensors to f32 until we fully support f16 in ggml + # ftype == 0 
+    ftype = 1
+    if use_f16:
+        if n_dims < 2 or \
+           name == "encoder.conv1.bias" or \
+           name == "encoder.conv2.bias" or \
+           name == "encoder.positional_embedding" or \
+           name == "decoder.positional_embedding":
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype = 0
+    else:
+        data = data.astype(np.float32)
+        ftype = 0
+
+    #if name.startswith("encoder"):
+    #    if name.endswith("mlp.0.weight") or \
+    #       name.endswith("mlp.2.weight"):
+    #        print("  Transposing")
+    #        data = data.transpose()
+
+    # header
+    str = name.encode('utf-8')
+    fout.write(struct.pack("iii", n_dims, len(str), ftype))
+    for i in range(n_dims):
+        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
+    fout.write(str)
+
+    # data
+    data.tofile(fout)
+
+fout.close()
+
+print("Done. Output file: " + fname_out)
+print("")
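
For reference, the layout of the ggml-model.bin file written by the converter above can be
sanity-checked with a small reader that mirrors its struct.pack calls. The following is a
minimal sketch only: the helper name read_ggml_header is hypothetical and not part of this
commit, and it parses just the header, mel filters, and vocab, stopping before the
per-variable tensor records.

import struct
import numpy as np

def read_ggml_header(fname):
    # hypothetical helper, not part of whisper.cpp; mirrors convert-pt-to-ggml.py
    with open(fname, "rb") as f:
        magic = struct.unpack("i", f.read(4))[0]
        assert magic == 0x67676d6c  # "ggml" in hex, as written by the converter

        # the 11 int hyper-parameters, in the order the converter writes them
        keys = ["n_vocab", "n_audio_ctx", "n_audio_state", "n_audio_head",
                "n_audio_layer", "n_text_ctx", "n_text_state", "n_text_head",
                "n_text_layer", "n_mels", "use_f16"]
        hparams = dict(zip(keys, struct.unpack("11i", f.read(44))))

        # mel filter bank: two int dimensions, then row-major float32 data
        n_mel, n_fft = struct.unpack("2i", f.read(8))
        filters = np.frombuffer(f.read(4 * n_mel * n_fft), dtype=np.float32)

        # tokenizer vocab: token count, then (length, raw bytes) per token
        n_tokens = struct.unpack("i", f.read(4))[0]
        tokens = []
        for _ in range(n_tokens):
            n = struct.unpack("i", f.read(4))[0]
            tokens.append(f.read(n))

    return hparams, filters.reshape(n_mel, n_fft), tokens

# example usage (assuming a converted model exists at this path):
# hparams, filters, tokens = read_ggml_header("models/ggml-tiny.bin")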