--- /dev/null
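+# Convert a Hugging Face Whisper model to the ggml format used by whisper.cpp:
+#
+#   python convert-h5-to-ggml.py dir_model path-to-whisper-repo dir-output [use-f32]
+#
+# dir_model is a local Hugging Face checkpoint (vocab.json, added_tokens.json,
+# config.json and the weights); path-to-whisper-repo is a checkout of the
+# OpenAI whisper repository, used only for whisper/assets/mel_filters.npz.
+#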
+import os
+import sys
+import struct
+import json
+import torch
+import numpy as np
+
+from transformers import WhisperForConditionalGeneration
+
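+# map tensor names from the Hugging Face Whisper checkpoint to the OpenAI
+# naming used by whisper.cpp (self_attn -> attn, encoder_attn -> cross_attn)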
+conv_map = {'self_attn_layer_norm': 'attn_ln',
+            'self_attn.k_proj': 'attn.key',
+            'encoder_attn.k_proj': 'cross_attn.key',
+            'self_attn.out_proj': 'attn.out',
+            'encoder_attn.out_proj': 'cross_attn.out',
+            'self_attn.q_proj': 'attn.query',
+            'encoder_attn.q_proj': 'cross_attn.query',
+            'self_attn.v_proj': 'attn.value',
+            'encoder_attn.v_proj': 'cross_attn.value',
+            'encoder_attn_layer_norm': 'cross_attn_ln',
+            'fc1': 'mlp.0',
+            'fc2': 'mlp.2',
+            'final_layer_norm': 'mlp_ln',
+            'encoder.layer_norm.bias': 'encoder.ln_post.bias',
+            'encoder.layer_norm.weight': 'encoder.ln_post.weight',
+            'encoder.embed_positions.weight': 'encoder.positional_embedding',
+            'decoder.layer_norm.bias': 'decoder.ln.bias',
+            'decoder.layer_norm.weight': 'decoder.ln.weight',
+            'decoder.embed_positions.weight': 'decoder.positional_embedding',
+            'decoder.embed_tokens.weight': 'decoder.token_embedding.weight',
+}
+
+# ref: https://github.com/openai/gpt-2/blob/master/src/encoder.py
+def bytes_to_unicode():
+ """
+ Returns list of utf-8 byte and a corresponding list of unicode strings.
+ The reversible bpe codes work on unicode strings.
+ This means you need a large # of unicode characters in your vocab if you want to avoid UNKs.
+ When you're at something like a 10B token dataset you end up needing around 5K for decent coverage.
+ This is a signficant percentage of your normal, say, 32K bpe vocab.
+ To avoid that, we want lookup tables between utf-8 bytes and unicode strings.
+ And avoids mapping to whitespace/control characters the bpe code barfs on.
+ """
+ bs = list(range(ord("!"), ord("~")+1))+list(range(ord("¡"), ord("¬")+1))+list(range(ord("®"), ord("ÿ")+1))
+ cs = bs[:]
+ n = 0
+ for b in range(2**8):
+ if b not in bs:
+ bs.append(b)
+ cs.append(2**8+n)
+ n += 1
+ cs = [chr(n) for n in cs]
+ return dict(zip(bs, cs))
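+
+# Example: the space byte 0x20 is not in the printable ranges above, so
+# bytes_to_unicode() maps it to chr(256 + 32) = 'Ġ', the familiar prefix on
+# GPT-2-style BPE tokens.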
+
+if len(sys.argv) < 4:
+    print("Usage: convert-h5-to-ggml.py dir_model path-to-whisper-repo dir-output [use-f32]\n")
+    sys.exit(1)
+
+dir_model = sys.argv[1]
+dir_whisper = sys.argv[2]
+dir_out = sys.argv[3]
+
+with open(dir_model + "/vocab.json", "r") as f:
+ encoder = json.load(f)
+with open(dir_model + "/added_tokens.json", "r") as f:
+ encoder_added = json.load(f)
+with open(dir_model + "/config.json", "r") as f:
+ hparams = json.load(f)
+
+model = WhisperForConditionalGeneration.from_pretrained(dir_model)
+
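+# load the mel filterbank matching the model's n_mels from the assets of a
+# local checkout of the OpenAI whisper repository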
+n_mels = hparams["num_mel_bins"]
+with np.load(os.path.join(dir_whisper, "whisper/assets", "mel_filters.npz")) as f:
+    filters = torch.from_numpy(f[f"mel_{n_mels}"])
+
+fname_out = dir_out + "/ggml-model.bin"
+
+# vocab.json was already loaded above - reuse it as the tokenizer table
+tokens = encoder
+
+# use 16-bit or 32-bit floats (any 4th argument forces f32 output)
+use_f16 = True
+if len(sys.argv) > 4:
+    use_f16 = False
+    fname_out = dir_out + "/ggml-model-f32.bin"
+
+fout = open(fname_out, "wb")
+
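+# model hyperparameters, in the order whisper.cpp reads them back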
+fout.write(struct.pack("i", 0x67676d6c)) # magic: ggml in hex
+fout.write(struct.pack("i", hparams["vocab_size"]))
+fout.write(struct.pack("i", hparams["max_source_positions"]))
+fout.write(struct.pack("i", hparams["d_model"]))
+fout.write(struct.pack("i", hparams["decoder_attention_heads"]))
+fout.write(struct.pack("i", hparams["decoder_layers"]))
+fout.write(struct.pack("i", hparams["max_length"]))
+fout.write(struct.pack("i", hparams["d_model"]))
+fout.write(struct.pack("i", hparams["encoder_attention_heads"]))
+fout.write(struct.pack("i", hparams["encoder_layers"]))
+fout.write(struct.pack("i", hparams["num_mel_bins"]))
+fout.write(struct.pack("i", use_f16))
+
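+# mel filterbank: the two dimensions as int32, then the coefficients as f32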
+fout.write(struct.pack("i", filters.shape[0]))
+fout.write(struct.pack("i", filters.shape[1]))
+for i in range(filters.shape[0]):
+ for j in range(filters.shape[1]):
+ fout.write(struct.pack("f", filters[i][j]))
+
+byte_encoder = bytes_to_unicode()
+byte_decoder = {v:k for k, v in byte_encoder.items()}
+
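+# vocab: total token count, then each token in id order as length-prefixed
+# raw bytes (the byte-level BPE strings are decoded back to bytes first)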
+fout.write(struct.pack("i", len(tokens)))
+
+for token, _ in sorted(tokens.items(), key=lambda x: x[1]):
+    text = bytearray([byte_decoder[c] for c in token])
+    fout.write(struct.pack("i", len(text)))
+    fout.write(text)
+
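+# convert the tensors: rename to whisper.cpp conventions, cast to f16/f32,
+# and write each one with its own header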
+list_vars = model.state_dict()
+for name in list_vars.keys():
+    # proj_out.weight is tied to the token embedding, so it is not stored
+    if name == "proj_out.weight":
+        print('Skipping', name)
+        continue
+
+    src = name
+
+    # strip the leading "model." prefix and translate the remaining path
+    nn = name.split(".")[1:]
+    if nn[1] == "layers":
+        nn[1] = "blocks"
+        mapped = conv_map[".".join(nn[3:-1])]
+        name = ".".join(nn[:3] + [mapped] + nn[-1:])
+    else:
+        name = ".".join(nn)
+        name = conv_map[name] if name in conv_map else name
+
+    print(src, ' -> ', name)
+    data = list_vars[src].squeeze().numpy()
+
+    # reshape conv bias from [n] to [n, 1]
+    if name in ("encoder.conv1.bias", "encoder.conv2.bias"):
+        data = data.reshape(data.shape[0], 1)
+        print("  Reshaped variable: " + name + " to shape: ", data.shape)
+
+    n_dims = len(data.shape)
+    print(name, n_dims, data.shape)
+
+    # 1D tensors and the positional embeddings are kept in f32 for accuracy;
+    # everything else is stored in f16 unless f32 output was requested
+    # ftype == 0 -> float32, ftype == 1 -> float16
+    ftype = 1
+    if use_f16:
+        if n_dims < 2 or \
+           name == "encoder.conv1.bias" or \
+           name == "encoder.conv2.bias" or \
+           name == "encoder.positional_embedding" or \
+           name == "decoder.positional_embedding":
+            print("  Converting to float32")
+            data = data.astype(np.float32)
+            ftype = 0
+        else:
+            data = data.astype(np.float16)
+    else:
+        data = data.astype(np.float32)
+        ftype = 0
+
+    # tensor header: n_dims, name length and ftype, then the dims
+    # (innermost first) and the name itself
+    name_bytes = name.encode('utf-8')
+    fout.write(struct.pack("iii", n_dims, len(name_bytes), ftype))
+    for i in range(n_dims):
+        fout.write(struct.pack("i", data.shape[n_dims - 1 - i]))
+    fout.write(name_bytes)
+
+    # tensor data
+    data.tofile(fout)
+
+fout.close()
+
+print("Done. Output file: " + fname_out)
+print("")