+# Convert Hugging Face fine-tuned models to ggml format
+#
+# Usage:
+#
+# git clone https://github.com/openai/whisper
+# git clone https://github.com/ggerganov/whisper.cpp
+# git clone https://huggingface.co/openai/whisper-medium
+#
+# python3 ./whisper.cpp/models/convert-h5-to-ggml.py ./whisper-medium/ ./whisper .
+#
+# This script is similar to "convert-pt-to-ggml.py", but takes a Hugging Face model directory (e.g. ./whisper-medium/) as input
+#
+# For more info:
+#
+# https://github.com/ggerganov/whisper.cpp/issues/157
+#
+
import io
import os
import sys
#from transformers import GPT2TokenizerFast
# ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L10-L110
-LANGUAGES = {
- "en": "english",
- "zh": "chinese",
- "de": "german",
- "es": "spanish",
- "ru": "russian",
- "ko": "korean",
- "fr": "french",
- "ja": "japanese",
- "pt": "portuguese",
- "tr": "turkish",
- "pl": "polish",
- "ca": "catalan",
- "nl": "dutch",
- "ar": "arabic",
- "sv": "swedish",
- "it": "italian",
- "id": "indonesian",
- "hi": "hindi",
- "fi": "finnish",
- "vi": "vietnamese",
- "iw": "hebrew",
- "uk": "ukrainian",
- "el": "greek",
- "ms": "malay",
- "cs": "czech",
- "ro": "romanian",
- "da": "danish",
- "hu": "hungarian",
- "ta": "tamil",
- "no": "norwegian",
- "th": "thai",
- "ur": "urdu",
- "hr": "croatian",
- "bg": "bulgarian",
- "lt": "lithuanian",
- "la": "latin",
- "mi": "maori",
- "ml": "malayalam",
- "cy": "welsh",
- "sk": "slovak",
- "te": "telugu",
- "fa": "persian",
- "lv": "latvian",
- "bn": "bengali",
- "sr": "serbian",
- "az": "azerbaijani",
- "sl": "slovenian",
- "kn": "kannada",
- "et": "estonian",
- "mk": "macedonian",
- "br": "breton",
- "eu": "basque",
- "is": "icelandic",
- "hy": "armenian",
- "ne": "nepali",
- "mn": "mongolian",
- "bs": "bosnian",
- "kk": "kazakh",
- "sq": "albanian",
- "sw": "swahili",
- "gl": "galician",
- "mr": "marathi",
- "pa": "punjabi",
- "si": "sinhala",
- "km": "khmer",
- "sn": "shona",
- "yo": "yoruba",
- "so": "somali",
- "af": "afrikaans",
- "oc": "occitan",
- "ka": "georgian",
- "be": "belarusian",
- "tg": "tajik",
- "sd": "sindhi",
- "gu": "gujarati",
- "am": "amharic",
- "yi": "yiddish",
- "lo": "lao",
- "uz": "uzbek",
- "fo": "faroese",
- "ht": "haitian creole",
- "ps": "pashto",
- "tk": "turkmen",
- "nn": "nynorsk",
- "mt": "maltese",
- "sa": "sanskrit",
- "lb": "luxembourgish",
- "my": "myanmar",
- "bo": "tibetan",
- "tl": "tagalog",
- "mg": "malagasy",
- "as": "assamese",
- "tt": "tatar",
- "haw": "hawaiian",
- "ln": "lingala",
- "ha": "hausa",
- "ba": "bashkir",
- "jw": "javanese",
- "su": "sundanese",
-}
+#LANGUAGES = {
+# "en": "english",
+# "zh": "chinese",
+# "de": "german",
+# "es": "spanish",
+# "ru": "russian",
+# "ko": "korean",
+# "fr": "french",
+# "ja": "japanese",
+# "pt": "portuguese",
+# "tr": "turkish",
+# "pl": "polish",
+# "ca": "catalan",
+# "nl": "dutch",
+# "ar": "arabic",
+# "sv": "swedish",
+# "it": "italian",
+# "id": "indonesian",
+# "hi": "hindi",
+# "fi": "finnish",
+# "vi": "vietnamese",
+# "iw": "hebrew",
+# "uk": "ukrainian",
+# "el": "greek",
+# "ms": "malay",
+# "cs": "czech",
+# "ro": "romanian",
+# "da": "danish",
+# "hu": "hungarian",
+# "ta": "tamil",
+# "no": "norwegian",
+# "th": "thai",
+# "ur": "urdu",
+# "hr": "croatian",
+# "bg": "bulgarian",
+# "lt": "lithuanian",
+# "la": "latin",
+# "mi": "maori",
+# "ml": "malayalam",
+# "cy": "welsh",
+# "sk": "slovak",
+# "te": "telugu",
+# "fa": "persian",
+# "lv": "latvian",
+# "bn": "bengali",
+# "sr": "serbian",
+# "az": "azerbaijani",
+# "sl": "slovenian",
+# "kn": "kannada",
+# "et": "estonian",
+# "mk": "macedonian",
+# "br": "breton",
+# "eu": "basque",
+# "is": "icelandic",
+# "hy": "armenian",
+# "ne": "nepali",
+# "mn": "mongolian",
+# "bs": "bosnian",
+# "kk": "kazakh",
+# "sq": "albanian",
+# "sw": "swahili",
+# "gl": "galician",
+# "mr": "marathi",
+# "pa": "punjabi",
+# "si": "sinhala",
+# "km": "khmer",
+# "sn": "shona",
+# "yo": "yoruba",
+# "so": "somali",
+# "af": "afrikaans",
+# "oc": "occitan",
+# "ka": "georgian",
+# "be": "belarusian",
+# "tg": "tajik",
+# "sd": "sindhi",
+# "gu": "gujarati",
+# "am": "amharic",
+# "yi": "yiddish",
+# "lo": "lao",
+# "uz": "uzbek",
+# "fo": "faroese",
+# "ht": "haitian creole",
+# "ps": "pashto",
+# "tk": "turkmen",
+# "nn": "nynorsk",
+# "mt": "maltese",
+# "sa": "sanskrit",
+# "lb": "luxembourgish",
+# "my": "myanmar",
+# "bo": "tibetan",
+# "tl": "tagalog",
+# "mg": "malagasy",
+# "as": "assamese",
+# "tt": "tatar",
+# "haw": "hawaiian",
+# "ln": "lingala",
+# "ha": "hausa",
+# "ba": "bashkir",
+# "jw": "javanese",
+# "su": "sundanese",
+#}
## ref: https://github.com/openai/whisper/blob/8cf36f3508c9acd341a45eb2364239a3d81458b9/whisper/tokenizer.py#L273-L292
#def build_tokenizer(path_to_whisper_repo: str, name: str = "gpt2"):