From: LostRuins Date: Thu, 13 Apr 2023 12:27:56 +0000 (+0800) Subject: gpt : fix pytorch converter text encodings (#78) X-Git-Tag: upstream/0.0.1642~1553 X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=945685b2facccc144fe01d18a9fe64d3ddca23f7;p=pkg%2Fggml%2Fsources%2Fggml gpt : fix pytorch converter text encodings (#78) * Fixed quantization for f16 models not working - this is because the f16 tables were not initialized thus f16 to f32 conversion was failing. * In some situations, the script fails with the error : UnicodeDecodeError: 'charmap' codec can't decode byte (byte) in position (number) : character maps to <undefined> This is probably because the encodings are incorrect. Explicitly specifying them as UTF-8 seems to resolve the issue and allow for correct conversion. --------- Co-authored-by: Georgi Gerganov --- diff --git a/examples/gpt-2/convert-cerebras-to-ggml.py b/examples/gpt-2/convert-cerebras-to-ggml.py index 6f20a542..7fba7cde 100644 --- a/examples/gpt-2/convert-cerebras-to-ggml.py +++ b/examples/gpt-2/convert-cerebras-to-ggml.py @@ -42,10 +42,10 @@ if len(sys.argv) < 2: dir_model = sys.argv[1] fname_out = sys.argv[1] + "/ggml-model-f16.bin" -with open(dir_model + "/vocab.json", "r") as f: +with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: encoder = json.load(f) -with open(dir_model + "/config.json", "r") as f: +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: hparams = json.load(f) # use 16-bit or 32-bit floats diff --git a/examples/gpt-2/convert-ckpt-to-ggml.py b/examples/gpt-2/convert-ckpt-to-ggml.py index 60cd963d..9113141f 100644 --- a/examples/gpt-2/convert-ckpt-to-ggml.py +++ b/examples/gpt-2/convert-ckpt-to-ggml.py @@ -63,10 +63,10 @@ if len(sys.argv) < 3: dir_model = sys.argv[1] fname_out = sys.argv[1] + "/ggml-model.bin" -with open(dir_model + "/encoder.json", "r") as f: +with open(dir_model + "/encoder.json", "r", encoding="utf-8") as f: encoder = json.load(f) -with open(dir_model + "/hparams.json", "r") as f: 
+with open(dir_model + "/hparams.json", "r", encoding="utf-8") as f: hparams = json.load(f) # possible data types diff --git a/examples/gpt-2/convert-h5-to-ggml.py b/examples/gpt-2/convert-h5-to-ggml.py index 4e86ce27..6a2b8654 100644 --- a/examples/gpt-2/convert-h5-to-ggml.py +++ b/examples/gpt-2/convert-h5-to-ggml.py @@ -55,13 +55,13 @@ if len(sys.argv) < 2: dir_model = sys.argv[1] fname_out = sys.argv[1] + "/ggml-model.bin" -with open(dir_model + "/vocab.json", "r") as f: +with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: encoder = json.load(f) -with open(dir_model + "/added_tokens.json", "r") as f: +with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f: encoder_added = json.load(f) -with open(dir_model + "/config.json", "r") as f: +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: hparams = json.load(f) # use 16-bit or 32-bit floats diff --git a/examples/gpt-j/convert-h5-to-ggml.py b/examples/gpt-j/convert-h5-to-ggml.py index e254f2cc..cb773172 100644 --- a/examples/gpt-j/convert-h5-to-ggml.py +++ b/examples/gpt-j/convert-h5-to-ggml.py @@ -57,13 +57,13 @@ if len(sys.argv) < 3: dir_model = sys.argv[1] fname_out = sys.argv[1] + "/ggml-model.bin" -with open(dir_model + "/vocab.json", "r") as f: +with open(dir_model + "/vocab.json", "r", encoding="utf-8") as f: encoder = json.load(f) -with open(dir_model + "/added_tokens.json", "r") as f: +with open(dir_model + "/added_tokens.json", "r", encoding="utf-8") as f: encoder_added = json.load(f) -with open(dir_model + "/config.json", "r") as f: +with open(dir_model + "/config.json", "r", encoding="utf-8") as f: hparams = json.load(f) # possible data types