git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
convert : use utf8 encoding (#7000)
authorGeorgi Gerganov <redacted>
Tue, 30 Apr 2024 08:05:25 +0000 (11:05 +0300)
committerGitHub <redacted>
Tue, 30 Apr 2024 08:05:25 +0000 (11:05 +0300)
* convert : use utf8 encoding

* convert : update instructions and warning message

convert-hf-to-gguf-update.py
convert-hf-to-gguf.py

index 1c559c3f693be0be44d299d4163caa6880e0280e..b019c1e3dc59fd8f045b224f5028703069cb43dd 100644 (file)
@@ -128,7 +128,7 @@ for model in models:
     print(f"chkhsh: {chkhsh}")
 
     # print the "pre_tokenizer" content from the tokenizer.json
-    with open(f"models/tokenizers/{name}/tokenizer.json", "r") as f:
+    with open(f"models/tokenizers/{name}/tokenizer.json", "r", encoding="utf-8") as f:
         cfg = json.load(f)
         pre_tokenizer = cfg["pre_tokenizer"]
         print("pre_tokenizer: " + json.dumps(pre_tokenizer, indent=4))
@@ -156,15 +156,19 @@ src_func +=  "        print(f\"chkhsh: {chkhsh}\")\n"
 src_func +=  "\n"
 src_func +=  "        res = None\n"
 src_func +=  "\n"
-src_func +=  "        # NOTE: if you get an error here, you need to add the model to the if-elif chain below\n"
-src_func +=  "        #       don't do this manually - use the convert-hf-to-gguf-update.py script!\n"
+src_func +=  "        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script\n"
+src_func +=  "        #       or pull the latest version of the model from Huggingface\n"
+src_func +=  "        #       don't edit the hashes manually!\n"
 src_func += f"{src_ifs}\n"
 src_func +=  "        if res is None:\n"
 src_func +=  "            print(\"\\n\")\n"
 src_func +=  "            print(\"**************************************************************************************\")\n"
 src_func +=  "            print(\"** WARNING: The BPE pre-tokenizer was not recognized!\")\n"
-src_func +=  "            print(\"**          This means that it was not added yet or you are using an older version.\")\n"
-src_func +=  "            print(\"**          Check convert-hf-to-gguf-update.py and update it accordingly.\")\n"
+src_func +=  "            print(\"**          There are 2 possible reasons for this:\")\n"
+src_func +=  "            print(\"**          - the model has not been added to convert-hf-to-gguf-update.py yet\")\n"
+src_func +=  "            print(\"**          - the pre-tokenization config has changed upstream\")\n"
+src_func +=  "            print(\"**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.\")\n"
+src_func +=  "            print(\"** ref:     https://github.com/ggerganov/llama.cpp/pull/6920\")\n"
 src_func +=  "            print(\"**\")\n"
 src_func +=  "            print(f\"** chkhsh:  {chkhsh}\")\n"
 src_func +=  "            print(\"**************************************************************************************\")\n"
@@ -249,7 +253,7 @@ for model in models:
     from transformers import AutoTokenizer
     tokenizer = AutoTokenizer.from_pretrained(f"models/tokenizers/{name}")
 
-    with open(f"models/ggml-vocab-{name}.gguf.inp", "w") as f:
+    with open(f"models/ggml-vocab-{name}.gguf.inp", "w", encoding="utf-8") as f:
         for text in tests:
             f.write(f"{text}")
             f.write("\n__ggml_vocab_test__\n")
index d1b8cef11277df401fa2b176803c140a5e9c1c33..2f146d7302a7811ff76e8e79cb03dcff0e1b01c5 100755 (executable)
@@ -279,8 +279,9 @@ class Model(ABC):
 
         res = None
 
-        # NOTE: if you get an error here, you need to add the model to the if-elif chain below
-        #       don't do this manually - use the convert-hf-to-gguf-update.py script!
+        # NOTE: if you get an error here, you need to update the convert-hf-to-gguf-update.py script
+        #       or pull the latest version of the model from Huggingface
+        #       don't edit the hashes manually!
         if chkhsh == "0ef9807a4087ebef797fc749390439009c3b9eda9ad1a097abbe738f486c01e5":
             # ref: https://huggingface.co/meta-llama/Meta-Llama-3-8B
             res = "llama-bpe"
@@ -310,8 +311,11 @@ class Model(ABC):
             print("\n")
             print("**************************************************************************************")
             print("** WARNING: The BPE pre-tokenizer was not recognized!")
-            print("**          This means that it was not added yet or you are using an older version.")
-            print("**          Check convert-hf-to-gguf-update.py and update it accordingly.")
+            print("**          There are 2 possible reasons for this:")
+            print("**          - the model has not been added to convert-hf-to-gguf-update.py yet")
+            print("**          - the pre-tokenization config has changed upstream")
+            print("**          Check your model files and convert-hf-to-gguf-update.py and update them accordingly.")
+            print("** ref:     https://github.com/ggerganov/llama.cpp/pull/6920")
             print("**")
             print(f"** chkhsh:  {chkhsh}")
             print("**************************************************************************************")