tests : test-tokenizer-0.sh print more info (#7402)

author Georgi Gerganov <redacted>

Tue, 21 May 2024 16:53:48 +0000 (19:53 +0300)

committer Georgi Gerganov <redacted>

Tue, 21 May 2024 16:53:48 +0000 (19:53 +0300)
author Georgi Gerganov <redacted>
Tue, 21 May 2024 16:53:48 +0000 (19:53 +0300)
committer Georgi Gerganov <redacted>
Tue, 21 May 2024 16:53:48 +0000 (19:53 +0300)
diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py

index 45404b32b75ae6843ddf05779cbfe389509924b0..1923b88ba2a802da2710ca82ebb3a56c7d658486 100755 (executable)
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -72,7 +72,7 @@ models = [
      {"name": "mpt",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mosaicml/mpt-7b", },
      {"name": "starcoder",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/bigcode/starcoder2-3b", },
      {"name": "gpt-2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/openai-community/gpt2", },
-    {"name": "stablelm",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
+    {"name": "stablelm2",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b", },
      {"name": "refact",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/smallcloudai/Refact-1_6-base", },
      {"name": "command-r",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/CohereForAI/c4ai-command-r-v01", },
      {"name": "qwen2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py

index 1acf45bf2f48e427afdb789cac5a5e5d1e75ac10..6357d40348b34f989d2fe17c7b032478da49450b 100755 (executable)
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -447,7 +447,7 @@ class Model:
              # ref: https://huggingface.co/openai-community/gpt2
              res = "gpt-2"
          if chkhsh == "32d85c31273f8019248f2559fed492d929ea28b17e51d81d3bb36fff23ca72b3":
-            # ref: https://huggingface.co/stabilityai/stablelm-2-1_6b
+            # ref: https://huggingface.co/stabilityai/stablelm-2-zephyr-1_6b
              res = "stablelm2"
          if chkhsh == "6221ad2852e85ce96f791f476e0b390cf9b474c9e3d1362f53a24a06dc8220ff":
              # ref: https://huggingface.co/smallcloudai/Refact-1_6-base
diff --git a/tests/test-tokenizer-0.sh b/tests/test-tokenizer-0.sh

index 2fb8632d810c47307889ea74df7bbf6cb7697be0..1fec8bbf130dbddee2dab33479bde2468deeefc3 100755 (executable)
--- a/tests/test-tokenizer-0.sh
+++ b/tests/test-tokenizer-0.sh
@@ -17,10 +17,15 @@ make -j tests/test-tokenizer-0
  
  printf "Testing %s on %s ...\n" $name $input
  
+set -e
+
+printf "Tokenizing using (py)  Python AutoTokenizer ...\n"
  python3 ./tests/test-tokenizer-0.py ./models/tokenizers/$name --fname-tok $input > /tmp/test-tokenizer-0-$name-py.log 2>&1
-cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"
  
+printf "Tokenizing using (cpp) llama.cpp ...\n"
  ./tests/test-tokenizer-0 ./models/ggml-vocab-$name.gguf $input > /tmp/test-tokenizer-0-$name-cpp.log 2>&1
+
+cat /tmp/test-tokenizer-0-$name-py.log | grep "tokenized in"
  cat /tmp/test-tokenizer-0-$name-cpp.log | grep "tokenized in"
  
  diff $input.tok $input.tokcpp > /dev/null 2>&1
author	Georgi Gerganov <redacted>
	Tue, 21 May 2024 16:53:48 +0000 (19:53 +0300)
committer	Georgi Gerganov <redacted>
	Tue, 21 May 2024 16:53:48 +0000 (19:53 +0300)
convert-hf-to-gguf-update.py		patch | blob | history
convert-hf-to-gguf.py		patch | blob | history
tests/test-tokenizer-0.sh		patch | blob | history