llama : rename jina tokenizers to v2 (#7249)

author Joan Fontanals <redacted>

Mon, 13 May 2024 08:35:14 +0000 (10:35 +0200)

committer GitHub <redacted>

Mon, 13 May 2024 08:35:14 +0000 (11:35 +0300)
author Joan Fontanals <redacted>
Mon, 13 May 2024 08:35:14 +0000 (10:35 +0200)
committer GitHub <redacted>
Mon, 13 May 2024 08:35:14 +0000 (11:35 +0300)
diff --git a/convert-hf-to-gguf-update.py b/convert-hf-to-gguf-update.py

index cd2674a0ea97d5d71be2f7abeea28ba14644eebb..14aa0c45a6a878c7954b1a7f1cfad114fcaf6b30 100755 (executable)
--- a/convert-hf-to-gguf-update.py
+++ b/convert-hf-to-gguf-update.py
@@ -74,9 +74,9 @@ models = [
      {"name": "qwen2",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/Qwen/Qwen1.5-7B", },
      {"name": "olmo",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/allenai/OLMo-1.7-7B-hf", },
      {"name": "dbrx",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/databricks/dbrx-base", },
-    {"name": "jina-en",        "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
-    {"name": "jina-es",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
-    {"name": "jina-de",        "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
+    {"name": "jina-v2-en",     "tokt": TOKENIZER_TYPE.WPM, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-en", }, # WPM!
+    {"name": "jina-v2-es",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-es", },
+    {"name": "jina-v2-de",     "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/jinaai/jina-embeddings-v2-base-de", },
  ]
  
  # make directory "models/tokenizers" if it doesn't exist
diff --git a/convert-hf-to-gguf.py b/convert-hf-to-gguf.py

index ec7f4dd758c72c67fd8b67dce9cdf1e95ccec05d..d6e5dece0a2c31a99bc74953944f5bfa35c1e076 100755 (executable)
--- a/convert-hf-to-gguf.py
+++ b/convert-hf-to-gguf.py
@@ -475,13 +475,13 @@ class Model:
              res = "dbrx"
          if chkhsh == "0876d13b50744004aa9aeae05e7b0647eac9d801b5ba4668afc01e709c15e19f":
              # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-en
-            res = "jina-en"
+            res = "jina-v2-en"
          if chkhsh == "171aeeedd6fb548d418a7461d053f11b6f1f1fc9b387bd66640d28a4b9f5c643":
              # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-es
-            res = "jina-es"
+            res = "jina-v2-es"
          if chkhsh == "27949a2493fc4a9f53f5b9b029c82689cfbe5d3a1929bb25e043089e28466de6":
              # ref: https://huggingface.co/jinaai/jina-embeddings-v2-base-de
-            res = "jina-de"
+            res = "jina-v2-de"
  
          if res is None:
              logger.warning("\n")
diff --git a/llama.cpp b/llama.cpp

index e91ad7285da9945b484661022b6b086bb5162102..adbcc07e20fc5079684fbba0b1c75e55a18490db 100644 (file)
--- a/llama.cpp
+++ b/llama.cpp
@@ -4424,7 +4424,9 @@ static void llm_load_vocab(
              } else if (
                      tokenizer_pre == "gpt-2"   ||
                      tokenizer_pre == "jina-es" ||
-                    tokenizer_pre == "jina-de") {
+                    tokenizer_pre == "jina-de" ||
+                    tokenizer_pre == "jina-v2-es" ||
+                    tokenizer_pre == "jina-v2-de") {
                  vocab.type_pre = LLAMA_VOCAB_PRE_TYPE_GPT2;
              } else if (
                      tokenizer_pre == "refact") {
author	Joan Fontanals <redacted>
	Mon, 13 May 2024 08:35:14 +0000 (10:35 +0200)
committer	GitHub <redacted>
	Mon, 13 May 2024 08:35:14 +0000 (11:35 +0300)
convert-hf-to-gguf-update.py		patch | blob | history
convert-hf-to-gguf.py		patch | blob | history
llama.cpp		patch | blob | history