vocab : JetBrains Mellum pre-tokenizer (#15045)
author    Csaba Kecskemeti <redacted>
Sun, 3 Aug 2025 19:38:18 +0000 (12:38 -0700)
committer GitHub <redacted>
Sun, 3 Aug 2025 19:38:18 +0000 (21:38 +0200)
convert_hf_to_gguf.py
convert_hf_to_gguf_update.py
src/llama-vocab.cpp

diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py
index 5f15c8257cbef5fb0be1ce4b51f0d195ef7ea621..9303a047694f566052f4eae26c7732a33ee3b0d8 100755 (executable)
@@ -852,6 +852,9 @@ class TextModel(ModelBase):
         if chkhsh == "2085e1638f6c377a0aa4ead21b27bb4cb941bf800df86ed391011769c1758dfb":
             # ref: https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B
             res = "exaone4"
+        if chkhsh == "a1e163ecab2e718a4c829d1148b6e86824ec36163bb71941c3dca9cd5ac25756":
+            # ref: https://huggingface.co/JetBrains/Mellum-4b-base
+            res = "mellum"
 
         if res is None:
             logger.warning("\n")
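
Note: the chkhsh value checked above is a fingerprint of how the model's tokenizer encodes a fixed probe string. A minimal sketch of that idea follows, assuming the JetBrains/Mellum-4b-base tokenizer can be fetched with transformers; PROBE_TEXT is a placeholder for the actual multi-line test string defined in convert_hf_to_gguf.py.

    # Sketch only: recompute the tokenizer fingerprint matched by get_vocab_base_pre().
    from hashlib import sha256
    from transformers import AutoTokenizer

    PROBE_TEXT = "..."  # placeholder; the real probe text lives in convert_hf_to_gguf.py

    tokenizer = AutoTokenizer.from_pretrained("JetBrains/Mellum-4b-base")
    token_ids = tokenizer.encode(PROBE_TEXT)

    # The fingerprint is the SHA-256 of the stringified list of token ids.
    chkhsh = sha256(str(token_ids).encode()).hexdigest()
    print(chkhsh)  # with the real probe text this should match the hash registered above
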
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py
index 211b81ff34088c7ba154c10a36111ec9588ff20e..226805f1e1ff82863774929e327f05fc73664836 100755 (executable)
@@ -138,6 +138,7 @@ models = [
     {"name": "midm-2.0",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
     {"name": "lfm2",             "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LiquidAI/LFM2-Tokenizer"},
     {"name": "exaone4",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
+    {"name": "mellum",           "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
 ]
 
 # some models are known to be broken upstream, so we will skip them as exceptions
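
Adding the entry above is what lets convert_hf_to_gguf_update.py download the Mellum tokenizer and regenerate the chkhsh dispatch shown in the first hunk. A hedged sketch of that per-entry step (the probe text and exact output formatting are illustrative, not the script's actual code):

    # Sketch only: turn a `models` entry into the generated dispatch branch.
    from hashlib import sha256
    from transformers import AutoTokenizer

    PROBE_TEXT = "..."  # placeholder for the fixed test string used by the script

    entry = {"name": "mellum", "tokt": "BPE", "repo": "https://huggingface.co/JetBrains/Mellum-4b-base"}

    repo_id = entry["repo"].removeprefix("https://huggingface.co/")
    tok = AutoTokenizer.from_pretrained(repo_id)
    chkhsh = sha256(str(tok.encode(PROBE_TEXT)).encode()).hexdigest()

    # The update script emits source along these lines for convert_hf_to_gguf.py:
    print(f'        if chkhsh == "{chkhsh}":')
    print(f'            # ref: {entry["repo"]}')
    print(f'            res = "{entry["name"]}"')
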
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp
index 7b7a93566027ac3cb1a849f75eab38de78d5f994..959c86a14745fe5c02873a74a9d488379e3053f4 100644 (file)
@@ -1856,7 +1856,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     tokenizer_pre == "gigachat"   ||
                     tokenizer_pre == "jina-v2-es" ||
                     tokenizer_pre == "jina-v2-de" ||
-                    tokenizer_pre == "a.x-4.0") {
+                    tokenizer_pre == "a.x-4.0" ||
+                    tokenizer_pre == "mellum") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_GPT2;
             } else if (
                     tokenizer_pre == "jina-v1-en" ||