git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
vocab : add midm-2.0 model pre-tokenizer (#14626)
authorDowon <redacted>
Fri, 11 Jul 2025 07:36:04 +0000 (16:36 +0900)
committerGitHub <redacted>
Fri, 11 Jul 2025 07:36:04 +0000 (09:36 +0200)
convert_hf_to_gguf.py
convert_hf_to_gguf_update.py
src/llama-vocab.cpp

index 52aa87d6a9952f5316b60b022435d846a2fea881..3d5e7e5a456004938c56882bf520589b72d3f6ba 100755 (executable)
@@ -833,6 +833,9 @@ class TextModel(ModelBase):
         if chkhsh == "48f8e02c0359c0bbdd82f26909171fac1c18a457bb47573ed1fe3bbb2c1cfd4b":
             # ref: https://huggingface.co/tiiuae/Falcon-H1-34B-Base
             res = "falcon-h1"
+        if chkhsh == "f6791d196f87ce6b56a7d234be618e0d58f8cda3549416635b2bebcd22cd95c4":
+            # ref: https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct
+            res = "midm-2.0"
 
         if res is None:
             logger.warning("\n")
index b8cb6027d6de55f41c9b2ac369483b3c0b2f3c93..9f9b88da8785aa00b4f9d1c8edaa0f434ea83d1a 100755 (executable)
@@ -129,6 +129,7 @@ models = [
     {"name": "pixtral",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/mistral-community/pixtral-12b", },
     {"name": "seed-coder",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ByteDance-Seed/Seed-Coder-8B-Base", },
     {"name": "a.x-4.0",          "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/skt/A.X-4.0", },
+    {"name": "midm-2.0",         "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/K-intelligence/Midm-2.0-Base-Instruct", },
 ]
 
 # some models are known to be broken upstream, so we will skip them as exceptions
index 10823b183da885cae1ddc561103a23d0fce0490e..02cdc244adbef533b42a76de39beb352e6ebd09a 100644 (file)
@@ -1524,7 +1524,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                     tokenizer_pre == "llama-bpe"||
                     tokenizer_pre == "falcon3"  ||
                     tokenizer_pre == "falcon-h1" ||
-                    tokenizer_pre == "pixtral") {
+                    tokenizer_pre == "pixtral"  ||
+                    tokenizer_pre == "midm-2.0") {
                 pre_type = LLAMA_VOCAB_PRE_TYPE_LLAMA3;
                 ignore_merges = true;
                 add_bos = true;