model : add KORMo model (#18032)

author HelloKS <redacted>

Mon, 15 Dec 2025 17:51:43 +0000 (02:51 +0900)

committer GitHub <redacted>

Mon, 15 Dec 2025 17:51:43 +0000 (18:51 +0100)
author HelloKS <redacted>
Mon, 15 Dec 2025 17:51:43 +0000 (02:51 +0900)
committer GitHub <redacted>
Mon, 15 Dec 2025 17:51:43 +0000 (18:51 +0100)
diff --git a/convert_hf_to_gguf.py b/convert_hf_to_gguf.py

index ee02cdd91c6afee17681558672f0b16942241c63..ad20cb8b88c5425ae1c3f1ad57fed2eedc0c0d5c 100755 (executable)
--- a/convert_hf_to_gguf.py
+++ b/convert_hf_to_gguf.py
@@ -1203,6 +1203,9 @@ class TextModel(ModelBase):
          if chkhsh == "f4f37b6c8eb9ea29b3eac6bb8c8487c5ab7885f8d8022e67edc1c68ce8403e95":
              # ref: https://huggingface.co/MiniMaxAI/MiniMax-M2
              res = "minimax-m2"
+        if chkhsh == "4a2e2abae11ca2b86d570fc5b44be4d5eb5e72cc8f22dd136a94b37da83ab665":
+            # ref: https://huggingface.co/KORMo-Team/KORMo-tokenizer
+            res = "kormo"
  
          if res is None:
              logger.warning("\n")
@@ -3398,7 +3401,7 @@ class QwenModel(TextModel):
          self._set_vocab_qwen()
  
  
-@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration")
+@ModelBase.register("Qwen2Model", "Qwen2ForCausalLM", "Qwen2AudioForConditionalGeneration", "KORMoForCausalLM")
  class Qwen2Model(TextModel):
      model_arch = gguf.MODEL_ARCH.QWEN2
  
diff --git a/convert_hf_to_gguf_update.py b/convert_hf_to_gguf_update.py

index b8f694e86c06221c390dc9cebe32fad68a8b3a5f..5e8456a7ea51bba5dae43e010b6125bbc76769e1 100755 (executable)
--- a/convert_hf_to_gguf_update.py
+++ b/convert_hf_to_gguf_update.py
@@ -143,6 +143,7 @@ models = [
      {"name": "bailingmoe2",      "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/Ling-mini-base-2.0", },
      {"name": "granite-docling",  "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
      {"name": "minimax-m2",       "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/MiniMaxAI/MiniMax-M2", },
+    {"name": "kormo",            "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/KORMo-Team/KORMo-tokenizer", },
  ]
  
  # some models are known to be broken upstream, so we will skip them as exceptions
diff --git a/gguf-py/gguf/tensor_mapping.py b/gguf-py/gguf/tensor_mapping.py

index d9c87da19469ca06cde6069940d28f6611ca43ec..b320e2b4b2cc1fad2d6279b3e6e18d87044bcdac 100644 (file)
--- a/gguf-py/gguf/tensor_mapping.py
+++ b/gguf-py/gguf/tensor_mapping.py
@@ -154,7 +154,8 @@ class TensorNameMap:
              "model.layers.{bid}.operator_norm",                     # lfm2
              "model.transformer.blocks.{bid}.attn_norm",             # llada
              "layers.{bid}.input_layernorm",                         # qwen3-embedding
-            "model.layers.{bid}.attention_layernorm"                # apertus
+            "model.layers.{bid}.attention_layernorm",               # apertus
+            "model.layers.{bid}.pre_attention_layernorm",           # kormo
          ),
  
          # Attention norm 2
@@ -342,6 +343,7 @@ class TensorNameMap:
              "model.transformer.blocks.{bid}.ff_norm",                        # llada
              "layers.{bid}.post_attention_layernorm",                         # qwen3-embedding
              "model.layers.{bid}.feedforward_layernorm",                      # apertus
+            "model.layers.{bid}.pre_mlp_layernorm",                          # kormo
          ),
  
          # Pre feed-forward norm
diff --git a/src/llama-model.cpp b/src/llama-model.cpp

index e4d2138056c4008307a6c23c829005447535553f..050735afc0ff44a91c401da853a11a707ffd7a22 100644 (file)
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -3388,9 +3388,9 @@ bool llama_model::load_tensors(llama_model_loader & ml) {
                          layer.wo = create_tensor(tn(LLM_TENSOR_ATTN_OUT, "weight", i), {n_embd, n_embd}, 0);
  
                          // optional bias tensors
-                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, 0);
-                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, 0);
-                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, 0);
+                        layer.bq = create_tensor(tn(LLM_TENSOR_ATTN_Q,   "bias", i), {n_embd}, TENSOR_NOT_REQUIRED);
+                        layer.bk = create_tensor(tn(LLM_TENSOR_ATTN_K,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
+                        layer.bv = create_tensor(tn(LLM_TENSOR_ATTN_V,   "bias", i), {n_embd_gqa}, TENSOR_NOT_REQUIRED);
  
                          layer.ffn_norm = create_tensor(tn(LLM_TENSOR_FFN_NORM, "weight", i), {n_embd}, 0);
  
diff --git a/src/llama-vocab.cpp b/src/llama-vocab.cpp

index e2cca66e48fe7f75620f1528d030f222ba6946b6..7b01a2edfe1f21cd09d603829666d6fa13efe54e 100644 (file)
--- a/src/llama-vocab.cpp
+++ b/src/llama-vocab.cpp
@@ -1895,7 +1895,8 @@ void llama_vocab::impl::load(llama_model_loader & ml, const LLM_KV & kv) {
                  clean_spaces = false;
              } else if (
                      tokenizer_pre == "qwen2" ||
-                    tokenizer_pre == "deepseek-r1-qwen") {
+                    tokenizer_pre == "deepseek-r1-qwen" ||
+                    tokenizer_pre == "kormo") {
                  pre_type = LLAMA_VOCAB_PRE_TYPE_QWEN2;
                  clean_spaces = false;
              } else if (
diff --git a/src/models/qwen2.cpp b/src/models/qwen2.cpp

index 587a932426fb846ec94a7e135e15040e880900bd..3da4dea3c16752b35040398d6fd6d5b293b9e928 100644 (file)
--- a/src/models/qwen2.cpp
+++ b/src/models/qwen2.cpp
@@ -31,16 +31,25 @@ llm_build_qwen2::llm_build_qwen2(const llama_model & model, const llm_graph_para
          {
              // compute Q and K and RoPE them
              ggml_tensor * Qcur = build_lora_mm(model.layers[il].wq, cur);
-            Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
              cb(Qcur, "Qcur", il);
+            if (model.layers[il].bq) {
+                Qcur = ggml_add(ctx0, Qcur, model.layers[il].bq);
+                cb(Qcur, "Qcur", il);
+            }
  
              ggml_tensor * Kcur = build_lora_mm(model.layers[il].wk, cur);
-            Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
              cb(Kcur, "Kcur", il);
+            if (model.layers[il].bk) {
+                Kcur = ggml_add(ctx0, Kcur, model.layers[il].bk);
+                cb(Kcur, "Kcur", il);
+            }
  
              ggml_tensor * Vcur = build_lora_mm(model.layers[il].wv, cur);
-            Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
              cb(Vcur, "Vcur", il);
+            if (model.layers[il].bv) {
+                Vcur = ggml_add(ctx0, Vcur, model.layers[il].bv);
+                cb(Vcur, "Vcur", il);
+            }
  
              Qcur = ggml_reshape_3d(ctx0, Qcur, n_embd_head, n_head,    n_tokens);
              Kcur = ggml_reshape_3d(ctx0, Kcur, n_embd_head, n_head_kv, n_tokens);
author	HelloKS <redacted>
	Mon, 15 Dec 2025 17:51:43 +0000 (02:51 +0900)
committer	GitHub <redacted>
	Mon, 15 Dec 2025 17:51:43 +0000 (18:51 +0100)
convert_hf_to_gguf.py		patch | blob | history
convert_hf_to_gguf_update.py		patch | blob | history
gguf-py/gguf/tensor_mapping.py		patch | blob | history
src/llama-model.cpp		patch | blob | history
src/llama-vocab.cpp		patch | blob | history
src/models/qwen2.cpp		patch | blob | history