model : add HunyuanOCR support (#21395)
author    Richard Davison <redacted>
Sun, 5 Apr 2026 21:32:14 +0000 (23:32 +0200)
committer GitHub <redacted>
Sun, 5 Apr 2026 21:32:14 +0000 (23:32 +0200)
* HunyuanOCR: add support for text and vision models

- Add HunyuanOCR vision projector (perceiver-based) with Conv2d merge
- Add separate HUNYUAN_OCR chat template (content-before-role format)
- Handle HunyuanOCR's invalid pad_token_id=-1 in converter
- Fix EOS/EOT token IDs from generation_config.json
- Support xdrope RoPE scaling type
- Add tensor mappings for perceiver projector (mm.before_rms, mm.after_rms, etc.)
- Register HunYuanVLForConditionalGeneration for both text and mmproj conversion

* fix proper mapping

* Update gguf-py/gguf/tensor_mapping.py

Co-authored-by: Xuan-Son Nguyen <redacted>
* Update tools/mtmd/clip.cpp

Co-authored-by: Xuan-Son Nguyen <redacted>
* address comments

* update

* Fix typecheck

* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <redacted>
* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <redacted>
* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <redacted>
* Update convert_hf_to_gguf.py

Co-authored-by: Sigbjørn Skjæret <redacted>
---------

Co-authored-by: Xuan-Son Nguyen <redacted>
Co-authored-by: Sigbjørn Skjæret <redacted>
12 files changed:
convert_hf_to_gguf.py
gguf-py/gguf/constants.py
gguf-py/gguf/tensor_mapping.py
src/llama-chat.cpp
src/llama-chat.h
tools/mtmd/CMakeLists.txt
tools/mtmd/clip-impl.h
tools/mtmd/clip-model.h
tools/mtmd/clip.cpp
tools/mtmd/models/hunyuanocr.cpp [new file with mode: 0644]
tools/mtmd/models/models.h
tools/mtmd/mtmd.cpp

index d4929d6b6f852894f9313fd69e66239c46984f73..7ba6f6a7425f56b77e1aceded16a47a576abb358 100755 (executable)
@@ -11521,13 +11521,50 @@ class LLaDAMoEModel(TextModel):
                 raise ValueError(f"Unprocessed experts: {experts}")
 
 
-@ModelBase.register("HunYuanDenseV1ForCausalLM")
+@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration")
 class HunYuanModel(TextModel):
     model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
 
+    def _get_eod_token_id(self) -> int | None:
+        """Get the actual end-of-generation token from config (eod_token_id)."""
+        return self.hparams.get("eod_token_id")
+
+    def _get_eot_token_id(self) -> int | None:
+        """Get the end-of-turn token from generation_config.json.
+        This is the first entry in eos_token_id when it's a list."""
+        gen_cfg_path = self.dir_model / "generation_config.json"
+        if gen_cfg_path.is_file():
+            with open(gen_cfg_path, encoding="utf-8") as f:
+                gen_cfg = json.load(f)
+            eos = gen_cfg.get("eos_token_id")
+            if isinstance(eos, list) and len(eos) >= 2:
+                return eos[0]
+        return None
+
+    def _fix_special_tokens(self):
+        """Fix EOS/EOT tokens that are incorrect in upstream configs."""
+        eod_id = self._get_eod_token_id()
+        if eod_id is not None:
+            self.gguf_writer.add_eos_token_id(eod_id)
+        eot_id = self._get_eot_token_id()
+        if eot_id is not None:
+            self.gguf_writer.add_eot_token_id(eot_id)
+
     def set_vocab(self):
         if (self.dir_model / "tokenizer.json").is_file():
-            self._set_vocab_gpt2()
+            tokens, toktypes, tokpre = self.get_vocab_base()
+            self.gguf_writer.add_tokenizer_model("gpt2")
+            self.gguf_writer.add_tokenizer_pre(tokpre)
+            self.gguf_writer.add_token_list(tokens)
+            self.gguf_writer.add_token_types(toktypes)
+
+            # HunyuanOCR has pad_token_id=-1 in config.json; exclude pad from SpecialVocab
+            token_types = None
+            if (self.hparams.get("pad_token_id") or 0) < 0:
+                token_types = ('bos', 'eos', 'unk', 'sep', 'cls', 'mask')
+            special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True, special_token_types=token_types)
+            special_vocab.add_to_gguf(self.gguf_writer)
+            self._fix_special_tokens()
         else:
             from transformers import AutoTokenizer
             tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
@@ -11579,13 +11616,18 @@ class HunYuanModel(TextModel):
             # FIX for BOS token: Overwrite incorrect id read from config.json
             if self.hparams['hidden_size'] == 4096:
                 self.gguf_writer.add_bos_token_id(127958) # only for 7b dense, fix <|bos|> token
+            self._fix_special_tokens()
 
     def set_gguf_parameters(self):
+        # HunyuanOCR has num_experts=1 which is not MoE, prevent parent from writing it
+        saved_num_experts = self.hparams.pop("num_experts", None)
         super().set_gguf_parameters()
+        if saved_num_experts is not None and saved_num_experts > 1:
+            self.hparams["num_experts"] = saved_num_experts
         hparams = self.hparams
 
         # Rope
-        if self.rope_parameters.get("rope_type") == "dynamic":
+        if self.rope_parameters.get("rope_type") in ("dynamic", "xdrope"):
             # HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
             # 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
             alpha = self.rope_parameters.get("alpha", 50)
@@ -11595,13 +11637,14 @@ class HunYuanModel(TextModel):
             self.gguf_writer.add_rope_freq_base(scaled_base)
             self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
             self.gguf_writer.add_rope_scaling_factor(1)
-            # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
-            self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
-            self.gguf_writer.add_context_length(256 * 1024) # 256k context length
+            if self.rope_parameters.get("rope_type") == "dynamic":
+                # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
+                self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
+                self.gguf_writer.add_context_length(256 * 1024) # 256k context length
 
-            # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
-            assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
-                "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
+                # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
+                assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
+                    "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
 
     def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
         if name == "lm_head.weight":
@@ -11609,8 +11652,47 @@ class HunYuanModel(TextModel):
                 logger.info("Skipping tied output layer 'lm_head.weight'")
                 return
 
+        # skip vision tensors for HunyuanVL models
+        if name.startswith("vit."):
+            return
+
+        yield from super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("HunYuanVLForConditionalGeneration")
+class HunyuanOCRVisionModel(MmprojModel):
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        # HunyuanOCR uses max_image_size instead of image_size
+        if "image_size" not in self.hparams_vision:
+            self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        assert self.hparams_vision is not None
+        hparams = self.hparams_vision
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
+        self.gguf_writer.add_vision_use_gelu(True)
+        self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5))
+        self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2))
+        self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
+        self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if not name.startswith("vit."):
+            return  # skip text tensors
+        # strip CLS token (row 0) from position embeddings so resize_position_embeddings works
+        if "position_embedding" in name:
+            data_torch = data_torch[1:]  # [n_patches+1, n_embd] -> [n_patches, n_embd]
         yield from super().modify_tensors(data_torch, name, bid)
 
+    def tensor_force_quant(self, name, new_name, bid, n_dims):
+        # force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
+        if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
+            return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
+        return super().tensor_force_quant(name, new_name, bid, n_dims)
+
 
 @ModelBase.register("SmolLM3ForCausalLM")
 class SmolLM3Model(LlamaModel):
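For reference, the "dynamic"/"xdrope" branch above folds the NTK-aware alpha into a plain frequency base instead of emitting a RoPE scaling type. A minimal sketch of that calculation, assuming the standard NTK-aware formula base' = base * alpha^(d / (d - 2)); the exact expression sits outside the hunks shown, and the names below are illustrative:

    # illustrative sketch of NTK-aware alpha scaling; assumed formula, not copied from this diff
    def ntk_scaled_base(base: float, alpha: float, head_dim: int) -> float:
        # base' = base * alpha^(d / (d - 2))
        return base * alpha ** (head_dim / (head_dim - 2))

    # e.g. base=10000, alpha=1000, head_dim=128 -> ~1.1e7; this is the kind of value written via
    # add_rope_freq_base() while the scaling type is set to RopeScalingType.NONE
    print(f"{ntk_scaled_base(10000.0, 1000.0, 128):.3e}")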
index 3ebd9de5f6ed61194ede4ba653fbadc9fdf4e252..6b1a19a3097ae0a1a89fd650ca0311a642c93219 100644 (file)
@@ -734,6 +734,7 @@ class MODEL_TENSOR(IntEnum):
     V_LAYER_OUT_SCALE    = auto()
     V_PRE_NORM           = auto()
     V_POST_NORM          = auto()
+    V_MM_PRE_NORM        = auto() # hunyuanocr
     V_MM_POST_NORM       = auto()
     V_MM_INP_NORM        = auto()
     V_MM_INP_PROJ        = auto() # gemma3
@@ -769,6 +770,8 @@ class MODEL_TENSOR(IntEnum):
     V_MM_GATE            = auto() # cogvlm
     V_TOK_BOI            = auto() # cogvlm
     V_TOK_EOI            = auto() # cogvlm
+    V_TOK_IMG_BEGIN      = auto() # hunyuanocr
+    V_TOK_IMG_END        = auto() # hunyuanocr
     V_STD_BIAS           = auto() # gemma4
     V_STD_SCALE          = auto() # gemma4
     V_SAM_POS_EMBD       = auto() # Deepseek-OCR
@@ -1246,6 +1249,9 @@ TENSOR_NAMES: dict[MODEL_TENSOR, str] = {
     MODEL_TENSOR.V_MM_GATE:                 "mm.gate",
     MODEL_TENSOR.V_TOK_BOI:                 "v.boi",
     MODEL_TENSOR.V_TOK_EOI:                 "v.eoi",
+    MODEL_TENSOR.V_MM_PRE_NORM:             "mm.pre_norm",
+    MODEL_TENSOR.V_TOK_IMG_BEGIN:           "mm.image_begin",
+    MODEL_TENSOR.V_TOK_IMG_END:             "mm.image_end",
     MODEL_TENSOR.V_STD_BIAS:                "v.std_bias", # gemma4
     MODEL_TENSOR.V_STD_SCALE:               "v.std_scale", # gemma4
     # DeepSeek-OCR SAM
@@ -1393,6 +1399,9 @@ MODEL_TENSORS: dict[MODEL_ARCH, list[MODEL_TENSOR]] = {
         MODEL_TENSOR.V_MM_GATE,
         MODEL_TENSOR.V_TOK_BOI,
         MODEL_TENSOR.V_TOK_EOI,
+        MODEL_TENSOR.V_MM_PRE_NORM,
+        MODEL_TENSOR.V_TOK_IMG_BEGIN,
+        MODEL_TENSOR.V_TOK_IMG_END,
         MODEL_TENSOR.V_STD_BIAS,
         MODEL_TENSOR.V_STD_SCALE,
         MODEL_TENSOR.V_SAM_POS_EMBD,
@@ -4113,6 +4122,7 @@ class VisionProjectorType:
     GLM4V = "glm4v"
     YOUTUVL = "youtuvl"
     NEMOTRON_V2_VL = "nemotron_v2_vl"
+    HUNYUANOCR     = "hunyuanocr"
 
 
 # Items here are (block size, type size)
index a7c7ce4640828c1b75f4e05a8be44f3a928bfbaf..1c324976c3e207da38a1576be1e0ef704fe808ea 100644 (file)
@@ -1359,6 +1359,7 @@ class TensorNameMap:
             "visual.merger.mlp.{bid}", # qwen2vl
             "mlp_AR.linear_{bid}", # PaddleOCR-VL
             "merger.mlp.{bid}",
+            "vit.perceive.proj.{bid}", # HunyuanOCR (proj.0 = conv1, proj.2 = conv2)
         ),
 
         MODEL_TENSOR.V_MMPROJ_FC: (
@@ -1366,6 +1367,7 @@ class TensorNameMap:
             "model.vision.linear_proj.linear_proj", # cogvlm
             "model.projector.layers", # Deepseek-OCR
             "visual.merger.proj", # glm4v
+            "vit.perceive.mlp", # HunyuanOCR
         ),
 
         MODEL_TENSOR.V_MMPROJ_MLP: (
@@ -1393,6 +1395,7 @@ class TensorNameMap:
             "model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
             "vpm.embeddings.patch_embedding",
             "model.vision_model.embeddings.patch_embedding", # SmolVLM
+            "vit.embeddings.patch_embedding", # HunyuanOCR
             "vision_tower.patch_conv", # pixtral-hf
             "vision_encoder.patch_conv", # pixtral
             "vision_model.patch_embedding.linear", # llama 4
@@ -1414,6 +1417,7 @@ class TensorNameMap:
             "model.vision_tower.embeddings.position_embeddings", # Intern-S1
             "vpm.embeddings.position_embedding",
             "model.vision_model.embeddings.position_embedding", # SmolVLM
+            "vit.embeddings.position_embedding", # HunyuanOCR
             "vision_model.positional_embedding_vlm", # llama 4
             "vision_tower.patch_embed.pos_emb", # kimi-vl
             "visual.pos_embed", # qwen3vl
@@ -1425,10 +1429,12 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
             "model.image_newline",  # Deepseek-OCR
+            "vit.perceive.image_newline", # HunyuanOCR
         ),
 
         MODEL_TENSOR.V_ENC_EMBD_VSEP: (
             "model.view_seperator",  # Deepseek-OCR
+            "vit.perceive.image_sep", # HunyuanOCR
         ),
 
         MODEL_TENSOR.V_ENC_ATTN_QKV: (
@@ -1444,6 +1450,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1
             "vpm.encoder.layers.{bid}.self_attn.q_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
+            "vit.layers.{bid}.self_attn.q_proj", # HunyuanOCR
             "vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
@@ -1466,6 +1473,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1
             "vpm.encoder.layers.{bid}.self_attn.k_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
+            "vit.layers.{bid}.self_attn.k_proj", # HunyuanOCR
             "vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
@@ -1488,6 +1496,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1
             "vpm.encoder.layers.{bid}.self_attn.v_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
+            "vit.layers.{bid}.self_attn.v_proj", # HunyuanOCR
             "vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
@@ -1504,6 +1513,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1
             "vpm.encoder.layers.{bid}.layer_norm1",
             "model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
+            "vit.layers.{bid}.input_layernorm", # HunyuanOCR
             "vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
             "vision_model.model.layers.{bid}.input_layernorm", # llama4, gemma4
@@ -1521,6 +1531,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
             "vpm.encoder.layers.{bid}.self_attn.out_proj",
             "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
+            "vit.layers.{bid}.self_attn.o_proj", # HunyuanOCR
             "model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro
             "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
             "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
@@ -1540,6 +1551,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1
             "vpm.encoder.layers.{bid}.layer_norm2",
             "model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
+            "vit.layers.{bid}.post_attention_layernorm", # HunyuanOCR
             "vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
             "vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral
@@ -1557,6 +1569,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
             "vpm.encoder.layers.{bid}.mlp.fc1",
             "model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
+            "vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanOCR
             "vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral
             "vision_model.model.layers.{bid}.mlp.fc1", # llama4
@@ -1583,6 +1596,7 @@ class TensorNameMap:
             "model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
             "vpm.encoder.layers.{bid}.mlp.fc2",
             "model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
+            "vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanOCR
             "vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf
             "vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral
             "vision_model.model.layers.{bid}.mlp.fc2", # llama4
@@ -1639,6 +1653,7 @@ class TensorNameMap:
 
         MODEL_TENSOR.V_MM_POST_NORM: (
             "visual.merger.post_projection_norm", # glm4v
+            "vit.perceive.after_rms", # HunyuanOCR
         ),
 
         MODEL_TENSOR.V_MM_INP_PROJ: (
@@ -1806,6 +1821,18 @@ class TensorNameMap:
             "model.vision.eoi", # cogvlm
         ),
 
+        MODEL_TENSOR.V_MM_PRE_NORM: (
+            "vit.perceive.before_rms", # HunyuanOCR
+        ),
+
+        MODEL_TENSOR.V_TOK_IMG_BEGIN: (
+            "vit.perceive.image_begin", # HunyuanOCR
+        ),
+
+        MODEL_TENSOR.V_TOK_IMG_END: (
+            "vit.perceive.image_end", # HunyuanOCR
+        ),
+
         MODEL_TENSOR.V_STD_BIAS: (
             "model.vision_tower.std_bias", # gemma4
         ),
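To see how the new HunyuanOCR entries resolve, a short sketch against gguf-py's TensorNameMap; gguf.MODEL_ARCH.MMPROJ (the arch used for mmproj tensors) and the expected outputs in the comments are assumptions based on the mappings above, and the block count is arbitrary:

    # illustrative sketch; expected names follow from the mappings added in this change
    import gguf

    tmap = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, 32)
    # perceiver conv weights: vit.perceive.proj.0 -> mm.0, vit.perceive.proj.2 -> mm.2
    print(tmap.get_name("vit.perceive.proj.0.weight", try_suffixes=(".weight", ".bias")))      # mm.0.weight
    # HunyuanOCR-specific projector tensors
    print(tmap.get_name("vit.perceive.before_rms.weight", try_suffixes=(".weight", ".bias")))  # mm.pre_norm.weight
    print(tmap.get_name("vit.perceive.image_begin", try_suffixes=(".weight", ".bias")))        # mm.image_begin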
index 80a88fadec7e60d9f7c20fa216b922518ee38b68..6554a89b28a6cb3b06d58865e7c112d2c51b50b3 100644 (file)
@@ -73,6 +73,7 @@ static const std::map<std::string, llm_chat_template> LLM_CHAT_TEMPLATES = {
     { "hunyuan-moe",       LLM_CHAT_TEMPLATE_HUNYUAN_MOE       },
     { "gpt-oss",           LLM_CHAT_TEMPLATE_OPENAI_MOE        },
     { "hunyuan-dense",     LLM_CHAT_TEMPLATE_HUNYUAN_DENSE     },
+    { "hunyuan-ocr",       LLM_CHAT_TEMPLATE_HUNYUAN_OCR       },
     { "kimi-k2",           LLM_CHAT_TEMPLATE_KIMI_K2           },
     { "seed_oss",          LLM_CHAT_TEMPLATE_SEED_OSS          },
     { "grok-2",            LLM_CHAT_TEMPLATE_GROK_2            },
@@ -216,6 +217,8 @@ llm_chat_template llm_chat_detect_template(const std::string & tmpl) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
     } else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
         return LLM_CHAT_TEMPLATE_OPENAI_MOE;
+    } else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_begin▁of▁sentence|>")) {
+        return LLM_CHAT_TEMPLATE_HUNYUAN_OCR;
     } else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
         return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
     } else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
@@ -822,6 +825,22 @@ int32_t llm_chat_apply_template(
                 ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
             }
         }
+    } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_OCR) {
+        // tencent/HunyuanOCR
+        ss << "<|hy_begin▁of▁sentence|>";
+        for (size_t i = 0; i < chat.size(); i++) {
+            std::string role(chat[i]->role);
+            if (i == 0 && role == "system") {
+                ss << chat[i]->content << "<|hy_place▁holder▁no▁3|>";
+                continue;
+            }
+
+            if (role == "user") {
+                ss << chat[i]->content << "<|hy_User|>";
+            } else if (role == "assistant") {
+                ss << chat[i]->content << "<|hy_Assistant|>";
+            }
+        }
     } else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
         // moonshotai/Kimi-K2-Instruct
         for (auto message : chat) {
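Rendered, the hunyuan-ocr template above is content-before-role: for a system message S followed by a user message U, the loop produces

    <|hy_begin▁of▁sentence|>S<|hy_place▁holder▁no▁3|>U<|hy_User|>

i.e. each message's text is emitted first and the role marker closes it. Because the OCR template also uses <|hy_place▁holder▁no▁3|>, the detection check is placed before the hunyuan-dense one and keys on <|hy_begin▁of▁sentence|>; handling of the trailing assistant prompt (add_ass) is outside the hunk shown.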
index 2542f3cc865a3220787d2787d010055d503049b2..13f936a946c495668559adb0d6ad4f1228f0a2c1 100644 (file)
@@ -53,6 +53,7 @@ enum llm_chat_template {
     LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
     LLM_CHAT_TEMPLATE_OPENAI_MOE,
     LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
+    LLM_CHAT_TEMPLATE_HUNYUAN_OCR,
     LLM_CHAT_TEMPLATE_KIMI_K2,
     LLM_CHAT_TEMPLATE_SEED_OSS,
     LLM_CHAT_TEMPLATE_GROK_2,
index 675464c6b5f96e3e2efd21d40295728e910d8416..6ffdb674deadf03682e563cf5751d89edf1ea66c 100644 (file)
@@ -19,6 +19,7 @@ add_library(mtmd
             models/conformer.cpp
             models/gemma4v.cpp
             models/glm4v.cpp
+            models/hunyuanocr.cpp
             models/internvl.cpp
             models/kimivl.cpp
             models/kimik25.cpp
index 5fa487367cd19670d3c72e23b48b4f0c920dbb69..1f2f7cfaacd172dd894ecb8933ea734940a96dec 100644 (file)
 #define TN_TOK_BOI         "v.boi"
 #define TN_TOK_EOI         "v.eoi"
 
+// hunyuanocr
+#define TN_MM_PRE_NORM     "mm.pre_norm.%s"
+#define TN_TOK_IMG_BEGIN   "mm.image_begin"
+#define TN_TOK_IMG_END     "mm.image_end"
+
 // deepseek-ocr
 #define TN_SAM_POS_EMBD   "v.sam.pos_embd.%s"
 #define TN_SAM_PATCH_EMBD "v.sam.patch_embd.%s"
@@ -266,6 +271,7 @@ enum projector_type {
     PROJECTOR_TYPE_YOUTUVL,
     PROJECTOR_TYPE_KIMIK25,
     PROJECTOR_TYPE_NEMOTRON_V2_VL,
+    PROJECTOR_TYPE_HUNYUANOCR,
     PROJECTOR_TYPE_UNKNOWN,
 };
 
@@ -306,6 +312,7 @@ static std::map<projector_type, std::string> PROJECTOR_TYPE_NAMES = {
     { PROJECTOR_TYPE_YOUTUVL,   "youtuvl"},
     { PROJECTOR_TYPE_KIMIK25,   "kimik25"},
     { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
+    { PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"},
 };
 
 static projector_type clip_projector_type_from_string(const std::string & str) {
index 70270d6e76b3f50e930430ec3e1487de73ab5004..b85c4122ed58c8b1db306490c461f100aba1ebd9 100644 (file)
@@ -358,7 +358,8 @@ struct clip_model {
     // MINICPMV projection
     ggml_tensor * mm_model_pos_embed_k = nullptr;
     ggml_tensor * mm_model_query = nullptr;
-    ggml_tensor * mm_model_proj = nullptr;
+    ggml_tensor * mm_model_proj   = nullptr;
+    ggml_tensor * mm_model_proj_b = nullptr;
     ggml_tensor * mm_model_kv_proj = nullptr;
     ggml_tensor * mm_model_attn_q_w = nullptr;
     ggml_tensor * mm_model_attn_q_b = nullptr;
@@ -419,6 +420,11 @@ struct clip_model {
     ggml_tensor * mm_boi = nullptr;
     ggml_tensor * mm_eoi = nullptr;
 
+    // hunyuanocr perceiver
+    ggml_tensor * mm_pre_norm_w  = nullptr;
+    ggml_tensor * mm_img_begin   = nullptr;
+    ggml_tensor * mm_img_end     = nullptr;
+
     // deepseek ocr sam
     ggml_tensor * patch_embed_proj_w = nullptr;
     ggml_tensor * patch_embed_proj_b = nullptr;
index 12517123e7c26c09372067e6f21aea9d0e02cd0f..2faf595a9fc709d9428e9eab7f32a1dd75e9e90e 100644 (file)
@@ -902,6 +902,10 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
             {
                 builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
             } break;
+        case PROJECTOR_TYPE_HUNYUANOCR:
+            {
+                builder = std::make_unique<clip_graph_hunyuanocr>(ctx, img);
+            } break;
         case PROJECTOR_TYPE_MLP:
         case PROJECTOR_TYPE_MLP_NORM:
         case PROJECTOR_TYPE_LDP:
@@ -1408,6 +1412,14 @@ struct clip_model_loader {
                         get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
                         get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
                      } break;
+                case PROJECTOR_TYPE_HUNYUANOCR:
+                    {
+                        hparams.n_merge = 2;
+                        get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
+                        get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
+                        get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
+                        hparams.set_warmup_n_tokens(28*28);
+                    } break;
                 case PROJECTOR_TYPE_LFM2A:
                     {
                         // audio preprocessing params
@@ -2035,6 +2047,22 @@ struct clip_model_loader {
                     model.mm_boi            = get_tensor(TN_TOK_BOI);
                     model.mm_eoi            = get_tensor(TN_TOK_EOI);
                 } break;
+            case PROJECTOR_TYPE_HUNYUANOCR:
+                {
+                    // proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear)
+                    model.mm_0_w            = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
+                    model.mm_0_b            = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
+                    model.mm_1_w            = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+                    model.mm_1_b            = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+                    model.mm_model_proj     = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
+                    model.mm_model_proj_b   = get_tensor(string_format(TN_MM_PROJECTOR, "bias"));
+                    model.mm_pre_norm_w     = get_tensor(string_format(TN_MM_PRE_NORM, "weight"));
+                    model.mm_post_norm_w    = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
+                    model.mm_img_begin      = get_tensor(TN_TOK_IMG_BEGIN);
+                    model.mm_img_end        = get_tensor(TN_TOK_IMG_END);
+                    model.image_newline     = get_tensor(TN_IMAGE_NEWLINE);
+                    model.view_seperator    = get_tensor(TN_IMAGE_SEPERATOR, false);
+                } break;
             case PROJECTOR_TYPE_JANUS_PRO:
                 {
                     model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
@@ -2584,6 +2612,7 @@ int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 *
         case PROJECTOR_TYPE_QWEN3VL:
         case PROJECTOR_TYPE_GLM4V:
         case PROJECTOR_TYPE_PADDLEOCR:
+        case PROJECTOR_TYPE_HUNYUANOCR:
         case PROJECTOR_TYPE_YOUTUVL:
             return (img->nx / params.patch_size) / 2;
         default:
@@ -2768,6 +2797,13 @@ int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * im
             int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
             n_patches = h * (h + 1) + 1;
         } break;
+        case PROJECTOR_TYPE_HUNYUANOCR:
+            {
+                int merge = ctx->model.hparams.n_merge;
+                int ow = (img->nx / patch_size) / merge;
+                int oh = (img->ny / patch_size) / merge;
+                n_patches = (ow + 1) * oh + 2;
+            } break;
         case PROJECTOR_TYPE_LFM2A:
             {
                 n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
@@ -3175,6 +3211,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_JANUS_PRO:
         case PROJECTOR_TYPE_PHI4:
         case PROJECTOR_TYPE_COGVLM:
+        case PROJECTOR_TYPE_HUNYUANOCR:
             {
                 // do nothing
             } break;
@@ -3346,6 +3383,8 @@ int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
         case PROJECTOR_TYPE_PADDLEOCR:
         case PROJECTOR_TYPE_KIMIK25:
             return ctx->model.mm_2_w->ne[1];
+        case PROJECTOR_TYPE_HUNYUANOCR:
+            return ctx->model.mm_model_proj->ne[1];
         case PROJECTOR_TYPE_COGVLM:
             return ctx->model.mm_4h_to_h_w->ne[1];
         case PROJECTOR_TYPE_DEEPSEEKOCR:
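The new clip_n_output_tokens case counts one token per merged patch plus one newline token per output row and two wrapper tokens. A quick check with illustrative numbers (patch_size = 16 is an assumption here; merge comes from spatial_merge_size):

    # illustrative token-count check for PROJECTOR_TYPE_HUNYUANOCR; patch_size is assumed
    patch_size, merge = 16, 2
    nx, ny = 1024, 768                                                 # preprocessed image size in pixels
    ow, oh = (nx // patch_size) // merge, (ny // patch_size) // merge  # 32, 24
    n_tokens = (ow + 1) * oh + 2                                       # +1 newline per row, +2 image begin/end
    print(n_tokens)                                                    # 794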
diff --git a/tools/mtmd/models/hunyuanocr.cpp b/tools/mtmd/models/hunyuanocr.cpp
new file mode 100644 (file)
index 0000000..37d1e2b
--- /dev/null
@@ -0,0 +1,59 @@
+#include "models.h"
+
+ggml_cgraph * clip_graph_hunyuanocr::build() {
+    const int merge = hparams.n_merge;
+    const int pw    = n_patches_x;
+    const int ph    = n_patches_y;
+
+    ggml_tensor * pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR);
+
+    ggml_tensor * inp = build_inp();
+    ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr);
+
+    // perceiver projector
+    cur = build_norm(cur, model.mm_pre_norm_w, nullptr, NORM_TYPE_RMS, eps, -1);
+
+    // [C, W*H] -> [W, H, C] for conv2d
+    cur = ggml_reshape_3d(ctx0, cur, n_embd, pw, ph);
+    cur = ggml_permute(ctx0, cur, 2, 0, 1, 3);
+    cur = ggml_cont(ctx0, cur);
+
+    // Conv2d(1152->2304, k=2, s=2) + GELU + Conv2d(2304->4608, k=1, s=1)
+    cur = ggml_conv_2d(ctx0, model.mm_0_w, cur, merge, merge, 0, 0, 1, 1);
+    if (model.mm_0_b) {
+        cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.mm_0_b, 1, 1, model.mm_0_b->ne[0]));
+    }
+    cur = ggml_gelu(ctx0, cur);
+    cur = ggml_conv_2d(ctx0, model.mm_1_w, cur, 1, 1, 0, 0, 1, 1);
+    if (model.mm_1_b) {
+        cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.mm_1_b, 1, 1, model.mm_1_b->ne[0]));
+    }
+
+    const int ow   = pw / merge;
+    const int oh   = ph / merge;
+    const int idim = (int)cur->ne[2]; // OC = 4608
+
+    // append newline along W (dim 0)
+    ggml_tensor * nl = ggml_reshape_4d(ctx0, model.image_newline, 1, 1, idim, 1);
+    nl = ggml_repeat_4d(ctx0, nl, 1, oh, idim, 1);
+    cur = ggml_concat(ctx0, cur, nl, 0);
+
+    // [OW+1, OH, OC] -> [OC, (OW+1)*OH]
+    cur = ggml_permute(ctx0, cur, 1, 2, 0, 3);
+    cur = ggml_cont_2d(ctx0, cur, idim, (ow + 1) * oh);
+
+    // project to LLM hidden size
+    cur = build_mm(model.mm_model_proj, cur);
+    if (model.mm_model_proj_b) {
+        cur = ggml_add(ctx0, cur, model.mm_model_proj_b);
+    }
+
+    // wrap with begin/end tokens
+    cur = ggml_concat(ctx0, ggml_reshape_2d(ctx0, model.mm_img_begin, model.mm_img_begin->ne[0], 1), cur, 1);
+    cur = ggml_concat(ctx0, cur, ggml_reshape_2d(ctx0, model.mm_img_end, model.mm_img_end->ne[0], 1), 1);
+
+    cur = build_norm(cur, model.mm_post_norm_w, nullptr, NORM_TYPE_RMS, eps, -1);
+
+    ggml_build_forward_expand(gf, cur);
+    return gf;
+}
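Following the comments in build(): the ViT output [1152, pw*ph] is RMS pre-normed, reshaped to [pw, ph, 1152] for conv2d, reduced to [ow, oh, 2304] by the stride-2 conv (ow = pw/merge, oh = ph/merge), widened to [ow, oh, 4608] by the 1x1 conv, extended with one image_newline column per row to [ow+1, oh, 4608], flattened to [4608, (ow+1)*oh], projected to the LLM embedding size, wrapped with the mm.image_begin / mm.image_end rows, and RMS post-normed, which matches the (ow + 1) * oh + 2 tokens reported by clip_n_output_tokens.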
index 992eda04bbdbf6b585023edb9e3d002442dcbaa4..6f9632b62abe3bf5b8577ba42129492087c96c70 100644 (file)
@@ -98,6 +98,11 @@ struct clip_graph_glm4v : clip_graph {
     ggml_cgraph * build() override;
 };
 
+struct clip_graph_hunyuanocr : clip_graph {
+    clip_graph_hunyuanocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+    ggml_cgraph * build() override;
+};
+
 struct clip_graph_mobilenetv5 : clip_graph {
     clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
     ggml_cgraph * build() override;
index 35b4396fd878f0590d87f361cdd97ae080051453..4b6dd44f095d3783f28d8ab1a06f8ceb374a2e28 100644 (file)
@@ -406,6 +406,13 @@ struct mtmd_context {
                     img_end = "\n"; // prevent empty batch on llama-server
                     image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
                 } break;
+            case PROJECTOR_TYPE_HUNYUANOCR:
+                {
+                    // note: these use fullwidth | (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
+                    img_beg = "<|hy_place▁holder▁no▁100|>";
+                    img_end = "<|hy_place▁holder▁no▁101|>";
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
+                } break;
             default:
                 throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
         }
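In mtmd, img_beg and img_end are inserted as ordinary text around each image chunk when the multimodal prompt is tokenized, so the stream around an image looks roughly like ...text <|hy_place▁holder▁no▁100|> [vision tokens from the projector] <|hy_place▁holder▁no▁101|> text..., while the learned mm.image_begin / mm.image_end embeddings added in the graph above sit inside the vision chunk itself.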