raise ValueError(f"Unprocessed experts: {experts}")
-@ModelBase.register("HunYuanDenseV1ForCausalLM")
+@ModelBase.register("HunYuanDenseV1ForCausalLM", "HunYuanVLForConditionalGeneration")
class HunYuanModel(TextModel):
model_arch = gguf.MODEL_ARCH.HUNYUAN_DENSE
+ def _get_eod_token_id(self) -> int | None:
+ """Get the actual end-of-generation token from config (eod_token_id).
+
+ Returns the value stored under "eod_token_id" in self.hparams, or None
+ when that key is absent."""
+ return self.hparams.get("eod_token_id")
+
+ def _get_eot_token_id(self) -> int | None:
+ """Get the end-of-turn token from generation_config.json.
+
+ Returns the first entry of eos_token_id when that field is a list with at
+ least two entries. Returns None when generation_config.json does not exist
+ in the model directory, or when eos_token_id is missing, is not a list, or
+ has fewer than two entries."""
+ gen_cfg_path = self.dir_model / "generation_config.json"
+ if gen_cfg_path.is_file():
+ with open(gen_cfg_path, encoding="utf-8") as f:
+ gen_cfg = json.load(f)
+ eos = gen_cfg.get("eos_token_id")
+ # a scalar or single-entry eos_token_id falls through to the final return None
+ if isinstance(eos, list) and len(eos) >= 2:
+ return eos[0]
+ return None
+
+ def _fix_special_tokens(self):
+ """Fix EOS/EOT tokens that are incorrect in upstream configs.
+
+ Writes the id returned by _get_eod_token_id() as the GGUF EOS token and the
+ id returned by _get_eot_token_id() as the GGUF EOT token. Each write is
+ skipped independently when the corresponding lookup returns None."""
+ eod_id = self._get_eod_token_id()
+ if eod_id is not None:
+ self.gguf_writer.add_eos_token_id(eod_id)
+ eot_id = self._get_eot_token_id()
+ if eot_id is not None:
+ self.gguf_writer.add_eot_token_id(eot_id)
+
def set_vocab(self):
if (self.dir_model / "tokenizer.json").is_file():
- self._set_vocab_gpt2()
+ tokens, toktypes, tokpre = self.get_vocab_base()
+ self.gguf_writer.add_tokenizer_model("gpt2")
+ self.gguf_writer.add_tokenizer_pre(tokpre)
+ self.gguf_writer.add_token_list(tokens)
+ self.gguf_writer.add_token_types(toktypes)
+
+ # HunyuanOCR has pad_token_id=-1 in config.json; exclude pad from SpecialVocab
+ token_types = None
+ if (self.hparams.get("pad_token_id") or 0) < 0:
+ token_types = ('bos', 'eos', 'unk', 'sep', 'cls', 'mask')
+ special_vocab = gguf.SpecialVocab(self.dir_model, load_merges=True, special_token_types=token_types)
+ special_vocab.add_to_gguf(self.gguf_writer)
+ self._fix_special_tokens()
else:
from transformers import AutoTokenizer
tokenizer = AutoTokenizer.from_pretrained(self.dir_model, trust_remote_code=True)
# FIX for BOS token: Overwrite incorrect id read from config.json
if self.hparams['hidden_size'] == 4096:
self.gguf_writer.add_bos_token_id(127958) # only for 7b dense, fix <|bos|> token
+ self._fix_special_tokens()
def set_gguf_parameters(self):
+ # HunyuanOCR has num_experts=1 which is not MoE, prevent parent from writing it
+ saved_num_experts = self.hparams.pop("num_experts", None)
super().set_gguf_parameters()
+ if saved_num_experts is not None and saved_num_experts > 1:
+ self.hparams["num_experts"] = saved_num_experts
hparams = self.hparams
# Rope
- if self.rope_parameters.get("rope_type") == "dynamic":
+ if self.rope_parameters.get("rope_type") in ("dynamic", "xdrope"):
# HunYuan uses NTK Aware Alpha based scaling. Original implementation: https://www.reddit.com/r/LocalLLaMA/comments/14lz7j5/ntkaware_scaled_rope_allows_llama_models_to_have/
# 1000 corresponds to a usable context length of 256k (https://github.com/Tencent-Hunyuan/Hunyuan-A13B/blob/main/report/Hunyuan_A13B_Technical_Report.pdf)
alpha = self.rope_parameters.get("alpha", 50)
self.gguf_writer.add_rope_freq_base(scaled_base)
self.gguf_writer.add_rope_scaling_type(gguf.RopeScalingType.NONE)
self.gguf_writer.add_rope_scaling_factor(1)
- # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
- self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
- self.gguf_writer.add_context_length(256 * 1024) # 256k context length
+ if self.rope_parameters.get("rope_type") == "dynamic":
+ # There is no consistent way to calculate ctx from alpha, and the config is incorrectly set to 32k
+ self.gguf_writer.add_rope_scaling_orig_ctx_len(256 * 1024) # 256k context length
+ self.gguf_writer.add_context_length(256 * 1024) # 256k context length
- # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
- assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
- "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
+ # if any of our assumptions about the values are wrong, something has changed and this may need to be updated
+ assert base == 10000.0 and self.hparams["max_position_embeddings"] in [32 * 1024, 256 * 1024] , \
+ "HunYuan dynamic RoPE scaling assumptions changed, please update the logic or context length manually"
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
if name == "lm_head.weight":
logger.info("Skipping tied output layer 'lm_head.weight'")
return
+ # skip vision tensors for HunyuanVL models
+ if name.startswith("vit."):
+ return
+
+ yield from super().modify_tensors(data_torch, name, bid)
+
+
+# The same architecture name is also registered on the text converter (HunYuanModel);
+# this class only exports the tensors whose names start with "vit.".
+@ModelBase.register("HunYuanVLForConditionalGeneration")
+class HunyuanOCRVisionModel(MmprojModel):
+ """Mmproj converter for HunyuanOCR: writes the vision metadata and the "vit."-prefixed tensors."""
+
+ def __init__(self, *args, **kwargs):
+ """Initialise the parent converter, then make sure hparams_vision has an "image_size" entry."""
+ super().__init__(*args, **kwargs)
+ assert self.hparams_vision is not None
+ # HunyuanOCR uses max_image_size instead of image_size
+ # (2048 is used when max_image_size is missing as well)
+ if "image_size" not in self.hparams_vision:
+ self.hparams_vision["image_size"] = self.hparams_vision.get("max_image_size", 2048)
+
+ def set_gguf_parameters(self):
+ """Write the parent's metadata, then the HunyuanOCR projector type and vision settings."""
+ super().set_gguf_parameters()
+ assert self.hparams_vision is not None
+ hparams = self.hparams_vision
+ self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.HUNYUANOCR)
+ self.gguf_writer.add_vision_use_gelu(True)
+ # rms_norm_eps and spatial_merge_size default to 1e-5 and 2 when absent from the vision config
+ self.gguf_writer.add_vision_attention_layernorm_eps(hparams.get("rms_norm_eps", 1e-5))
+ self.gguf_writer.add_vision_spatial_merge_size(hparams.get("spatial_merge_size", 2))
+ # min_pixels / max_pixels are indexed without a default, so both keys must be present
+ # in the preprocessor config
+ self.gguf_writer.add_vision_min_pixels(self.preprocessor_config["min_pixels"])
+ self.gguf_writer.add_vision_max_pixels(self.preprocessor_config["max_pixels"])
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ """Yield only tensors whose name starts with "vit."; the first row of any tensor whose
+ name contains "position_embedding" is dropped before delegating to the parent."""
+ if not name.startswith("vit."):
+ return # skip text tensors
+ # strip CLS token (row 0) from position embeddings so resize_position_embeddings works
+ if "position_embedding" in name:
+ data_torch = data_torch[1:] # [n_patches+1, n_embd] -> [n_patches, n_embd]
yield from super().modify_tensors(data_torch, name, bid)
+ def tensor_force_quant(self, name, new_name, bid, n_dims):
+ """Return F16 (when self.ftype is MOSTLY_F16) or F32 (any other ftype) for ".weight"
+ tensors whose new name contains "mm.0." or "mm.2."; defer to the parent otherwise."""
+ # force conv weights to F32 or F16 to avoid BF16 IM2COL issues on Metal
+ if ("mm.0." in new_name or "mm.2." in new_name) and new_name.endswith(".weight"):
+ return gguf.GGMLQuantizationType.F16 if self.ftype == gguf.LlamaFileType.MOSTLY_F16 else gguf.GGMLQuantizationType.F32
+ return super().tensor_force_quant(name, new_name, bid, n_dims)
+
@ModelBase.register("SmolLM3ForCausalLM")
class SmolLM3Model(LlamaModel):
V_LAYER_OUT_SCALE = auto()
V_PRE_NORM = auto()
V_POST_NORM = auto()
+ V_MM_PRE_NORM = auto() # hunyuanocr
V_MM_POST_NORM = auto()
V_MM_INP_NORM = auto()
V_MM_INP_PROJ = auto() # gemma3
V_MM_GATE = auto() # cogvlm
V_TOK_BOI = auto() # cogvlm
V_TOK_EOI = auto() # cogvlm
+ V_TOK_IMG_BEGIN = auto() # hunyuanocr
+ V_TOK_IMG_END = auto() # hunyuanocr
V_STD_BIAS = auto() # gemma4
V_STD_SCALE = auto() # gemma4
V_SAM_POS_EMBD = auto() # Deepseek-OCR
MODEL_TENSOR.V_MM_GATE: "mm.gate",
MODEL_TENSOR.V_TOK_BOI: "v.boi",
MODEL_TENSOR.V_TOK_EOI: "v.eoi",
+ MODEL_TENSOR.V_MM_PRE_NORM: "mm.pre_norm",
+ MODEL_TENSOR.V_TOK_IMG_BEGIN: "mm.image_begin",
+ MODEL_TENSOR.V_TOK_IMG_END: "mm.image_end",
MODEL_TENSOR.V_STD_BIAS: "v.std_bias", # gemma4
MODEL_TENSOR.V_STD_SCALE: "v.std_scale", # gemma4
# DeepSeek-OCR SAM
MODEL_TENSOR.V_MM_GATE,
MODEL_TENSOR.V_TOK_BOI,
MODEL_TENSOR.V_TOK_EOI,
+ MODEL_TENSOR.V_MM_PRE_NORM,
+ MODEL_TENSOR.V_TOK_IMG_BEGIN,
+ MODEL_TENSOR.V_TOK_IMG_END,
MODEL_TENSOR.V_STD_BIAS,
MODEL_TENSOR.V_STD_SCALE,
MODEL_TENSOR.V_SAM_POS_EMBD,
GLM4V = "glm4v"
YOUTUVL = "youtuvl"
NEMOTRON_V2_VL = "nemotron_v2_vl"
+ HUNYUANOCR = "hunyuanocr"
# Items here are (block size, type size)
"visual.merger.mlp.{bid}", # qwen2vl
"mlp_AR.linear_{bid}", # PaddleOCR-VL
"merger.mlp.{bid}",
+ "vit.perceive.proj.{bid}", # HunyuanOCR (proj.0 = conv1, proj.2 = conv2)
),
MODEL_TENSOR.V_MMPROJ_FC: (
"model.vision.linear_proj.linear_proj", # cogvlm
"model.projector.layers", # Deepseek-OCR
"visual.merger.proj", # glm4v
+ "vit.perceive.mlp", # HunyuanOCR
),
MODEL_TENSOR.V_MMPROJ_MLP: (
"model.vision_tower.embeddings.patch_embeddings.projection", # Intern-S1
"vpm.embeddings.patch_embedding",
"model.vision_model.embeddings.patch_embedding", # SmolVLM
+ "vit.embeddings.patch_embedding", # HunyuanOCR
"vision_tower.patch_conv", # pixtral-hf
"vision_encoder.patch_conv", # pixtral
"vision_model.patch_embedding.linear", # llama 4
"model.vision_tower.embeddings.position_embeddings", # Intern-S1
"vpm.embeddings.position_embedding",
"model.vision_model.embeddings.position_embedding", # SmolVLM
+ "vit.embeddings.position_embedding", # HunyuanOCR
"vision_model.positional_embedding_vlm", # llama 4
"vision_tower.patch_embed.pos_emb", # kimi-vl
"visual.pos_embed", # qwen3vl
MODEL_TENSOR.V_ENC_EMBD_IMGNL: (
"model.image_newline", # Deepseek-OCR
+ "vit.perceive.image_newline", # HunyuanOCR
),
MODEL_TENSOR.V_ENC_EMBD_VSEP: (
"model.view_seperator", # Deepseek-OCR
+ "vit.perceive.image_sep", # HunyuanOCR
),
MODEL_TENSOR.V_ENC_ATTN_QKV: (
"model.vision_tower.encoder.layer.{bid}.attention.q_proj", # Intern-S1
"vpm.encoder.layers.{bid}.self_attn.q_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.q_proj", # SmolVLM
+ "vit.layers.{bid}.self_attn.q_proj", # HunyuanOCR
"vision_model.model.layers.{bid}.self_attn.q_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.q_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention.wq", # pixtral
"model.vision_tower.encoder.layer.{bid}.attention.k_proj", # Intern-S1
"vpm.encoder.layers.{bid}.self_attn.k_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.k_proj", # SmolVLM
+ "vit.layers.{bid}.self_attn.k_proj", # HunyuanOCR
"vision_model.model.layers.{bid}.self_attn.k_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.k_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention.wk", # pixtral
"model.vision_tower.encoder.layer.{bid}.attention.v_proj", # Intern-S1
"vpm.encoder.layers.{bid}.self_attn.v_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.v_proj", # SmolVLM
+ "vit.layers.{bid}.self_attn.v_proj", # HunyuanOCR
"vision_model.model.layers.{bid}.self_attn.v_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.v_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention.wv", # pixtral
"model.vision_tower.encoder.layer.{bid}.layernorm_before", # Intern-S1
"vpm.encoder.layers.{bid}.layer_norm1",
"model.vision_model.encoder.layers.{bid}.layer_norm1", # SmolVLM
+ "vit.layers.{bid}.input_layernorm", # HunyuanOCR
"vision_tower.transformer.layers.{bid}.attention_norm", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.attention_norm", # pixtral
"vision_model.model.layers.{bid}.input_layernorm", # llama4, gemma4
"model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
"vpm.encoder.layers.{bid}.self_attn.out_proj",
"model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
+ "vit.layers.{bid}.self_attn.o_proj", # HunyuanOCR
"model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro
"vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
"vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
"model.vision_tower.encoder.layer.{bid}.layernorm_after", # Intern-S1
"vpm.encoder.layers.{bid}.layer_norm2",
"model.vision_model.encoder.layers.{bid}.layer_norm2", # SmolVLM
+ "vit.layers.{bid}.post_attention_layernorm", # HunyuanOCR
"vision_model.model.layers.{bid}.post_attention_layernorm", # llama4
"vision_tower.transformer.layers.{bid}.ffn_norm", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.ffn_norm", # pixtral
"model.vision_tower.encoder.layer.{bid}.mlp.fc1", # Intern-S1
"vpm.encoder.layers.{bid}.mlp.fc1",
"model.vision_model.encoder.layers.{bid}.mlp.fc1", # SmolVLM, gemma3
+ "vit.layers.{bid}.mlp.dense_h_to_4h", # HunyuanOCR
"vision_tower.transformer.layers.{bid}.feed_forward.up_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.feed_forward.w3", # pixtral
"vision_model.model.layers.{bid}.mlp.fc1", # llama4
"model.vision_tower.encoder.layer.{bid}.mlp.fc2", # Intern-S1
"vpm.encoder.layers.{bid}.mlp.fc2",
"model.vision_model.encoder.layers.{bid}.mlp.fc2", # SmolVLM, gemma3
+ "vit.layers.{bid}.mlp.dense_4h_to_h", # HunyuanOCR
"vision_tower.transformer.layers.{bid}.feed_forward.down_proj", # pixtral-hf
"vision_encoder.transformer.layers.{bid}.feed_forward.w2", # pixtral
"vision_model.model.layers.{bid}.mlp.fc2", # llama4
MODEL_TENSOR.V_MM_POST_NORM: (
"visual.merger.post_projection_norm", # glm4v
+ "vit.perceive.after_rms", # HunyuanOCR
),
MODEL_TENSOR.V_MM_INP_PROJ: (
"model.vision.eoi", # cogvlm
),
+ MODEL_TENSOR.V_MM_PRE_NORM: (
+ "vit.perceive.before_rms", # HunyuanOCR
+ ),
+
+ MODEL_TENSOR.V_TOK_IMG_BEGIN: (
+ "vit.perceive.image_begin", # HunyuanOCR
+ ),
+
+ MODEL_TENSOR.V_TOK_IMG_END: (
+ "vit.perceive.image_end", # HunyuanOCR
+ ),
+
MODEL_TENSOR.V_STD_BIAS: (
"model.vision_tower.std_bias", # gemma4
),
{ "hunyuan-moe", LLM_CHAT_TEMPLATE_HUNYUAN_MOE },
{ "gpt-oss", LLM_CHAT_TEMPLATE_OPENAI_MOE },
{ "hunyuan-dense", LLM_CHAT_TEMPLATE_HUNYUAN_DENSE },
+ { "hunyuan-ocr", LLM_CHAT_TEMPLATE_HUNYUAN_OCR },
{ "kimi-k2", LLM_CHAT_TEMPLATE_KIMI_K2 },
{ "seed_oss", LLM_CHAT_TEMPLATE_SEED_OSS },
{ "grok-2", LLM_CHAT_TEMPLATE_GROK_2 },
return LLM_CHAT_TEMPLATE_HUNYUAN_MOE;
} else if (tmpl_contains("<|start|>") && tmpl_contains("<|channel|>")) {
return LLM_CHAT_TEMPLATE_OPENAI_MOE;
+ } else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_begin▁of▁sentence|>")) {
+ return LLM_CHAT_TEMPLATE_HUNYUAN_OCR;
} else if (tmpl_contains("<|hy_Assistant|>") && tmpl_contains("<|hy_place▁holder▁no▁3|>")) {
return LLM_CHAT_TEMPLATE_HUNYUAN_DENSE;
} else if (tmpl_contains("<|im_assistant|>assistant<|im_middle|>")) {
ss << "<|hy_User|>" << chat[i]->content << "<|hy_Assistant|>";
}
}
+ } else if (tmpl == LLM_CHAT_TEMPLATE_HUNYUAN_OCR) {
+ // tencent/HunyuanOCR
+ ss << "<|hy_begin▁of▁sentence|>";
+ for (size_t i = 0; i < chat.size(); i++) {
+ std::string role(chat[i]->role);
+ if (i == 0 && role == "system") {
+ ss << chat[i]->content << "<|hy_place▁holder▁no▁3|>";
+ continue;
+ }
+
+ if (role == "user") {
+ ss << chat[i]->content << "<|hy_User|>";
+ } else if (role == "assistant") {
+ ss << chat[i]->content << "<|hy_Assistant|>";
+ }
+ }
} else if (tmpl == LLM_CHAT_TEMPLATE_KIMI_K2) {
// moonshotai/Kimi-K2-Instruct
for (auto message : chat) {
LLM_CHAT_TEMPLATE_HUNYUAN_MOE,
LLM_CHAT_TEMPLATE_OPENAI_MOE,
LLM_CHAT_TEMPLATE_HUNYUAN_DENSE,
+ LLM_CHAT_TEMPLATE_HUNYUAN_OCR,
LLM_CHAT_TEMPLATE_KIMI_K2,
LLM_CHAT_TEMPLATE_SEED_OSS,
LLM_CHAT_TEMPLATE_GROK_2,
models/conformer.cpp
models/gemma4v.cpp
models/glm4v.cpp
+ models/hunyuanocr.cpp
models/internvl.cpp
models/kimivl.cpp
models/kimik25.cpp
#define TN_TOK_BOI "v.boi"
#define TN_TOK_EOI "v.eoi"
+// hunyuanocr
+#define TN_MM_PRE_NORM "mm.pre_norm.%s"
+#define TN_TOK_IMG_BEGIN "mm.image_begin"
+#define TN_TOK_IMG_END "mm.image_end"
+
// deepseek-ocr
#define TN_SAM_POS_EMBD "v.sam.pos_embd.%s"
#define TN_SAM_PATCH_EMBD "v.sam.patch_embd.%s"
PROJECTOR_TYPE_YOUTUVL,
PROJECTOR_TYPE_KIMIK25,
PROJECTOR_TYPE_NEMOTRON_V2_VL,
+ PROJECTOR_TYPE_HUNYUANOCR,
PROJECTOR_TYPE_UNKNOWN,
};
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
{ PROJECTOR_TYPE_KIMIK25, "kimik25"},
{ PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
+ { PROJECTOR_TYPE_HUNYUANOCR, "hunyuanocr"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {
// MINICPMV projection
ggml_tensor * mm_model_pos_embed_k = nullptr;
ggml_tensor * mm_model_query = nullptr;
- ggml_tensor * mm_model_proj = nullptr;
+ ggml_tensor * mm_model_proj = nullptr;
+ ggml_tensor * mm_model_proj_b = nullptr;
ggml_tensor * mm_model_kv_proj = nullptr;
ggml_tensor * mm_model_attn_q_w = nullptr;
ggml_tensor * mm_model_attn_q_b = nullptr;
ggml_tensor * mm_boi = nullptr;
ggml_tensor * mm_eoi = nullptr;
+ // hunyuanocr perceiver
+ ggml_tensor * mm_pre_norm_w = nullptr;
+ ggml_tensor * mm_img_begin = nullptr;
+ ggml_tensor * mm_img_end = nullptr;
+
// deepseek ocr sam
ggml_tensor * patch_embed_proj_w = nullptr;
ggml_tensor * patch_embed_proj_b = nullptr;
{
builder = std::make_unique<clip_graph_cogvlm>(ctx, img);
} break;
+ case PROJECTOR_TYPE_HUNYUANOCR:
+ {
+ builder = std::make_unique<clip_graph_hunyuanocr>(ctx, img);
+ } break;
case PROJECTOR_TYPE_MLP:
case PROJECTOR_TYPE_MLP_NORM:
case PROJECTOR_TYPE_LDP:
get_u32(KEY_SAM_N_EMBD, hparams.sam_n_embd, true);
get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
} break;
+ case PROJECTOR_TYPE_HUNYUANOCR:
+ {
+ hparams.n_merge = 2;
+ get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
+ get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
+ get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
+ hparams.set_warmup_n_tokens(28*28);
+ } break;
case PROJECTOR_TYPE_LFM2A:
{
// audio preprocessing params
model.mm_boi = get_tensor(TN_TOK_BOI);
model.mm_eoi = get_tensor(TN_TOK_EOI);
} break;
+ case PROJECTOR_TYPE_HUNYUANOCR:
+ {
+ // proj.0 -> mm.0 (conv1), proj.2 -> mm.2 (conv2), mlp -> mm.model.fc (linear)
+ model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
+ model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
+ model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 2, "weight"));
+ model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 2, "bias"));
+ model.mm_model_proj = get_tensor(string_format(TN_MM_PROJECTOR, "weight"));
+ model.mm_model_proj_b = get_tensor(string_format(TN_MM_PROJECTOR, "bias"));
+ model.mm_pre_norm_w = get_tensor(string_format(TN_MM_PRE_NORM, "weight"));
+ model.mm_post_norm_w = get_tensor(string_format(TN_MM_POST_NORM, "weight"));
+ model.mm_img_begin = get_tensor(TN_TOK_IMG_BEGIN);
+ model.mm_img_end = get_tensor(TN_TOK_IMG_END);
+ model.image_newline = get_tensor(TN_IMAGE_NEWLINE);
+ model.view_seperator = get_tensor(TN_IMAGE_SEPERATOR, false);
+ } break;
case PROJECTOR_TYPE_JANUS_PRO:
{
model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
case PROJECTOR_TYPE_QWEN3VL:
case PROJECTOR_TYPE_GLM4V:
case PROJECTOR_TYPE_PADDLEOCR:
+ case PROJECTOR_TYPE_HUNYUANOCR:
case PROJECTOR_TYPE_YOUTUVL:
return (img->nx / params.patch_size) / 2;
default:
int h = static_cast<int>(std::sqrt(static_cast<float>(n_patches)));
n_patches = h * (h + 1) + 1;
} break;
+ case PROJECTOR_TYPE_HUNYUANOCR:
+ {
+ int merge = ctx->model.hparams.n_merge;
+ int ow = (img->nx / patch_size) / merge;
+ int oh = (img->ny / patch_size) / merge;
+ n_patches = (ow + 1) * oh + 2;
+ } break;
case PROJECTOR_TYPE_LFM2A:
{
n_patches = ((((img->nx + 1) / 2) + 1) / 2 + 1) / 2;
case PROJECTOR_TYPE_JANUS_PRO:
case PROJECTOR_TYPE_PHI4:
case PROJECTOR_TYPE_COGVLM:
+ case PROJECTOR_TYPE_HUNYUANOCR:
{
// do nothing
} break;
case PROJECTOR_TYPE_PADDLEOCR:
case PROJECTOR_TYPE_KIMIK25:
return ctx->model.mm_2_w->ne[1];
+ case PROJECTOR_TYPE_HUNYUANOCR:
+ return ctx->model.mm_model_proj->ne[1];
case PROJECTOR_TYPE_COGVLM:
return ctx->model.mm_4h_to_h_w->ne[1];
case PROJECTOR_TYPE_DEEPSEEKOCR:
--- /dev/null
+#include "models.h"
+
+// Builds the HunyuanOCR vision graph and returns it (gf).
+// Pipeline, in order: resized position embeddings + ViT -> RMS pre-norm -> conv2d, GELU, conv2d
+// -> one newline embedding appended to every output row -> linear projection
+// -> image-begin / image-end embeddings concatenated before / after -> RMS post-norm.
+ggml_cgraph * clip_graph_hunyuanocr::build() {
+ const int merge = hparams.n_merge;
+ const int pw = n_patches_x;
+ const int ph = n_patches_y;
+
+ // position embeddings are resized with bilinear scaling before being handed to the ViT
+ ggml_tensor * pos_embd = resize_position_embeddings(GGML_SCALE_MODE_BILINEAR);
+
+ ggml_tensor * inp = build_inp();
+ ggml_tensor * cur = build_vit(inp, n_patches, NORM_TYPE_NORMAL, hparams.ffn_op, pos_embd, nullptr);
+
+ // perceiver projector
+ // RMS norm with weight only (no bias tensor is passed)
+ cur = build_norm(cur, model.mm_pre_norm_w, nullptr, NORM_TYPE_RMS, eps, -1);
+
+ // [C, W*H] -> [W, H, C] for conv2d
+ cur = ggml_reshape_3d(ctx0, cur, n_embd, pw, ph);
+ cur = ggml_permute(ctx0, cur, 2, 0, 1, 3);
+ cur = ggml_cont(ctx0, cur);
+
+ // Conv2d(1152->2304, k=2, s=2) + GELU + Conv2d(2304->4608, k=1, s=1)
+ // trailing arguments are stride (merge, merge), padding (0, 0), dilation (1, 1)
+ cur = ggml_conv_2d(ctx0, model.mm_0_w, cur, merge, merge, 0, 0, 1, 1);
+ if (model.mm_0_b) {
+ // bias is viewed as [1, 1, OC] so the add broadcasts over both spatial dims
+ cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.mm_0_b, 1, 1, model.mm_0_b->ne[0]));
+ }
+ cur = ggml_gelu(ctx0, cur);
+ cur = ggml_conv_2d(ctx0, model.mm_1_w, cur, 1, 1, 0, 0, 1, 1);
+ if (model.mm_1_b) {
+ cur = ggml_add(ctx0, cur, ggml_reshape_3d(ctx0, model.mm_1_b, 1, 1, model.mm_1_b->ne[0]));
+ }
+
+ // integer division: NOTE(review) presumably pw and ph are always multiples of merge,
+ // otherwise ow/oh would not match the conv output size used below - confirm against the preprocessor
+ const int ow = pw / merge;
+ const int oh = ph / merge;
+ const int idim = (int)cur->ne[2]; // OC = 4608
+
+ // append newline along W (dim 0)
+ // the single newline vector is repeated oh times, i.e. once per output row
+ ggml_tensor * nl = ggml_reshape_4d(ctx0, model.image_newline, 1, 1, idim, 1);
+ nl = ggml_repeat_4d(ctx0, nl, 1, oh, idim, 1);
+ cur = ggml_concat(ctx0, cur, nl, 0);
+
+ // [OW+1, OH, OC] -> [OC, (OW+1)*OH]
+ cur = ggml_permute(ctx0, cur, 1, 2, 0, 3);
+ cur = ggml_cont_2d(ctx0, cur, idim, (ow + 1) * oh);
+
+ // project to LLM hidden size
+ cur = build_mm(model.mm_model_proj, cur);
+ if (model.mm_model_proj_b) {
+ cur = ggml_add(ctx0, cur, model.mm_model_proj_b);
+ }
+
+ // wrap with begin/end tokens
+ // each is viewed as a single column and concatenated along dim 1, adding two columns in total
+ cur = ggml_concat(ctx0, ggml_reshape_2d(ctx0, model.mm_img_begin, model.mm_img_begin->ne[0], 1), cur, 1);
+ cur = ggml_concat(ctx0, cur, ggml_reshape_2d(ctx0, model.mm_img_end, model.mm_img_end->ne[0], 1), 1);
+
+ // the post-norm runs after the begin/end columns are attached, so it is applied to them as well
+ cur = build_norm(cur, model.mm_post_norm_w, nullptr, NORM_TYPE_RMS, eps, -1);
+
+ ggml_build_forward_expand(gf, cur);
+ return gf;
+}
ggml_cgraph * build() override;
};
+// Graph builder selected for PROJECTOR_TYPE_HUNYUANOCR; forwards ctx and img to the
+// clip_graph base constructor and overrides build().
+struct clip_graph_hunyuanocr : clip_graph {
+ clip_graph_hunyuanocr(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+ ggml_cgraph * build() override;
+};
+
struct clip_graph_mobilenetv5 : clip_graph {
clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
img_end = "\n"; // prevent empty batch on llama-server
image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
} break;
+ case PROJECTOR_TYPE_HUNYUANOCR:
+ {
+ // note: these use fullwidth | (U+FF5C) and ▁ (U+2581) to match the tokenizer vocabulary
+ img_beg = "<|hy_place▁holder▁no▁100|>";
+ img_end = "<|hy_place▁holder▁no▁101|>";
+ image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
+ } break;
default:
throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
}