* Add Gemma3nVisionModel - MobileNetV5 vision encoder converter to convert_hf_to_gguf.py. Add gemma3n to vision projectors in gguf-py/gguf/constants.py.
* Add mobilenetv5 impl
* Fix comments, remove unused vars
* Fix permute and remove transpose of projection weights
* Fix comments, remove debugging prints from hf_to_gguf
* 1. Hard-code image_mean = 0 and image_std = 1
2. Use available tensor mapping logic
3. Remove redundant chat template replacement of soft tokens placeholder with media placeholder
* 1. Move mobilenetv5 helpers declarations to `clip_graph_mobilenetv5` struct and definitions to mobilenetv5.cpp
2. Remove unused `clip_is_gemma3n` func declarations and definitions
3. Remove redundant `rescale_image_u8_to_f32` func and use `normalize_image_u8_to_f32` with zero mean and unit std
4. Calculate n_patches using image_size / patch_size
* Remove obsolete comments
* - convert_hf_to_gguf.py & constants.py & tensor_mapping.py: Use explicit mapping: custom map for double-indexed blocks and tensor_mapping.py for the rest
- convert_hf_to_gguf.py: Unsqueeze Stem Bias and Layer scale tensors to correct shape while converting to gguf
- mobilenetv5.cpp: Remove explicit reshaping of Stem Bias and Layer scale which are now handled while converting to gguf, replace fprintf with LOG_*
- clip.cpp: Remove unused embedding and hard_emb_norm tensor loading
* - Rename tensors to v.conv..., v.blk..., v.msfa... to better align with already existing terminology
* Fix stem conv bias name
* Remove explicit handling of bias term for stem conv
* - Change order of addition in "project_per_layer_inputs" to support broadcasting of vision inp_per_layer
- Simplify the vision embeddings path of "get_per_layer_inputs" to output [n_embd_altup, n_layer, 1], broadcastable
* clean up conversion script
* fix code style
* also preserve audio tensors
* trailing space
* split arch A and V
* rm unused gemma3 func
* fix alignment
---------
Co-authored-by: Xuan Son Nguyen <redacted>
return ()
def prepare_tensors(self):
- max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
+ # Handle empty tensor_map for models with block_count=0 (like MobileNetV5)
+ if self.tensor_map.mapping:
+ max_name_len = max(len(s) for _, s in self.tensor_map.mapping.values()) + len(".weight,")
+ else:
+ max_name_len = len("vision_encoder.weight,") # Default reasonable length
for name, data_torch in chain(self.generate_extra_tensors(), self.get_tensors()):
# we don't need these
return [] # skip other tensors
+class ConformerAudioModel(MmprojModel):
+ _batch_norm_tensors: list[dict[str, Tensor]] | None = None
+
+ @staticmethod
+ def is_audio_tensor(name: str):
+ return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
+
+ def tensor_force_quant(self, name, new_name, bid, n_dims):
+ if ConformerAudioModel.is_audio_tensor(name):
+            if (".conv" in name or "_conv" in name) and ".weight" in name:
+ return gguf.GGMLQuantizationType.F32
+ return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # fold running_mean, running_var and eps into weight and bias for batch_norm
+ if "batch_norm" in name:
+ if self._batch_norm_tensors is None:
+ self._batch_norm_tensors = [{} for _ in range(self.block_count)]
+ assert bid is not None
+ self._batch_norm_tensors[bid][name] = data_torch
+
+ if len(self._batch_norm_tensors[bid]) < 5:
+ return []
+
+ weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"]
+ bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"]
+ running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"]
+ running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"]
+ eps = 1e-5 # default value
+
+ a = weight / torch.sqrt(running_var + eps)
+ b = bias - running_mean * a
+ return [
+ (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a),
+ (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b),
+ ]
+
+ # reshape conv weights
+ if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
+ data_torch = data_torch[:, None, None]
+ if "conv.depthwise_conv" in name and name.endswith(".weight"):
+ assert data_torch.shape[1] == 1
+ data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
+ if "conv.pointwise_conv" in name and name.endswith(".weight"):
+ assert data_torch.shape[2] == 1
+ data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])
+
+ return [(self.map_tensor_name(name), data_torch)]
+
+
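The batch-norm fold above replaces weight, bias, running_mean and running_var (plus eps) with a single affine pair (a, b). A minimal standalone sketch (not part of the patch) that checks the algebra against torch.nn.BatchNorm1d in eval mode, with arbitrary sizes:

    import torch

    bn = torch.nn.BatchNorm1d(8).eval()
    bn.running_mean.uniform_(-1.0, 1.0)
    bn.running_var.uniform_(0.5, 2.0)
    bn.weight.data.uniform_(0.5, 1.5)
    bn.bias.data.uniform_(-0.5, 0.5)

    # same fold as ConformerAudioModel.modify_tensors (eps defaults to 1e-5)
    a = bn.weight / torch.sqrt(bn.running_var + bn.eps)
    b = bn.bias - bn.running_mean * a

    x = torch.randn(4, 8)
    assert torch.allclose(bn(x), a * x + b, atol=1e-5)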
@ModelBase.register("Gemma3nForConditionalGeneration")
+class Gemma3nVisionAudioModel(ConformerAudioModel):
+ has_audio_encoder = True
+ has_vision_encoder = True
+
+    # Double-indexed mapping for MobileNetV5 blocks (not supported by tensor_mapping.py)
+    # This is the only known model with this layout, so we keep the mapping here instead of in tensor_mapping.py
+ block_tensor_mapping = {
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_exp.weight": "v.blk.{bid}.{sid}.conv_exp.weight",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn1.weight": "v.blk.{bid}.{sid}.bn1.weight",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.conv_pwl.weight": "v.blk.{bid}.{sid}.conv_pwl.weight",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.bn2.weight": "v.blk.{bid}.{sid}.bn2.weight",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.conv.weight": "v.blk.{bid}.{sid}.dw_start.conv.weight",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_start.bn.weight": "v.blk.{bid}.{sid}.dw_start.bn.weight",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.conv.weight": "v.blk.{bid}.{sid}.dw_mid.conv.weight",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.dw_mid.bn.weight": "v.blk.{bid}.{sid}.dw_mid.bn.weight",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.conv.weight": "v.blk.{bid}.{sid}.pw_exp.conv.weight",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.bn.weight": "v.blk.{bid}.{sid}.pw_exp.bn.weight",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.conv.weight": "v.blk.{bid}.{sid}.pw_proj.conv.weight",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_proj.bn.weight": "v.blk.{bid}.{sid}.pw_proj.bn.weight",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.layer_scale.gamma": "v.blk.{bid}.{sid}.layer_scale.gamma",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.query.proj.weight": "v.blk.{bid}.{sid}.attn.query.proj.weight",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.proj.weight": "v.blk.{bid}.{sid}.attn.key.proj.weight",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.proj.weight": "v.blk.{bid}.{sid}.attn.value.proj.weight",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.output.proj.weight": "v.blk.{bid}.{sid}.attn.output.proj.weight",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.down_conv.weight": "v.blk.{bid}.{sid}.attn.key.down_conv.weight",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.key.norm.weight": "v.blk.{bid}.{sid}.attn.key.norm.weight",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.down_conv.weight": "v.blk.{bid}.{sid}.attn.value.down_conv.weight",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.attn.value.norm.weight": "v.blk.{bid}.{sid}.attn.value.norm.weight",
+ "model.vision_tower.timm_model.blocks.{bid}.{sid}.norm.weight": "v.blk.{bid}.{sid}.norm.weight",
+ }
+
+ def __init__(self, *args, **kwargs):
+ # Parent init will call find_hparam which now returns 0 for empty keys
+ super().__init__(*args, **kwargs)
+ assert self.hparams_vision is not None
+ self.hparams_vision["n_layers"] = 128 # fake value for audio encoder, vision encoder doesn't use it
+ self.hparams_vision["intermediate_size"] = self.hparams_vision.get("intermediate_size", 2048) * 4
+ self.hparams_vision["num_attention_heads"] = self.hparams_vision.get("num_attention_heads", 8)
+
+ # MobileNetV5 does not use image_mean/std
+        self.preprocessor_config["image_mean"] = [0.0, 0.0, 0.0]
+        self.preprocessor_config["image_std"] = [1.0, 1.0, 1.0]
+ self.hparams_vision["image_size"] = self.preprocessor_config.get(
+ "size", {"height": 768, "width": 768}
+ )["height"]
+
+ # Image sequence length (256 tokens = 16x16 for Gemma3n)
+ image_seq_length = self.preprocessor_config.get("image_seq_length", 256)
+ image_size = self.hparams_vision["image_size"]
+ self.hparams_vision["patch_size"] = image_size // image_seq_length
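+        # e.g. 768 // 256 = 3; clip.cpp then recovers n_patches = image_size / patch_size = 256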
+
+ # remap audio hparams
+ assert self.hparams_audio is not None
+ self.hparams_audio["n_layers"] = self.hparams_audio["conf_num_hidden_layers"]
+ self.hparams_audio["num_attention_heads"] = self.hparams_audio["conf_num_attention_heads"]
+ self.hparams_audio["feat_in"] = self.hparams_audio["input_feat_size"]
+ self.hparams_audio["intermediate_size"] = self.hparams_audio.get("intermediate_size", 6144)
+
+ def set_gguf_parameters(self):
+ super().set_gguf_parameters()
+
+ # vision params
+ self.gguf_writer.add_clip_vision_projector_type(gguf.VisionProjectorType.GEMMA3NV)
+ self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams.get("layer_norm_eps", 1e-6))
+
+ # audio params
+ assert self.hparams_audio is not None
+ self.gguf_writer.add_clip_audio_projector_type(gguf.VisionProjectorType.GEMMA3NA)
+ self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
+ self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
+
+ def tensor_force_quant(self, name, new_name, bid, n_dims):
+ # Force quantization settings for specific tensor types
+ if "input_projection" in name or "input_proj" in name:
+ return gguf.GGMLQuantizationType.F16
+ if ".embeddings." in name or "stem" in name:
+ return gguf.GGMLQuantizationType.F32
+ return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+ def custom_map(self, name: str) -> str:
+ """Parses names like model.vision_tower.timm_model.blocks.1.2.suffix and applies template mapping."""
+ parts = name.split(".")
+ # MobileNet blocks have at least 7 parts: model, vision_tower, timm_model, blocks, bid, sid, and suffix
+ if len(parts) >= 7:
+ bid, sid = parts[4], parts[5]
+ suffix = ".".join(parts[6:])
+ template = f"model.vision_tower.timm_model.blocks.{{bid}}.{{sid}}.{suffix}"
+ if template in self.block_tensor_mapping:
+ return self.block_tensor_mapping[template].format(bid=bid, sid=sid)
+
+ raise ValueError(f"Unknown name: {name}")
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        if ConformerAudioModel.is_audio_tensor(name):
+ name = name.replace("model.audio_tower.conformer.", "conformer.layers.")
+ return super().modify_tensors(data_torch, name, bid)
+
+ # Gemma3n uses
+ # - model.embed_vision.* for projection layers
+ # - model.vision_tower.* for vision encoder
+ # Skip non-vision tensors
+ if not (name.startswith("model.embed_vision.") or name.startswith("model.vision_tower.")):
+ return []
+
+ if name.startswith("model.vision_tower.timm_model.blocks."):
+ # Double-indexed block tensors through custom logic
+ new_name = self.custom_map(name)
+ else:
+            # Route non-repeating tensors (conv_stem, msfa, embedding, etc.) and anything not matched above through tensor_mapping.py
+ new_name = self.map_tensor_name(name)
+
+ if new_name.endswith("conv_stem.conv.bias") or new_name.endswith("layer_scale.gamma"):
+ data_torch = data_torch.unsqueeze(0).unsqueeze(-1).unsqueeze(-1) # [1, C, 1, 1]
+
+ return [(new_name, data_torch)]
+
+
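For reference, a self-contained sketch (illustrative only, mirroring custom_map above) showing how one double-indexed timm block name resolves:

    BLOCK_TENSOR_MAPPING = {
        "model.vision_tower.timm_model.blocks.{bid}.{sid}.pw_exp.conv.weight":
            "v.blk.{bid}.{sid}.pw_exp.conv.weight",
    }

    def custom_map(name: str) -> str:
        parts = name.split(".")
        # model / vision_tower / timm_model / blocks / bid / sid / suffix...
        if len(parts) >= 7:
            bid, sid = parts[4], parts[5]
            suffix = ".".join(parts[6:])
            template = f"model.vision_tower.timm_model.blocks.{{bid}}.{{sid}}.{suffix}"
            if template in BLOCK_TENSOR_MAPPING:
                return BLOCK_TENSOR_MAPPING[template].format(bid=bid, sid=sid)
        raise ValueError(f"Unknown name: {name}")

    assert custom_map("model.vision_tower.timm_model.blocks.1.2.pw_exp.conv.weight") == "v.blk.1.2.pw_exp.conv.weight"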
+@ModelBase.register("Gemma3nForCausalLM", "Gemma3nForConditionalGeneration")
class Gemma3NModel(Gemma3Model):
model_arch = gguf.MODEL_ARCH.GEMMA3N
norm_shift = 0.0 # same value with Gemma3p5RMSNorm scale_shift on python code
]
def set_vocab(self):
+ # For Gemma3n multimodal models, we need the FULL vocab_size (262400)
+ # which includes special tokens from 262144-262399 for vision/audio.
+ # The vocab_size_per_layer_input (262144) is only the embedding size per layer.
+ # Temporarily override the hparams lookup order to prioritize vocab_size.
+
+ # Store original vocab_size_per_layer_input if it exists
+ vocab_size_per_layer_input = self.hparams.get("vocab_size_per_layer_input")
+
+ # Temporarily remove vocab_size_per_layer_input to force using vocab_size
+ if vocab_size_per_layer_input is not None:
+ del self.hparams["vocab_size_per_layer_input"]
+
+ # Call parent set_vocab which will now use vocab_size (262400)
super().set_vocab()
+ # Restore vocab_size_per_layer_input for later use
+ if vocab_size_per_layer_input is not None:
+ self.hparams["vocab_size_per_layer_input"] = vocab_size_per_layer_input
+
def set_gguf_parameters(self):
super().set_gguf_parameters()
self.gguf_writer.add_altup_active_idx(self.hparams["altup_active_idx"])
if "language_model." not in name:
return [] # skip non-language model tensors
+ # Pad token embeddings for vision/audio special tokens (262144-262399)
+ if "embed_tokens.weight" in name or "embed_tokens_per_layer" in name:
+ # Move to CPU to avoid meta device issues during padding
+ data_torch = data_torch.to(device="cpu")
+
+ vocab_size = self.hparams.get("vocab_size", 262400)
+ current_size = data_torch.shape[0] # First dimension is vocab_size
+
+ if current_size < vocab_size:
+ # Pad with zeros for vision/audio tokens (they get embeddings from vision tower)
+ padding_size = vocab_size - current_size
+ tensor_type = "per-layer embeddings" if "per_layer" in name else "token embeddings"
+ logger.info(f"Padding {tensor_type} shape {list(data_torch.shape)} from {current_size} to {vocab_size} (adding {padding_size} vision/audio token slots)")
+
+ # Create padding with zeros (vision tokens won't use these embeddings)
+ padding = torch.zeros((padding_size, data_torch.shape[1]), dtype=data_torch.dtype, device=data_torch.device)
+ data_torch = torch.cat([data_torch, padding], dim=0)
+
+ # Continue with normal processing
+ name = name.replace("language_model.", "")
+ return [(self.map_tensor_name(name), data_torch)]
+
if "altup_unembed_projections" in name:
data_torch = data_torch.to(device="cpu")
+ # altup_unembed matrices are [hidden_size, hidden_size], NOT vocab-based
+ # They should NOT be padded
if ".0." in name:
self._altup_unembd[0] = data_torch
elif ".1." in name:
self._add_feed_forward_length()
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
- if self._is_vision_tensor(name) or self._is_audio_tensor(name):
+ if self._is_vision_tensor(name) or ConformerAudioModel.is_audio_tensor(name):
# skip multimodal tensors
return []
def _is_vision_tensor(self, name: str) -> bool:
return "vision_tower" in name or "multi_modal_projector" in name
- def _is_audio_tensor(self, name: str):
- return any(p in name for p in ["audio", "codebook", "conformer", "depth_embedding", "depthformer", "depth_linear"])
-
@ModelBase.register("Lfm2Model")
class LFM2ColBertModel(LFM2Model):
@ModelBase.register("Lfm2AudioForConditionalGeneration")
-class LFM2AudioModel(MmprojModel):
+class LFM2AudioModel(ConformerAudioModel):
has_vision_encoder = False
has_audio_encoder = True
model_name = "Lfm2AudioEncoder"
- _batch_norm_tensors: list[dict[str, Tensor]] | None = None
-
def get_audio_config(self) -> dict[str, Any] | None:
return self.global_config.get("encoder")
self.gguf_writer.add_audio_num_mel_bins(self.hparams_audio["feat_in"])
self.gguf_writer.add_audio_attention_layernorm_eps(1e-5)
- def tensor_force_quant(self, name, new_name, bid, n_dims):
- if ".conv" in name and ".weight" in name:
- return gguf.GGMLQuantizationType.F32
- return super().tensor_force_quant(name, new_name, bid, n_dims)
-
- def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ def modify_tensors(self, data_torch, name, bid):
# skip language model tensors
if name.startswith("lfm."):
return []
if any(p in name for p in ["codebook_offsets", "depth_embeddings", "depth_linear", "depthformer"]):
return []
- # fold running_mean, running_var and eps into weight and bias for batch_norm
- if "batch_norm" in name:
- if self._batch_norm_tensors is None:
- self._batch_norm_tensors = [{} for _ in range(self.block_count)]
- assert bid is not None
- self._batch_norm_tensors[bid][name] = data_torch
-
- if len(self._batch_norm_tensors[bid]) < 5:
- return []
-
- weight = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.weight"]
- bias = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.bias"]
- running_mean = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_mean"]
- running_var = self._batch_norm_tensors[bid][f"conformer.layers.{bid}.conv.batch_norm.running_var"]
- eps = 1e-5 # default value
-
- a = weight / torch.sqrt(running_var + eps)
- b = bias - running_mean * a
- return [
- (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.weight"), a),
- (self.map_tensor_name(f"conformer.layers.{bid}.conv.batch_norm.bias"), b),
- ]
-
- # reshape conv weights
- if name.startswith("conformer.pre_encode.conv.") and name.endswith(".bias"):
- data_torch = data_torch[:, None, None]
- if "conv.depthwise_conv" in name and name.endswith(".weight"):
- assert data_torch.shape[1] == 1
- data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[2])
- if "conv.pointwise_conv" in name and name.endswith(".weight"):
- assert data_torch.shape[2] == 1
- data_torch = data_torch.reshape(data_torch.shape[0], data_torch.shape[1])
-
- return [(self.map_tensor_name(name), data_torch)]
+ return super().modify_tensors(data_torch, name, bid)
@ModelBase.register("SmallThinkerForCausalLM")
DATASETS = "imatrix.datasets"
class Clip:
- PROJECTOR_TYPE = "clip.projector_type"
- HAS_VISION_ENCODER = "clip.has_vision_encoder"
- HAS_AUDIO_ENCODER = "clip.has_audio_encoder"
- HAS_LLAVA_PROJECTOR = "clip.has_llava_projector"
+ PROJECTOR_TYPE = "clip.projector_type"
+ HAS_VISION_ENCODER = "clip.has_vision_encoder"
+ HAS_AUDIO_ENCODER = "clip.has_audio_encoder"
+ HAS_LLAVA_PROJECTOR = "clip.has_llava_projector"
class ClipVision:
+ PROJECTOR_TYPE = "clip.vision.projector_type" # for mixed modality models
IMAGE_SIZE = "clip.vision.image_size"
PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size"
PATCH_SIZE = "clip.vision.patch_size"
SCALE_FACTOR = "clip.vision.projector.scale_factor"
class ClipAudio:
+ PROJECTOR_TYPE = "clip.audio.projector_type" # for mixed modality models
NUM_MEL_BINS = "clip.audio.num_mel_bins"
EMBEDDING_LENGTH = "clip.audio.embedding_length"
FEED_FORWARD_LENGTH = "clip.audio.feed_forward_length"
RESAMPLER = auto()
GLM_EDGE = auto()
MERGER = auto()
+ GEMMA3N = auto()
GEMMA3 = auto()
QWEN3VL = auto()
COGVLM = auto()
V_MM_INP_NORM = auto()
V_MM_INP_PROJ = auto() # gemma3
V_MM_SOFT_EMB_NORM = auto() # gemma3
+ V_MM_EMBEDDING = auto() # gemma3n
+ V_MM_HARD_EMB_NORM = auto() # gemma3n
+ V_ENC_CONV_STEM = auto() # gemma3n
+ V_ENC_CONV_STEM_NORM = auto() # gemma3n
+ V_ENC_MSFA_EXP = auto() # gemma3n
+ V_ENC_MSFA_EXP_NORM = auto() # gemma3n
+ V_ENC_MSFA_PROJ = auto() # gemma3n
+ V_ENC_MSFA_PROJ_NORM = auto() # gemma3n
+ V_ENC_MSFA_NORM = auto() # gemma3n
V_RESMPL_POS_EMBD_K = auto() # minicpmv
V_RESMPL_ATTN_Q = auto() # minicpmv
V_RESMPL_ATTN_K = auto() # minicpmv
V_TOK_BOI = auto() # cogvlm
V_TOK_EOI = auto() # cogvlm
# audio (mtmd)
- A_ENC_EMBD_POS = auto()
- A_ENC_EMBD_NORM = auto()
- A_ENC_EMBD_TO_LOGITS = auto()
- A_ENC_CONV1D = auto()
- A_PRE_NORM = auto()
- A_POST_NORM = auto()
- A_ENC_ATTN_Q = auto()
- A_ENC_ATTN_K = auto()
- A_ENC_ATTN_V = auto()
- A_ENC_INPUT_NORM = auto()
- A_ENC_OUTPUT = auto()
- A_ENC_OUTPUT_NORM = auto()
- A_ENC_FFN_UP = auto()
- A_ENC_FFN_NORM = auto()
- A_ENC_FFN_GATE = auto()
- A_ENC_FFN_DOWN = auto()
- A_ENC_FFN_UP_1 = auto()
- A_ENC_FFN_NORM_1 = auto()
- A_ENC_FFN_GATE_1 = auto()
- A_ENC_FFN_DOWN_1 = auto()
- A_MMPROJ = auto()
- A_MMPROJ_FC = auto()
- A_MM_NORM_PRE = auto()
- A_MM_NORM_MID = auto()
+ A_ENC_EMBD_POS = auto()
+ A_ENC_EMBD_NORM = auto()
+ A_ENC_EMBD_TO_LOGITS = auto() # lfm2
+ A_ENC_CONV1D = auto()
+ A_ENC_CONV1D_NORM = auto() # gemma3n
+ A_PRE_NORM = auto()
+ A_POST_NORM = auto()
+ A_ENC_LAYER_PRE_NORM = auto() # gemma3n
+ A_ENC_ATTN_Q = auto()
+ A_ENC_ATTN_K = auto()
+ A_ENC_ATTN_V = auto()
+ A_ENC_PER_DIM_SCALE = auto() # gemma3n
+ A_ENC_INPUT_NORM = auto()
+ A_ENC_OUTPUT = auto()
+ A_ENC_OUTPUT_NORM = auto()
+ A_ENC_FFN_UP = auto()
+ A_ENC_FFN_NORM = auto()
+ A_ENC_FFN_POST_NORM = auto() # gemma3n
+ A_ENC_FFN_SCALE = auto() # gemma3n
+ A_ENC_FFN_GATE = auto()
+ A_ENC_FFN_DOWN = auto()
+ A_ENC_FFN_UP_1 = auto() # lfm2, gemma3n
+ A_ENC_FFN_NORM_1 = auto() # lfm2, gemma3n (pre-norm)
+ A_ENC_FFN_POST_NORM_1 = auto() # gemma3n
+ A_ENC_FFN_SCALE_1 = auto() # gemma3n
+ A_ENC_FFN_GATE_1 = auto() # lfm2, gemma3n
+ A_ENC_FFN_DOWN_1 = auto() # lfm2, gemma3n
+ A_MMPROJ = auto()
+ A_MMPROJ_FC = auto()
+ A_MM_NORM_PRE = auto()
+ A_MM_NORM_MID = auto()
+ A_MM_EMBEDDING = auto() # gemma3n
+ A_MM_HARD_EMB_NORM = auto() # gemma3n
+ A_MM_SOFT_EMB_NORM = auto() # gemma3n
+ A_MM_INP_PROJ = auto() # gemma3n
# nextn/mtp
NEXTN_EH_PROJ = auto()
NEXTN_EMBED_TOKENS = auto()
MODEL_TENSOR.V_MM_POST_NORM: "mm.post_norm",
MODEL_TENSOR.V_MM_INP_PROJ: "mm.input_projection",
MODEL_TENSOR.V_MM_INP_NORM: "mm.input_norm",
- MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm",
+ MODEL_TENSOR.V_MM_SOFT_EMB_NORM: "mm.soft_emb_norm", # gemma3n
+ MODEL_TENSOR.V_MM_EMBEDDING: "mm.embedding", # gemma3n
+ MODEL_TENSOR.V_MM_HARD_EMB_NORM: "mm.hard_emb_norm", # gemma3n
+ MODEL_TENSOR.V_ENC_CONV_STEM: "v.conv_stem.conv", # gemma3n
+ MODEL_TENSOR.V_ENC_CONV_STEM_NORM: "v.conv_stem.bn", # gemma3n
+ MODEL_TENSOR.V_ENC_MSFA_EXP: "v.msfa.ffn.pw_exp.conv", # gemma3n
+ MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: "v.msfa.ffn.pw_exp.bn", # gemma3n
+ MODEL_TENSOR.V_ENC_MSFA_PROJ: "v.msfa.ffn.pw_proj.conv", # gemma3n
+ MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: "v.msfa.ffn.pw_proj.bn", # gemma3n
+ MODEL_TENSOR.V_ENC_MSFA_NORM: "v.msfa.norm", # gemma3n
MODEL_TENSOR.V_RESMPL_POS_EMBD_K: "resampler.pos_embd_k",
MODEL_TENSOR.V_RESMPL_ATTN_Q: "resampler.attn.q",
MODEL_TENSOR.V_RESMPL_ATTN_K: "resampler.attn.k",
MODEL_TENSOR.A_ENC_EMBD_NORM: "a.position_embd_norm",
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS: "a.embd_to_logits",
MODEL_TENSOR.A_ENC_CONV1D: "a.conv1d.{bid}",
+ MODEL_TENSOR.A_ENC_CONV1D_NORM: "a.conv1d.{bid}.norm",
MODEL_TENSOR.A_PRE_NORM: "a.pre_ln",
MODEL_TENSOR.A_POST_NORM: "a.post_ln",
+ MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: "a.blk.{bid}.layer_pre_norm",
MODEL_TENSOR.A_ENC_ATTN_Q: "a.blk.{bid}.attn_q",
MODEL_TENSOR.A_ENC_ATTN_K: "a.blk.{bid}.attn_k",
MODEL_TENSOR.A_ENC_ATTN_V: "a.blk.{bid}.attn_v",
+ MODEL_TENSOR.A_ENC_PER_DIM_SCALE: "a.blk.{bid}.per_dim_scale",
MODEL_TENSOR.A_ENC_INPUT_NORM: "a.blk.{bid}.ln1",
MODEL_TENSOR.A_ENC_OUTPUT: "a.blk.{bid}.attn_out",
MODEL_TENSOR.A_ENC_OUTPUT_NORM: "a.blk.{bid}.ln2",
MODEL_TENSOR.A_ENC_FFN_NORM: "a.blk.{bid}.ffn_norm",
+ MODEL_TENSOR.A_ENC_FFN_POST_NORM: "a.blk.{bid}.ffn_post_norm",
+ MODEL_TENSOR.A_ENC_FFN_SCALE: "a.blk.{bid}.ffn_scale",
MODEL_TENSOR.A_ENC_FFN_UP: "a.blk.{bid}.ffn_up",
MODEL_TENSOR.A_ENC_FFN_GATE: "a.blk.{bid}.ffn_gate",
MODEL_TENSOR.A_ENC_FFN_DOWN: "a.blk.{bid}.ffn_down",
MODEL_TENSOR.A_ENC_FFN_NORM_1: "a.blk.{bid}.ffn_norm_1",
+ MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: "a.blk.{bid}.ffn_post_norm_1",
+ MODEL_TENSOR.A_ENC_FFN_SCALE_1: "a.blk.{bid}.ffn_scale_1",
MODEL_TENSOR.A_ENC_FFN_UP_1: "a.blk.{bid}.ffn_up_1",
MODEL_TENSOR.A_ENC_FFN_GATE_1: "a.blk.{bid}.ffn_gate_1",
MODEL_TENSOR.A_ENC_FFN_DOWN_1: "a.blk.{bid}.ffn_down_1",
MODEL_TENSOR.A_MMPROJ_FC: "mm.a.fc",
MODEL_TENSOR.A_MM_NORM_PRE: "mm.a.norm_pre",
MODEL_TENSOR.A_MM_NORM_MID: "mm.a.norm_mid",
+ MODEL_TENSOR.A_MM_INP_PROJ: "mm.a.input_projection", # gemma3n
+ MODEL_TENSOR.A_MM_SOFT_EMB_NORM: "mm.a.soft_emb_norm", # gemma3n
+ MODEL_TENSOR.A_MM_EMBEDDING: "mm.a.embedding", # gemma3n
+ MODEL_TENSOR.A_MM_HARD_EMB_NORM: "mm.a.hard_emb_norm", # gemma3n
# lfm2 audio
MODEL_TENSOR.A_ENC_NORM_CONV: "a.blk.{bid}.norm_conv",
MODEL_TENSOR.A_ENC_LINEAR_POS: "a.blk.{bid}.linear_pos",
MODEL_TENSOR.V_MM_INP_PROJ,
MODEL_TENSOR.V_MM_INP_NORM,
MODEL_TENSOR.V_MM_SOFT_EMB_NORM,
+ MODEL_TENSOR.V_MM_EMBEDDING,
+ MODEL_TENSOR.V_MM_HARD_EMB_NORM,
+ MODEL_TENSOR.V_ENC_CONV_STEM,
+ MODEL_TENSOR.V_ENC_CONV_STEM_NORM,
+ MODEL_TENSOR.V_ENC_MSFA_EXP,
+ MODEL_TENSOR.V_ENC_MSFA_EXP_NORM,
+ MODEL_TENSOR.V_ENC_MSFA_PROJ,
+ MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM,
+ MODEL_TENSOR.V_ENC_MSFA_NORM,
MODEL_TENSOR.V_RESMPL_POS_EMBD_K,
MODEL_TENSOR.V_RESMPL_ATTN_Q,
MODEL_TENSOR.V_RESMPL_ATTN_K,
MODEL_TENSOR.A_ENC_EMBD_NORM,
MODEL_TENSOR.A_ENC_EMBD_TO_LOGITS,
MODEL_TENSOR.A_ENC_CONV1D,
+ MODEL_TENSOR.A_ENC_CONV1D_NORM,
MODEL_TENSOR.A_PRE_NORM,
MODEL_TENSOR.A_POST_NORM,
+ MODEL_TENSOR.A_ENC_LAYER_PRE_NORM,
MODEL_TENSOR.A_ENC_ATTN_Q,
MODEL_TENSOR.A_ENC_ATTN_K,
MODEL_TENSOR.A_ENC_ATTN_V,
+ MODEL_TENSOR.A_ENC_PER_DIM_SCALE,
MODEL_TENSOR.A_ENC_INPUT_NORM,
MODEL_TENSOR.A_ENC_OUTPUT,
MODEL_TENSOR.A_ENC_OUTPUT_NORM,
MODEL_TENSOR.A_ENC_FFN_NORM,
+ MODEL_TENSOR.A_ENC_FFN_POST_NORM,
+ MODEL_TENSOR.A_ENC_FFN_SCALE,
MODEL_TENSOR.A_ENC_FFN_UP,
MODEL_TENSOR.A_ENC_FFN_GATE,
MODEL_TENSOR.A_ENC_FFN_DOWN,
MODEL_TENSOR.A_ENC_FFN_NORM_1,
+ MODEL_TENSOR.A_ENC_FFN_POST_NORM_1,
+ MODEL_TENSOR.A_ENC_FFN_SCALE_1,
MODEL_TENSOR.A_ENC_FFN_UP_1,
MODEL_TENSOR.A_ENC_FFN_GATE_1,
MODEL_TENSOR.A_ENC_FFN_DOWN_1,
MODEL_TENSOR.A_ENC_CONV_NORM,
MODEL_TENSOR.A_ENC_CONV_PW1,
MODEL_TENSOR.A_ENC_CONV_PW2,
+ MODEL_TENSOR.A_MM_INP_PROJ,
+ MODEL_TENSOR.A_MM_SOFT_EMB_NORM,
+ MODEL_TENSOR.A_MM_EMBEDDING,
+ MODEL_TENSOR.A_MM_HARD_EMB_NORM,
],
MODEL_ARCH.LLAMA: [
MODEL_TENSOR.TOKEN_EMBD,
class VisionProjectorType:
GEMMA3 = "gemma3"
+ GEMMA3NV = "gemma3nv"
+ GEMMA3NA = "gemma3na"
IDEFICS3 = "idefics3"
PIXTRAL = "pixtral"
LLAMA4 = "llama4"
def add_clip_projector_type(self, value: str) -> None:
self.add_string(Keys.Clip.PROJECTOR_TYPE, value)
+ def add_clip_vision_projector_type(self, value: str) -> None:
+ self.add_string(Keys.ClipVision.PROJECTOR_TYPE, value)
+
def add_vision_projection_dim(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.PROJECTION_DIM, value)
# audio models
+ def add_clip_audio_projector_type(self, value: str) -> None:
+ self.add_string(Keys.ClipAudio.PROJECTOR_TYPE, value)
+
def add_audio_projection_dim(self, value: int) -> None:
self.add_uint32(Keys.ClipAudio.PROJECTION_DIM, value)
MODEL_TENSOR.CONV1D: (
"backbone.embed", # roberta
),
+
+ MODEL_TENSOR.V_MM_EMBEDDING: (
+ "model.embed_vision.embedding", # gemma3n
+ ),
+ MODEL_TENSOR.V_MM_HARD_EMB_NORM: (
+ "model.embed_vision.hard_embedding_norm", # gemma3n
+ ),
+ MODEL_TENSOR.V_MM_INP_PROJ: (
+ "model.embed_vision.embedding_projection", # gemma3n
+ ),
+ MODEL_TENSOR.V_MM_SOFT_EMB_NORM: (
+ "model.embed_vision.soft_embedding_norm", # gemma3n
+ ),
+ MODEL_TENSOR.V_ENC_CONV_STEM: (
+ "model.vision_tower.timm_model.conv_stem.conv", # gemma3n
+ ),
+ MODEL_TENSOR.V_ENC_CONV_STEM_NORM: (
+ "model.vision_tower.timm_model.conv_stem.bn", # gemma3n
+ ),
+ MODEL_TENSOR.V_ENC_MSFA_EXP: (
+ "model.vision_tower.timm_model.msfa.ffn.pw_exp.conv", # gemma3n
+ ),
+ MODEL_TENSOR.V_ENC_MSFA_EXP_NORM: (
+ "model.vision_tower.timm_model.msfa.ffn.pw_exp.bn", # gemma3n
+ ),
+ MODEL_TENSOR.V_ENC_MSFA_PROJ: (
+ "model.vision_tower.timm_model.msfa.ffn.pw_proj.conv", # gemma3n
+ ),
+ MODEL_TENSOR.V_ENC_MSFA_PROJ_NORM: (
+ "model.vision_tower.timm_model.msfa.ffn.pw_proj.bn", # gemma3n
+ ),
+ MODEL_TENSOR.V_ENC_MSFA_NORM: (
+ "model.vision_tower.timm_model.msfa.norm", # gemma3n
+ ),
}
block_mappings_cfg: dict[MODEL_TENSOR, tuple[str, ...]] = {
MODEL_TENSOR.A_ENC_CONV1D: (
"audio_tower.conv{bid}", # ultravox
"conformer.pre_encode.conv.{bid}", # lfm2
+ "model.audio_tower.subsample_conv_projection.conv_{bid}.conv", # gemma3n
+ ),
+
+ MODEL_TENSOR.A_ENC_CONV1D_NORM: (
+ "model.audio_tower.subsample_conv_projection.conv_{bid}.norm", # gemma3n
),
MODEL_TENSOR.A_PRE_NORM: (),
MODEL_TENSOR.A_ENC_ATTN_Q: (
"audio_tower.layers.{bid}.self_attn.q_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_q", # lfm2
+ "conformer.layers.{bid}.attention.attn.q_proj", # gemma3n
),
MODEL_TENSOR.A_ENC_ATTN_K: (
"audio_tower.layers.{bid}.self_attn.k_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_k", # lfm2
+ "conformer.layers.{bid}.attention.attn.k_proj", # gemma3n
),
MODEL_TENSOR.A_ENC_ATTN_V: (
"audio_tower.layers.{bid}.self_attn.v_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_v", # lfm2
+ "conformer.layers.{bid}.attention.attn.v_proj", # gemma3n
+ ),
+
+ MODEL_TENSOR.A_ENC_PER_DIM_SCALE: (
+ "conformer.layers.{bid}.attention.attn.per_dim_scale", # gemma3n
+ ),
+
+ MODEL_TENSOR.A_ENC_LAYER_PRE_NORM: (
+ "conformer.layers.{bid}.norm", # gemma3n
),
MODEL_TENSOR.A_ENC_INPUT_NORM: (
"audio_tower.layers.{bid}.self_attn_layer_norm", # ultravox
"conformer.layers.{bid}.norm_self_att", # lfm2
+ "conformer.layers.{bid}.attention.pre_attn_norm", # gemma3n
),
MODEL_TENSOR.A_ENC_OUTPUT: (
"audio_tower.layers.{bid}.self_attn.out_proj", # ultravox
"conformer.layers.{bid}.self_attn.linear_out", # lfm2
+ "conformer.layers.{bid}.attention.post", # gemma3n
),
MODEL_TENSOR.A_ENC_OUTPUT_NORM: (
"audio_tower.layers.{bid}.final_layer_norm", # ultravox
"conformer.layers.{bid}.norm_out", # lfm2
+ "conformer.layers.{bid}.attention.post_norm", # gemma3n
),
MODEL_TENSOR.A_ENC_FFN_NORM: (
"conformer.layers.{bid}.norm_feed_forward1", # lfm2
+ "conformer.layers.{bid}.ffw_layer_start.pre_layer_norm", # gemma3n
+ ),
+
+ MODEL_TENSOR.A_ENC_FFN_POST_NORM: (
+ "conformer.layers.{bid}.ffw_layer_start.post_layer_norm", # gemma3n
+ ),
+
+ MODEL_TENSOR.A_ENC_FFN_SCALE: (
+ "conformer.layers.{bid}.ffw_layer_start.post_layer_scale", # gemma3n
),
MODEL_TENSOR.A_ENC_FFN_UP: (
"audio_tower.layers.{bid}.fc1", # ultravox
"conformer.layers.{bid}.feed_forward1.linear1", # lfm2
+ "conformer.layers.{bid}.ffw_layer_start.ffw_layer_1", # gemma3n
),
MODEL_TENSOR.A_ENC_FFN_GATE: (),
MODEL_TENSOR.A_ENC_FFN_DOWN: (
"audio_tower.layers.{bid}.fc2", # ultravox
"conformer.layers.{bid}.feed_forward1.linear2", # lfm2
+ "conformer.layers.{bid}.ffw_layer_start.ffw_layer_2", # gemma3n
),
MODEL_TENSOR.A_ENC_FFN_UP_1: (
"conformer.layers.{bid}.feed_forward2.linear1", # lfm2
+ "conformer.layers.{bid}.ffw_layer_end.ffw_layer_1", # gemma3n
),
MODEL_TENSOR.A_ENC_FFN_DOWN_1: (
"conformer.layers.{bid}.feed_forward2.linear2", # lfm2
+ "conformer.layers.{bid}.ffw_layer_end.ffw_layer_2", # gemma3n
),
MODEL_TENSOR.A_ENC_FFN_NORM_1: (
"conformer.layers.{bid}.norm_feed_forward2", # lfm2
+ "conformer.layers.{bid}.ffw_layer_end.pre_layer_norm", # gemma3n
+ ),
+
+ MODEL_TENSOR.A_ENC_FFN_POST_NORM_1: (
+ "conformer.layers.{bid}.ffw_layer_end.post_layer_norm", # gemma3n
+ ),
+
+ MODEL_TENSOR.A_ENC_FFN_SCALE_1: (
+ "conformer.layers.{bid}.ffw_layer_end.post_layer_scale", # gemma3n
),
MODEL_TENSOR.A_ENC_LINEAR_POS: (
"conformer.layers.{bid}.self_attn.linear_pos", # lfm2
+ "conformer.layers.{bid}.attention.attn.relative_position_embedding.pos_proj", # gemma3n
),
MODEL_TENSOR.A_ENC_POS_BIAS_U: (
MODEL_TENSOR.A_ENC_OUT: (
"conformer.pre_encode.out", # lfm2
+ "model.audio_tower.subsample_conv_projection.input_proj_linear", # gemma3n
),
# note: some tensors below has "audio." pseudo-prefix, to prevent conflicts with vision tensors
MODEL_TENSOR.A_ENC_CONV_DW: (
"conformer.layers.{bid}.conv.depthwise_conv", # lfm2
+ "conformer.layers.{bid}.lconv1d.depthwise_conv1d", # gemma3n
),
MODEL_TENSOR.A_ENC_CONV_NORM: (
"conformer.layers.{bid}.conv.batch_norm", # lfm2
+ "conformer.layers.{bid}.lconv1d.pre_layer_norm", # gemma3n
),
MODEL_TENSOR.A_ENC_CONV_PW1: (
"conformer.layers.{bid}.conv.pointwise_conv1", # lfm2
+ "conformer.layers.{bid}.lconv1d.linear_start", # gemma3n
),
MODEL_TENSOR.A_ENC_CONV_PW2: (
"conformer.layers.{bid}.conv.pointwise_conv2", # lfm2
+ "conformer.layers.{bid}.lconv1d.linear_end", # gemma3n
),
MODEL_TENSOR.A_ENC_NORM_CONV: (
"conformer.layers.{bid}.norm_conv", # lfm2
+ "conformer.layers.{bid}.lconv1d.conv_norm", # gemma3n
+ ),
+
+ MODEL_TENSOR.A_MM_EMBEDDING: (
+ "model.embed_audio.embedding", # gemma3n
+ ),
+ MODEL_TENSOR.A_MM_HARD_EMB_NORM: (
+ "model.embed_audio.hard_embedding_norm", # gemma3n
+ ),
+ MODEL_TENSOR.A_MM_INP_PROJ: (
+ "model.embed_audio.embedding_projection", # gemma3n
+ ),
+ MODEL_TENSOR.A_MM_SOFT_EMB_NORM: (
+ "model.embed_audio.soft_embedding_norm", # gemma3n
),
# NextN/MTP tensors for GLM4_MOE
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, n_tokens);
inp_per_layer = ggml_scale(ctx0, inp_per_layer, sqrtf((float) n_embd_altup));
cb(inp_per_layer, "inp_per_layer_selected", -1);
+ res->add_input(std::move(inp));
} else {
- GGML_ABORT("TODO: support embd input");
+ // Vision embedding path: use padding token (ID=0) embedding
+ const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer
+
+ // Extract and dequantize padding token embedding (column 0)
+ ggml_tensor * padding_q = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
+ ggml_tensor * padding_f32 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, embd_size);
+ inp_per_layer = ggml_cpy(ctx0, padding_q, padding_f32);
+
+ // Reshape to [n_embd_altup, n_layer, 1]
+ inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
+ cb(inp_per_layer, "inp_per_layer_vision", -1);
}
- res->add_input(std::move(inp));
return inp_per_layer;
}
-1); // [n_embd_altup, n_layer, n_tokens]
cb(per_layer_proj, "per_layer_proj", -1);
- inp_per_layer = ggml_add(ctx0, inp_per_layer, per_layer_proj);
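+        // ggml_add broadcasts its second operand over the first, so the broadcastable
+        // [n_embd_altup, n_layer, 1] vision tensor has to be passed second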
+ inp_per_layer = ggml_add(ctx0, per_layer_proj, inp_per_layer);
inp_per_layer = ggml_scale(ctx0, inp_per_layer, per_layer_input_scale);
cb(inp_per_layer, "inp_per_layer", -1);
models/qwen3vl.cpp
models/siglip.cpp
models/whisper-enc.cpp
+ models/mobilenetv5.cpp
models/youtuvl.cpp
)
#define TN_CONV_PW1 "%s.blk.%d.conv_pw1.%s"
#define TN_CONV_PW2 "%s.blk.%d.conv_pw2.%s"
+// mobilenetv5 (gemma3n) definitions
+#define TN_MNV5_STEM_CONV "v.conv_stem.conv.weight"
+#define TN_MNV5_STEM_BIAS "v.conv_stem.conv.bias"
+#define TN_MNV5_STEM_BN "v.conv_stem.bn.weight"
+
+// Stage 0 Block (Edge Residual)
+#define TN_MNV5_BLK_S0_EXP_W "v.blk.%d.%d.conv_exp.weight"
+#define TN_MNV5_BLK_S0_BN1_W "v.blk.%d.%d.bn1.weight"
+#define TN_MNV5_BLK_S0_PWL_W "v.blk.%d.%d.conv_pwl.weight"
+#define TN_MNV5_BLK_S0_BN2_W "v.blk.%d.%d.bn2.weight"
+
+// Stage 1+ Block (Universal Inverted Residual)
+#define TN_MNV5_BLK_DW_START_W "v.blk.%d.%d.dw_start.conv.weight"
+#define TN_MNV5_BLK_DW_START_BN "v.blk.%d.%d.dw_start.bn.weight"
+#define TN_MNV5_BLK_DW_MID_W "v.blk.%d.%d.dw_mid.conv.weight"
+#define TN_MNV5_BLK_DW_MID_BN "v.blk.%d.%d.dw_mid.bn.weight"
+#define TN_MNV5_BLK_PW_EXP_W "v.blk.%d.%d.pw_exp.conv.weight"
+#define TN_MNV5_BLK_PW_EXP_BN "v.blk.%d.%d.pw_exp.bn.weight"
+#define TN_MNV5_BLK_PW_PROJ_W "v.blk.%d.%d.pw_proj.conv.weight"
+#define TN_MNV5_BLK_PW_PROJ_BN "v.blk.%d.%d.pw_proj.bn.weight"
+#define TN_MNV5_BLK_LAYER_SCALE "v.blk.%d.%d.layer_scale.gamma"
+
+// Attention Components
+#define TN_MNV5_ATTN_Q_W "v.blk.%d.%d.attn.query.proj.weight"
+#define TN_MNV5_ATTN_K_W "v.blk.%d.%d.attn.key.proj.weight"
+#define TN_MNV5_ATTN_V_W "v.blk.%d.%d.attn.value.proj.weight"
+#define TN_MNV5_ATTN_O_W "v.blk.%d.%d.attn.output.proj.weight"
+#define TN_MNV5_ATTN_K_DW "v.blk.%d.%d.attn.key.down_conv.weight"
+#define TN_MNV5_ATTN_K_NORM "v.blk.%d.%d.attn.key.norm.weight"
+#define TN_MNV5_ATTN_V_DW "v.blk.%d.%d.attn.value.down_conv.weight"
+#define TN_MNV5_ATTN_V_NORM "v.blk.%d.%d.attn.value.norm.weight"
+#define TN_MNV5_ATTN_NORM "v.blk.%d.%d.norm.weight" // Block norm used in attn blocks
+
+// MSFA
+#define TN_MNV5_MSFA_FFN_EXP_W "v.msfa.ffn.pw_exp.conv.weight"
+#define TN_MNV5_MSFA_FFN_EXP_BN "v.msfa.ffn.pw_exp.bn.weight"
+#define TN_MNV5_MSFA_FFN_PROJ_W "v.msfa.ffn.pw_proj.conv.weight"
+#define TN_MNV5_MSFA_FFN_PROJ_BN "v.msfa.ffn.pw_proj.bn.weight"
+#define TN_MNV5_MSFA_NORM "v.msfa.norm.weight"
+
+
// align x to upper multiple of n
#define CLIP_ALIGN(x, n) ((((x) + (n) - 1) / (n)) * (n))
PROJECTOR_TYPE_QWEN2VL,
PROJECTOR_TYPE_QWEN3VL,
PROJECTOR_TYPE_GEMMA3,
+ PROJECTOR_TYPE_GEMMA3NV,
+ PROJECTOR_TYPE_GEMMA3NA,
PROJECTOR_TYPE_IDEFICS3,
PROJECTOR_TYPE_PIXTRAL,
PROJECTOR_TYPE_QWEN25VL,
{ PROJECTOR_TYPE_QWEN25VL, "qwen2.5vl_merger"},
{ PROJECTOR_TYPE_QWEN3VL, "qwen3vl_merger"},
{ PROJECTOR_TYPE_GEMMA3, "gemma3"},
+ { PROJECTOR_TYPE_GEMMA3NV, "gemma3nv"},
+ { PROJECTOR_TYPE_GEMMA3NA, "gemma3na"},
{ PROJECTOR_TYPE_IDEFICS3, "idefics3"},
{ PROJECTOR_TYPE_PIXTRAL, "pixtral"},
{ PROJECTOR_TYPE_ULTRAVOX, "ultravox"},
}
};
+// Expanded MobileNetV5 block structure for Gemma3n vision encoder
+struct mobilenetv5_block {
+ // Stage 0 (Edge Residual)
+ ggml_tensor * s0_conv_exp_w = nullptr;
+ ggml_tensor * s0_bn1_w = nullptr;
+ ggml_tensor * s0_conv_pwl_w = nullptr;
+ ggml_tensor * s0_bn2_w = nullptr;
+
+ // Stage 1+ (Universal Inverted Residual)
+ ggml_tensor * dw_start_w = nullptr;
+ ggml_tensor * dw_start_bn_w = nullptr;
+
+ ggml_tensor * pw_exp_w = nullptr;
+ ggml_tensor * pw_exp_bn_w = nullptr;
+
+ ggml_tensor * dw_mid_w = nullptr;
+ ggml_tensor * dw_mid_bn_w = nullptr;
+
+ ggml_tensor * pw_proj_w = nullptr;
+ ggml_tensor * pw_proj_bn_w = nullptr;
+
+ ggml_tensor * layer_scale_w = nullptr;
+
+ // Attention (MQA) components
+ ggml_tensor * attn_q_w = nullptr;
+ ggml_tensor * attn_k_w = nullptr;
+ ggml_tensor * attn_v_w = nullptr;
+ ggml_tensor * attn_o_w = nullptr;
+
+ // Optional downsampling/norm in attention
+ ggml_tensor * attn_k_dw_w = nullptr;
+ ggml_tensor * attn_k_norm_w = nullptr;
+ ggml_tensor * attn_v_dw_w = nullptr;
+ ggml_tensor * attn_v_norm_w = nullptr;
+
+ // Block norm (often present in attention blocks)
+ ggml_tensor * attn_norm_w = nullptr;
+};
+
struct clip_model {
clip_modality modality = CLIP_MODALITY_VISION;
projector_type proj_type = PROJECTOR_TYPE_MLP;
ggml_tensor * mm_input_proj_w = nullptr;
ggml_tensor * mm_soft_emb_norm_w = nullptr;
+ // mobilenetv5 for gemma3n
+ std::vector<mobilenetv5_block> mobilenet_blocks;
+ std::vector<int> mobilenet_stage_ends;
+ ggml_tensor * mobilenet_stem_conv_w = nullptr;
+ ggml_tensor * mobilenet_stem_conv_b = nullptr;
+ ggml_tensor * mobilenet_stem_norm_w = nullptr;
+ ggml_tensor * mm_post_proj_norm_w = nullptr;
+
+ // Multi-Scale Fusion Adapter (MSFA) components
+ ggml_tensor * msfa_concat_conv_w = nullptr;
+ ggml_tensor * msfa_concat_norm_w = nullptr;
+ ggml_tensor * msfa_ffn_expand_w = nullptr;
+ ggml_tensor * msfa_ffn_project_w = nullptr;
+ ggml_tensor * msfa_ffn_expand_bn = nullptr;
+ ggml_tensor * msfa_ffn_project_bn = nullptr;
+
+
// pixtral, glm4v
ggml_tensor * token_embd_img_break = nullptr;
ggml_tensor * mm_patch_merger_w = nullptr;
{
builder = std::make_unique<clip_graph_siglip>(ctx, img);
} break;
+ case PROJECTOR_TYPE_GEMMA3NV:
+ {
+ builder = std::make_unique<clip_graph_mobilenetv5>(ctx, img);
+ } break;
case PROJECTOR_TYPE_PIXTRAL:
case PROJECTOR_TYPE_LIGHTONOCR:
{
// test model (tinygemma3) has a different value, we optionally read it
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
} break;
+
+ case PROJECTOR_TYPE_GEMMA3NV:
+ {
+ // Gemma3n uses MobileNetV5 which produces 256 tokens (16x16)
+ // Similar configuration to Gemma3
+ hparams.n_merge = 1; // MobileNetV5 handles resizing internally
+ get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
+ } break;
case PROJECTOR_TYPE_QWEN2VL:
case PROJECTOR_TYPE_QWEN25VL:
case PROJECTOR_TYPE_QWEN3VL:
model.position_embeddings = get_tensor(string_format(TN_POS_EMBD, prefix), false);
+ if (model.proj_type == PROJECTOR_TYPE_GEMMA3NV) {
+ hparams.n_layer = 0; // gemma3n does not use normal layer structure
+ }
+
// layers
model.layers.resize(hparams.n_layer);
for (int il = 0; il < hparams.n_layer; ++il) {
}
}
+
switch (model.proj_type) {
case PROJECTOR_TYPE_MLP:
case PROJECTOR_TYPE_MLP_NORM:
model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
} break;
+ case PROJECTOR_TYPE_GEMMA3NV:
+ {
+ model.mobilenet_stem_conv_w = get_tensor(TN_MNV5_STEM_CONV, false);
+ model.mobilenet_stem_conv_b = get_tensor(TN_MNV5_STEM_BIAS, false);
+ model.mobilenet_stem_norm_w = get_tensor(TN_MNV5_STEM_BN, false);
+
+ model.msfa_ffn_expand_w = get_tensor(TN_MNV5_MSFA_FFN_EXP_W, false);
+            model.msfa_ffn_expand_bn = get_tensor(TN_MNV5_MSFA_FFN_EXP_BN, false); // BN weight, if present (may already be folded into the conv)
+ model.msfa_ffn_project_w = get_tensor(TN_MNV5_MSFA_FFN_PROJ_W, false);
+ model.msfa_ffn_project_bn = get_tensor(TN_MNV5_MSFA_FFN_PROJ_BN, false);
+
+ model.msfa_concat_norm_w = get_tensor(TN_MNV5_MSFA_NORM, false);
+
+ // Dynamically load blocks stage by stage
+ for (int stage = 0; stage < 4; ++stage) {
+ int blocks_found_in_stage = 0;
+
+ for (int blk_idx = 0; ; ++blk_idx) {
+ bool found_block = false;
+ mobilenetv5_block block;
+
+ // 1. Check for Edge Residual (S0)
+ block.s0_conv_exp_w = get_tensor(string_format(TN_MNV5_BLK_S0_EXP_W, stage, blk_idx), false);
+ if (block.s0_conv_exp_w) {
+ found_block = true;
+ block.s0_bn1_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN1_W, stage, blk_idx), false);
+ block.s0_conv_pwl_w = get_tensor(string_format(TN_MNV5_BLK_S0_PWL_W, stage, blk_idx), false);
+ block.s0_bn2_w = get_tensor(string_format(TN_MNV5_BLK_S0_BN2_W, stage, blk_idx), false);
+ }
+ // 2. Check for UIR (Universal Inverted Residual)
+ else {
+ // Check for dw_start OR pw_exp (some UIR blocks skip dw_start)
+ block.dw_start_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_W, stage, blk_idx), false);
+ block.pw_exp_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_W, stage, blk_idx), false);
+
+ if (block.dw_start_w || block.pw_exp_w) {
+ found_block = true;
+ if (block.dw_start_w) {
+ block.dw_start_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_START_BN, stage, blk_idx), false);
+ }
+ if (block.pw_exp_w) {
+ block.pw_exp_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_EXP_BN, stage, blk_idx), false);
+ }
+ block.dw_mid_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_W, stage, blk_idx), false);
+ if (block.dw_mid_w) {
+ block.dw_mid_bn_w = get_tensor(string_format(TN_MNV5_BLK_DW_MID_BN, stage, blk_idx), false);
+ }
+ block.pw_proj_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_W, stage, blk_idx), false);
+ if (block.pw_proj_w) {
+ block.pw_proj_bn_w = get_tensor(string_format(TN_MNV5_BLK_PW_PROJ_BN, stage, blk_idx), false);
+ }
+ block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
+ }
+ }
+
+ // 3. Check for Attention (MQA)
+ // Even if UIR/Edge check failed, this might be a pure attention block
+ ggml_tensor* attn_q_check = get_tensor(string_format(TN_MNV5_ATTN_Q_W, stage, blk_idx), false);
+ if (attn_q_check) {
+ found_block = true;
+ block.attn_q_w = attn_q_check;
+ block.attn_k_w = get_tensor(string_format(TN_MNV5_ATTN_K_W, stage, blk_idx), false);
+ block.attn_v_w = get_tensor(string_format(TN_MNV5_ATTN_V_W, stage, blk_idx), false);
+ block.attn_o_w = get_tensor(string_format(TN_MNV5_ATTN_O_W, stage, blk_idx), false);
+ block.attn_k_dw_w = get_tensor(string_format(TN_MNV5_ATTN_K_DW, stage, blk_idx), false);
+ block.attn_k_norm_w = get_tensor(string_format(TN_MNV5_ATTN_K_NORM, stage, blk_idx), false);
+ block.attn_v_dw_w = get_tensor(string_format(TN_MNV5_ATTN_V_DW, stage, blk_idx), false);
+ block.attn_v_norm_w = get_tensor(string_format(TN_MNV5_ATTN_V_NORM, stage, blk_idx), false);
+ block.attn_norm_w = get_tensor(string_format(TN_MNV5_ATTN_NORM, stage, blk_idx), false);
+ // Note: Attention blocks also have layer_scale, load it if not already loaded by UIR check
+ if (!block.layer_scale_w) {
+ block.layer_scale_w = get_tensor(string_format(TN_MNV5_BLK_LAYER_SCALE, stage, blk_idx), false);
+ }
+ }
+
+ if (found_block) {
+ model.mobilenet_blocks.push_back(block);
+ blocks_found_in_stage++;
+ } else {
+ // End of blocks for this stage
+ break;
+ }
+ }
+
+ // Track where this stage ends in the flat vector
+ if (blocks_found_in_stage > 0) {
+ model.mobilenet_stage_ends.push_back(model.mobilenet_blocks.size() - 1);
+ LOG_INF("%s: Stage %d ended at global block index %zu\n", __func__, stage, model.mobilenet_blocks.size() - 1);
+ }
+ }
+ model.mm_input_proj_w = get_tensor(TN_MM_INP_PROJ);
+ model.mm_soft_emb_norm_w = get_tensor(TN_MM_SOFT_EMB_N);
+ } break;
case PROJECTOR_TYPE_IDEFICS3:
{
model.projection = get_tensor(TN_MM_PROJECTOR);
try {
clip_model_loader loader(fname);
+ bool skip_audio = false;
if (loader.has_vision) {
ctx_vision = new clip_ctx(ctx_params);
loader.warmup(*ctx_vision);
}
+ // TODO: we don't support audio for Gemma 3N, but GGUF contains audio tensors
+ // we can remove this check when we implement audio support for Gemma 3N
+ skip_audio = ctx_vision->model.proj_type == PROJECTOR_TYPE_GEMMA3NV;
+
// clip_debug_encode(ctx_vision, 24*14, 24*14, 0.5f);
}
- if (loader.has_audio) {
+ if (loader.has_audio && !skip_audio) {
ctx_audio = new clip_ctx(ctx_params);
loader.load_hparams(ctx_audio->model, CLIP_MODALITY_AUDIO);
loader.load_tensors(*ctx_audio);
res_imgs->entries.push_back(std::move(img_f32));
} break;
+ case PROJECTOR_TYPE_GEMMA3NV:
+ {
+ clip_image_u8 resized_image;
+ int sz = params.image_size;
+ img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false);
+ clip_image_f32_ptr img_f32(clip_image_f32_init());
+ normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
+ res_imgs->entries.push_back(std::move(img_f32));
+ } break;
+
case PROJECTOR_TYPE_JANUS_PRO:
{
// Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
int scale_factor = ctx->model.hparams.n_merge;
n_patches /= (scale_factor * scale_factor);
} break;
+ case PROJECTOR_TYPE_GEMMA3NV:
+ {
+                // MobileNetV5's MSFA adapter always outputs a fixed 16x16 grid (256 tokens)
+                // regardless of input size; the converter sets patch_size so that this division yields that count
+ n_patches = ctx->model.hparams.image_size / ctx->model.hparams.patch_size;
+ } break;
case PROJECTOR_TYPE_LFM2:
case PROJECTOR_TYPE_KIMIVL:
{
set_input_i32("patches", patches);
} break;
case PROJECTOR_TYPE_GEMMA3:
+ case PROJECTOR_TYPE_GEMMA3NV:
case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_INTERNVL:
case PROJECTOR_TYPE_QWEN2A:
// main path + deepstack paths
return ctx->model.mm_1_b->ne[0] * (1 + ctx->model.n_deepstack_layers);
case PROJECTOR_TYPE_GEMMA3:
+ case PROJECTOR_TYPE_GEMMA3NV:
return ctx->model.mm_input_proj_w->ne[0];
case PROJECTOR_TYPE_IDEFICS3:
return ctx->model.projection->ne[1];
}
int clip_is_minicpmv(const struct clip_ctx * ctx) {
+ // TODO: remove this function
if (ctx->proj_type() == PROJECTOR_TYPE_MINICPMV) {
return ctx->model.hparams.minicpmv_version;
}
}
bool clip_is_glm(const struct clip_ctx * ctx) {
+ // TODO: remove this function
return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
}
bool clip_is_mrope(const struct clip_ctx * ctx) {
- return ctx->proj_type() == PROJECTOR_TYPE_QWEN2VL
- || ctx->proj_type() == PROJECTOR_TYPE_QWEN25VL
- || ctx->proj_type() == PROJECTOR_TYPE_QWEN3VL
- || ctx->proj_type() == PROJECTOR_TYPE_GLM4V;
+ switch (ctx->proj_type()) {
+ case PROJECTOR_TYPE_QWEN2VL:
+ case PROJECTOR_TYPE_QWEN25VL:
+ case PROJECTOR_TYPE_QWEN3VL:
+ case PROJECTOR_TYPE_GLM4V:
+ return true;
+ default:
+ return false;
+ }
}
bool clip_is_llava(const struct clip_ctx * ctx) {
return ctx->model.hparams.has_llava_projector;
}
-bool clip_is_gemma3(const struct clip_ctx * ctx) {
- return ctx->proj_type() == PROJECTOR_TYPE_GEMMA3;
-}
-
bool clip_has_vision_encoder(const struct clip_ctx * ctx) {
return ctx->model.modality == CLIP_MODALITY_VISION;
}
}
bool clip_has_whisper_encoder(const struct clip_ctx * ctx) {
- return ctx->proj_type() == PROJECTOR_TYPE_ULTRAVOX
- || ctx->proj_type() == PROJECTOR_TYPE_QWEN2A
- || ctx->proj_type() == PROJECTOR_TYPE_GLMA
- || ctx->proj_type() == PROJECTOR_TYPE_VOXTRAL
- || ctx->proj_type() == PROJECTOR_TYPE_MUSIC_FLAMINGO;
+ switch (ctx->proj_type()) {
+ case PROJECTOR_TYPE_ULTRAVOX:
+ case PROJECTOR_TYPE_QWEN2A:
+ case PROJECTOR_TYPE_GLMA:
+ case PROJECTOR_TYPE_VOXTRAL:
+ case PROJECTOR_TYPE_MUSIC_FLAMINGO:
+ return true;
+ default:
+ return false;
+ }
}
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec) {
bool clip_is_glm(const struct clip_ctx * ctx);
bool clip_is_mrope(const struct clip_ctx * ctx);
bool clip_is_llava(const struct clip_ctx * ctx);
-bool clip_is_gemma3(const struct clip_ctx * ctx);
+// note for contributors: this clip_is_(model) pattern is deprecated
+// do NOT add new functions like this
bool clip_encode_float_image (struct clip_ctx * ctx, int n_threads, float * img, int h, int w, float * vec);
--- /dev/null
+#include "models.h"
+
+// Helpers for MobileNetV5 Blocks
+// RMS Norm 2D - normalizes over channels for each spatial position
+ggml_tensor * clip_graph_mobilenetv5::rms_norm_2d(ggml_tensor * inp, ggml_tensor * weight, float eps) {
+ // inp: [W, H, C, B]
+
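+    // ggml_rms_norm normalizes along dim 0, so move the channel dim (2) there first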
+ ggml_tensor * cur = ggml_permute(ctx0, inp, 2, 1, 0, 3);
+ cur = ggml_cont(ctx0, cur);
+ cur = ggml_rms_norm(ctx0, cur, eps);
+
+ if (weight) {
+ cur = ggml_mul(ctx0, cur, weight);
+ }
+
+ cur = ggml_permute(ctx0, cur, 2, 1, 0, 3);
+ cur = ggml_cont(ctx0, cur);
+
+ return cur;
+}
+
+// Conv2dSame padding - asymmetric SAME padding like PyTorch/TF
+ggml_tensor* clip_graph_mobilenetv5::pad_same_2d(ggml_tensor* inp, int kernel_h, int kernel_w, int stride_h, int stride_w, int dilation_h, int dilation_w) {
+ const int64_t ih = inp->ne[1]; // height
+ const int64_t iw = inp->ne[0]; // width
+
+ // Calculate output size (ceil division)
+ const int64_t oh = (ih + stride_h - 1) / stride_h;
+ const int64_t ow = (iw + stride_w - 1) / stride_w;
+
+ // Calculate padding needed
+ const int64_t pad_h = std::max((int64_t)0, (oh - 1) * stride_h + (kernel_h - 1) * dilation_h + 1 - ih);
+ const int64_t pad_w = std::max((int64_t)0, (ow - 1) * stride_w + (kernel_w - 1) * dilation_w + 1 - iw);
+
+ // Split padding asymmetrically
+ const int pad_h_top = pad_h / 2;
+ const int pad_h_bottom = pad_h - pad_h_top;
+ const int pad_w_left = pad_w / 2;
+ const int pad_w_right = pad_w - pad_w_left;
+
+ // Apply padding if needed
+ // ggml_pad_ext: (ctx, tensor, lp0, rp0, lp1, rp1, lp2, rp2, lp3, rp3)
+ // For [W, H, C, B]: p0=width, p1=height, p2=channels, p3=batch
+ if (pad_h > 0 || pad_w > 0) {
+ inp = ggml_pad_ext(ctx0, inp,
+ pad_w_left, pad_w_right, // width padding (dim 0)
+ pad_h_top, pad_h_bottom, // height padding (dim 1)
+ 0, 0, // no channel padding (dim 2)
+ 0, 0); // no batch padding (dim 3)
+ }
+
+ return inp;
+}
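+// Example: the 768x768 stem input with k=3, s=2 gives a 384x384 output and
+// pad = (384 - 1)*2 + 3 - 768 = 1, split as 0 (top/left) and 1 (bottom/right),
+// matching PyTorch's Conv2dSame.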
+
+
+// Edge Residual Block (Stage 0)
+ggml_tensor * clip_graph_mobilenetv5::build_edge_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
+ ggml_tensor * cur = inp;
+
+ // 1. Expansion Conv (3x3)
+ if (stride == 2) {
+ // Case: Downsampling (Block 0)
+ // Replicates Conv2dSame(kernel=3, stride=2)
+ cur = pad_same_2d(cur, 3, 3, stride, stride);
+ cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 0, 0, 1, 1);
+ } else {
+ // Case: Normal 3x3 Block (Block 1, 2)
+ // Replicates Conv2d(kernel=3, stride=1, padding=1)
+ cur = ggml_conv_2d_direct(ctx0, block.s0_conv_exp_w, cur, stride, stride, 1, 1, 1, 1);
+ }
+
+ // BN + Activation
+ if (block.s0_bn1_w) cur = rms_norm_2d(cur, block.s0_bn1_w);
+ cur = ggml_gelu(ctx0, cur);
+
+ // 2. Pointwise Linear Conv (1x1)
+ // 1x1 Convs usually have padding=0 and stride=1
+ cur = ggml_conv_2d_direct(ctx0, block.s0_conv_pwl_w, cur, 1, 1, 0, 0, 1, 1);
+ if (block.s0_bn2_w) cur = rms_norm_2d(cur, block.s0_bn2_w);
+
+ // 3. Residual Connection
+ // Only apply residual if spatial dimensions and channels match (stride 1)
+ if (stride == 1 && inp->ne[2] == cur->ne[2] && inp->ne[0] == cur->ne[0]) {
+ cur = ggml_add(ctx0, cur, inp);
+ }
+
+ return cur;
+}
+
+// Universal Inverted Residual Block (Stage 1+)
+ggml_tensor * clip_graph_mobilenetv5::build_inverted_residual(ggml_tensor * inp, const mobilenetv5_block & block, int stride) {
+ ggml_tensor * cur = inp;
+
+ // 1. Depthwise Start (Optional)
+ // NOTE: dw_start always has stride=1 (no downsampling here)
+ if (block.dw_start_w) {
+ int k = block.dw_start_w->ne[0]; // 3 or 5
+ int p = k / 2;
+ cur = ggml_conv_2d_dw(ctx0, block.dw_start_w, cur, 1, 1, p, p, 1, 1);
+ if (block.dw_start_bn_w) cur = rms_norm_2d(cur, block.dw_start_bn_w);
+ }
+
+ // 2. Pointwise Expansion (1x1)
+ if (block.pw_exp_w) {
+ // Standard 1x1 conv, pad=0, stride=1
+ cur = ggml_conv_2d_direct(ctx0, block.pw_exp_w, cur, 1, 1, 0, 0, 1, 1);
+ if (block.pw_exp_bn_w) cur = rms_norm_2d(cur, block.pw_exp_bn_w);
+ cur = ggml_gelu(ctx0, cur);
+ }
+
+ // 3. Depthwise Mid (Optional)
+ // NOTE: dw_mid is where downsampling happens (stride=2 for first block of stage)
+ if (block.dw_mid_w) {
+ int k = block.dw_mid_w->ne[0]; // 3 or 5
+
+ if (stride > 1) {
+ // Case: Stride 2 (Downsample) -> Use Asymmetric "Same" Padding
+ cur = pad_same_2d(cur, k, k, stride, stride);
+ cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, 0, 0, 1, 1); // pad=0
+ } else {
+ // Case: Stride 1 -> Use Standard Symmetric Padding
+ int p = k / 2;
+ cur = ggml_conv_2d_dw(ctx0, block.dw_mid_w, cur, stride, stride, p, p, 1, 1);
+ }
+
+ if (block.dw_mid_bn_w) cur = rms_norm_2d(cur, block.dw_mid_bn_w);
+ cur = ggml_gelu(ctx0, cur);
+ }
+
+ // 4. Pointwise Projection (1x1)
+ if (block.pw_proj_w) {
+ cur = ggml_conv_2d_direct(ctx0, block.pw_proj_w, cur, 1, 1, 0, 0, 1, 1);
+ if (block.pw_proj_bn_w) cur = rms_norm_2d(cur, block.pw_proj_bn_w);
+ }
+
+ // Apply Layer Scaling if present
+ if (block.layer_scale_w) {
+ cur = ggml_mul(ctx0, cur, block.layer_scale_w);
+ }
+
+ // 5. Residual Connection
+ bool same_spatial = (inp->ne[0] == cur->ne[0]) && (inp->ne[1] == cur->ne[1]);
+ bool same_channel = (inp->ne[2] == cur->ne[2]);
+ if (same_spatial && same_channel) {
+ cur = ggml_add(ctx0, cur, inp);
+ }
+
+ return cur;
+}
+
+// Attention Block (MQA)
+ggml_tensor * clip_graph_mobilenetv5::build_mobilenet_attn(ggml_tensor * inp, const mobilenetv5_block & block) {
+ ggml_tensor * cur = inp;
+
+ // Norm
+ if (block.attn_norm_w) {
+ cur = rms_norm_2d(cur, block.attn_norm_w, 1e-6f);
+ }
+
+ // 1. Q Calculation
+ ggml_tensor * q = ggml_conv_2d_direct(ctx0, block.attn_q_w, cur, 1, 1, 0, 0, 1, 1);
+
+ // 2. K Calculation (Downsampled)
+ // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
+ ggml_tensor * k_inp = cur;
+ if (block.attn_k_dw_w) {
+ int k_size = block.attn_k_dw_w->ne[0]; // Usually 3
+ k_inp = pad_same_2d(cur, k_size, k_size, 2, 2); // Apply SAME padding
+ k_inp = ggml_conv_2d_dw(ctx0, block.attn_k_dw_w, k_inp, 2, 2, 0, 0, 1, 1); // padding=0
+ if (block.attn_k_norm_w) {
+ k_inp = rms_norm_2d(k_inp, block.attn_k_norm_w, 1e-6f);
+ }
+ }
+ ggml_tensor * k = ggml_conv_2d_direct(ctx0, block.attn_k_w, k_inp, 1, 1, 0, 0, 1, 1);
+
+ // 3. V Calculation (Downsampled)
+ // Uses Conv2dSame(640, 640, kernel_size=(3, 3), stride=(2, 2), groups=640)
+ ggml_tensor * v_inp = cur;
+ if (block.attn_v_dw_w) {
+ int v_size = block.attn_v_dw_w->ne[0]; // Usually 3
+ v_inp = pad_same_2d(cur, v_size, v_size, 2, 2); // Apply SAME padding
+ v_inp = ggml_conv_2d_dw(ctx0, block.attn_v_dw_w, v_inp, 2, 2, 0, 0, 1, 1); // padding=0
+ if (block.attn_v_norm_w) {
+ v_inp = rms_norm_2d(v_inp, block.attn_v_norm_w, 1e-6f);
+ }
+ }
+ ggml_tensor * v = ggml_conv_2d_direct(ctx0, block.attn_v_w, v_inp, 1, 1, 0, 0, 1, 1);
+
+ const int W = cur->ne[0]; const int H = cur->ne[1]; const int B = cur->ne[3];
+ const int D = k->ne[2]; // Head dimension
+ const int n_head = q->ne[2] / D;
+ const int N = W * H;
+
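+    // MQA: Q keeps n_head heads while K and V below are collapsed to a single shared head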
+ // Process Q: [W, H, D*n_head, B] -> [D, N, n_head, B]
+ q = ggml_reshape_3d(ctx0, q, N, D*n_head, B);
+ q = ggml_reshape_4d(ctx0, q, N, D, n_head, B);
+ q = ggml_permute(ctx0, q, 1, 0, 2, 3); // [D, N, n_head, B]
+ q = ggml_cont(ctx0, q);
+
+ const int Wk = k->ne[0]; const int Hk = k->ne[1];
+ const int M = Wk * Hk;
+
+ // Process K: [Wk, Hk, D, B] -> [D, M, 1, B]
+ k = ggml_reshape_3d(ctx0, k, M, D, B);
+ k = ggml_reshape_4d(ctx0, k, M, D, 1, B);
+ k = ggml_permute(ctx0, k, 1, 0, 2, 3); // [D, M, 1, B]
+ k = ggml_cont(ctx0, k);
+
+ // Process V: [Wk, Hk, D, B] -> [M, D, 1, B]
+ v = ggml_reshape_3d(ctx0, v, M, D, B);
+ v = ggml_reshape_4d(ctx0, v, M, D, 1, B);
+ v = ggml_cont(ctx0, v); // [M, D, 1, B]
+
+ // Multi-Query Attention
+ float scale = 1.0f / sqrtf((float)D);
+
+ // Step 1: Compute Q @ K.T
+ ggml_tensor * scores = ggml_mul_mat(ctx0, k, q);
+
+ scores = ggml_scale(ctx0, scores, scale);
+
+ scores = ggml_soft_max(ctx0, scores);
+
+ ggml_tensor * kqv = ggml_mul_mat(ctx0, v, scores);
+
+ kqv = ggml_permute(ctx0, kqv, 1, 0, 2, 3);
+ kqv = ggml_cont(ctx0, kqv);
+
+
+ kqv = ggml_reshape_3d(ctx0, kqv, N, D * n_head, B);
+ kqv = ggml_reshape_4d(ctx0, kqv, W, H, D * n_head, B);
+ kqv = ggml_cont(ctx0, kqv);
+
+ // Output projection
+ cur = ggml_conv_2d_direct(ctx0, block.attn_o_w, kqv, 1, 1, 0, 0, 1, 1);
+
+ // Residual & Layer Scale
+ if (inp->ne[0] == cur->ne[0] && inp->ne[2] == cur->ne[2]) {
+ if (block.layer_scale_w) {
+ cur = ggml_mul(ctx0, cur, block.layer_scale_w);
+ }
+ cur = ggml_add(ctx0, cur, inp);
+ }
+
+ return cur;
+}
+
+ggml_cgraph * clip_graph_mobilenetv5::build() {
+ ggml_tensor * inp = build_inp_raw();
+
+ // 1. Stem - Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2))
+ ggml_tensor * cur = pad_same_2d(inp, 3, 3, 2, 2); // Apply SAME padding
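+ // pad_same_2d is assumed to follow the TF/timm "SAME" rule:
+ //   pad_total = max((ceil(in/stride) - 1)*stride + (k - 1)*dilation + 1 - in, 0)
+ // with the extra pixel on the right/bottom. E.g. a hypothetical 512-px input with
+ // k = 3, s = 2 gets pad_total = 1 (0 left, 1 right) and yields a 256-px output.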
+
+ cur = ggml_conv_2d_direct(ctx0, model.mobilenet_stem_conv_w, cur, 2, 2, 0, 0, 1, 1); // padding=0
+ if (model.mobilenet_stem_conv_b) {
+ cur = ggml_add(ctx0, cur, model.mobilenet_stem_conv_b);
+ }
+ if (model.mobilenet_stem_norm_w) cur = rms_norm_2d(cur, model.mobilenet_stem_norm_w);
+ cur = ggml_gelu(ctx0, cur);
+
+ // 2. Blocks
+ std::vector<ggml_tensor*> intermediate_features;
+ const int total_blocks = model.mobilenet_blocks.size();
+
+ auto is_stage_start = [&](int i) {
+ if (i == 0) return true;
+ for (int end_idx : model.mobilenet_stage_ends) {
+ if (i == end_idx + 1) return true;
+ }
+ return false;
+ };
+
+ auto is_fusion_point = [&](int i) {
+ if (model.mobilenet_stage_ends.size() >= 4) {
+ if (i == model.mobilenet_stage_ends[2]) return true; // End of Stage 2
+ if (i == model.mobilenet_stage_ends[3]) return true; // End of Stage 3
+ } else {
+ if (i == total_blocks - 1) return true;
+ }
+ return false;
+ };
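+
+ // Example (hypothetical stage layout): with mobilenet_stage_ends = {2, 10, 26, 38},
+ // blocks 0, 3, 11 and 27 open a new stage and run with stride 2, and features are
+ // tapped after blocks 26 (end of stage 2) and 38 (end of stage 3) for the MSFA.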
+
+ for (int i = 0; i < total_blocks; i++) {
+ const auto & block = model.mobilenet_blocks[i];
+ int stride = is_stage_start(i) ? 2 : 1;
+
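+ // The block type is inferred from which weights are present: a fused expansion
+ // conv marks an edge residual, Q/K/V projections mark an MQA block, and
+ // everything else is treated as a universal inverted residual.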
+ if (block.s0_conv_exp_w) cur = build_edge_residual(cur, block, stride);
+ else if (block.attn_q_w) cur = build_mobilenet_attn(cur, block);
+ else cur = build_inverted_residual(cur, block, stride);
+
+ if (is_fusion_point(i)) {
+ intermediate_features.push_back(cur);
+ }
+ }
+
+ // 3. Multi-Scale Fusion Adapter (MSFA)
+ if (!intermediate_features.empty()) {
+
+ // A. Reference resolution: the PyTorch implementation resizes to inputs[0].
+ // We assume intermediate_features[0] is the high-resolution target, i.e. the
+ // feature map with the smallest stride and largest spatial size (e.g. 32x32).
+ ggml_tensor* target_feat = intermediate_features[0];
+ int high_res_w = target_feat->ne[0];
+ int high_res_h = target_feat->ne[1];
+
+ std::vector<ggml_tensor*> resized_feats;
+
+ // B. Resize inputs to match inputs[0] (High Resolution)
+ for (auto feat : intermediate_features) {
+ int feat_w = feat->ne[0];
+ int feat_h = feat->ne[1];
+
+ // PyTorch: if feat_size < high_resolution: interpolate
+ if (feat_w < high_res_w || feat_h < high_res_h) {
+ // PyTorch 'nearest' interpolation accepts arbitrary float scales, but
+ // ggml_upscale takes an integer factor, so we assume uniform integer
+ // scaling (e.g. 16 -> 32 means a factor of 2).
+ int scale_w = high_res_w / feat_w;
+
+ // Guard against non-integer or non-uniform scaling
+ GGML_ASSERT(high_res_w % feat_w == 0);
+ GGML_ASSERT(high_res_w / feat_w == high_res_h / feat_h);
+
+ // Upsample (nearest neighbor) by scale_w
+ feat = ggml_upscale(ctx0, feat, scale_w, ggml_scale_mode::GGML_SCALE_MODE_NEAREST);
+ }
+ resized_feats.push_back(feat);
+ }
+
+ // C. Concatenate at High Resolution (Channel Dim = 2 in ggml)
+ cur = resized_feats[0];
+ for (size_t k = 1; k < resized_feats.size(); ++k) {
+ cur = ggml_concat(ctx0, cur, resized_feats[k], 2);
+ }
+
+ // D. FFN (UniversalInvertedResidual)
+ // Structure: Expand Conv -> Norm -> GELU -> Project Conv -> Norm
+
+ // 1. Expansion
+ if (model.msfa_ffn_expand_w) {
+ // 1x1 Conv
+ cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_expand_w, cur, 1, 1, 0, 0, 1, 1);
+
+ if (model.msfa_ffn_expand_bn) {
+ cur = rms_norm_2d(cur, model.msfa_ffn_expand_bn);
+ }
+
+ cur = ggml_gelu(ctx0, cur);
+
+ }
+
+ // 2. Projection (1x1 conv only; the MSFA FFN has no depthwise conv, i.e. dw kernel size 0)
+ if (model.msfa_ffn_project_w) {
+ // 1x1 Conv
+ cur = ggml_conv_2d_direct(ctx0, model.msfa_ffn_project_w, cur, 1, 1, 0, 0, 1, 1);
+
+ // UniversalInvertedResidual typically has a norm after projection
+ if (model.msfa_ffn_project_bn) {
+ cur = rms_norm_2d(cur, model.msfa_ffn_project_bn);
+ }
+
+ }
+
+ // E. Final Downsample to Target Resolution (Output Resolution)
+ // PyTorch: matches self.output_resolution (e.g. 16x16)
+ const int target_out_res = 16;
+ int current_w = cur->ne[0];
+
+ if (current_w > target_out_res) {
+ int s = current_w / target_out_res;
+
+ GGML_ASSERT(current_w % target_out_res == 0);
+
+ // Avg Pool: Kernel=s, Stride=s
+ cur = ggml_pool_2d(ctx0, cur, GGML_OP_POOL_AVG, s, s, s, s, 0, 0);
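+ // e.g. a 32x32 fused map with target_out_res = 16 gives s = 2, i.e. 2x2
+ // average pooling with stride 2.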
+
+ }
+
+ // F. Final Norm
+ if (model.msfa_concat_norm_w) {
+ cur = rms_norm_2d(cur, model.msfa_concat_norm_w);
+
+ }
+ }
+
+ // 4. Gemma 3n Multimodal Projection (Embedder)
+ // Input: 'cur' is [Width, Height, Channels, Batch]
+ int W = cur->ne[0];
+ int H = cur->ne[1];
+ int C = cur->ne[2];
+ int B = cur->ne[3];
+
+ GGML_ASSERT(C == hparams.n_embd);
+
+ // 1. Permute and Flatten to [Channels, Tokens, Batch]
+ // PyTorch expects (Batch, Seq, Hidden), GGML usually processes (Hidden, Seq, Batch)
+ cur = ggml_permute(ctx0, cur, 2, 1, 0, 3); // -> [C, H, W, B]
+ cur = ggml_permute(ctx0, cur, 0, 2, 1, 3); // -> [C, W, H, B]
+ cur = ggml_cont(ctx0, cur);
+ cur = ggml_reshape_3d(ctx0, cur, C, W*H, B);
+ cur = ggml_cont(ctx0, cur);
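+ // e.g. with the 16x16 output resolution above and C = 2048 this gives
+ // [2048, 256, B]: 256 image tokens, each of width C.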
+
+ // 2. FEATURE SCALING
+ // PyTorch: vision_outputs *= self.config.vision_config.hidden_size**0.5
+ const float scale_factor = sqrtf((float)C);
+ cur = ggml_scale(ctx0, cur, scale_factor);
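+ // e.g. assuming a vision hidden size of 2048 (cf. the soft-emb norm weight
+ // below), scale_factor = sqrt(2048) ~= 45.25.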
+
+ // 3. SOFT EMBEDDING NORM
+ // PyTorch: self._norm(x) * self.weight
+ // We must normalize regardless, then multiply if weight exists.
+ {
+ const float eps = 1e-6f; // Gemma3n uses 1e-6
+ cur = ggml_rms_norm(ctx0, cur, eps);
+
+ if (model.mm_soft_emb_norm_w) {
+ // Weight shape is (2048,) -> Element-wise broadcast multiply
+ cur = ggml_mul(ctx0, cur, model.mm_soft_emb_norm_w);
+ }
+
+ }
+
+ // 4. PROJECTION
+ // PyTorch: embedding_projection = nn.Linear(vision_hidden, text_hidden, bias=False)
+ // Weight stored as [out_features, in_features] = [text_hidden_size, vision_hidden_size]
+ if (model.mm_input_proj_w) {
+ cur = ggml_mul_mat(ctx0, model.mm_input_proj_w, cur);
+ }
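+ // ggml_mul_mat contracts over ne[0], so with the projection weight laid out as
+ // ne = [vision_hidden, text_hidden] (the usual gguf layout for a Linear weight),
+ // cur [C, tokens, B] maps to [text_hidden, tokens, B].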
+
+ // 5. POST PROJECTION NORM
+ // PyTorch: embedding_post_projection_norm = Gemma3nRMSNorm(..., with_scale=False)
+ // with_scale=False means weight is registered as buffer with value 1.0
+ // So output = rms_norm(x) * 1.0 = rms_norm(x), magnitude ~1
+ {
+ const float eps = 1e-6f;
+ cur = ggml_rms_norm(ctx0, cur, eps);
+
+ if (model.mm_post_proj_norm_w) {
+ // If weight is loaded, multiply (should be ~1.0 anyway)
+ cur = ggml_mul(ctx0, cur, model.mm_post_proj_norm_w);
+ }
+ }
+
+ ggml_build_forward_expand(gf, cur);
+ return gf;
+}
clip_graph_glm4v(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
};
+
+struct clip_graph_mobilenetv5 : clip_graph {
+ clip_graph_mobilenetv5(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+ ggml_cgraph * build() override;
+
+ ggml_tensor * rms_norm_2d(
+ ggml_tensor * inp,
+ ggml_tensor * weight,
+ float eps = 1e-6f);
+
+ ggml_tensor* pad_same_2d(
+ ggml_tensor* inp,
+ int kernel_h,
+ int kernel_w,
+ int stride_h,
+ int stride_w,
+ int dilation_h = 1,
+ int dilation_w = 1);
+
+ ggml_tensor * build_edge_residual(
+ ggml_tensor * inp,
+ const mobilenetv5_block & block,
+ int stride);
+
+ ggml_tensor * build_inverted_residual(
+ ggml_tensor * inp,
+ const mobilenetv5_block & block,
+ int stride);
+
+ ggml_tensor * build_mobilenet_attn(
+ ggml_tensor * inp,
+ const mobilenetv5_block & block);
+};
}
// set boi/eoi
- if (proj == PROJECTOR_TYPE_GEMMA3) {
+ if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) {
// <start_of_image> ... (image embeddings) ... <end_of_image>
img_beg = "<start_of_image>";
img_end = "<end_of_image>";
}
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
- if (ctx->ctx_v && clip_get_projector_type(ctx->ctx_v) == PROJECTOR_TYPE_GEMMA3) {
- return true;
+ switch (ctx->proj_type_v()) {
+ case PROJECTOR_TYPE_GEMMA3:
+ case PROJECTOR_TYPE_GEMMA3NV:
+ return true;
+ default:
+ return false;
}
- return false;
}
bool mtmd_decode_use_mrope(mtmd_context * ctx) {