yield from super().modify_tensors(data_torch, name, bid)
+@ModelBase.register(
+ "NemotronH_Nano_VL_V2",
+ "RADIOModel",
+)
+class NemotronNanoV2VLModel(MmprojModel):
+ # Converter for the vision side of a Nemotron Nano v2 VL checkpoint.
+ # Only tensors named "vision_model.radio_model.model.*" or "mlp1.*" are forwarded to the
+ # base class (see modify_tensors); "mlp1." is presumably the multimodal projector.
+ #
+ # ViT-Huge architecture parameters for RADIO v2.5-h
+ _vit_hidden_size = 1280
+ _vit_intermediate_size = 5120
+ _vit_num_layers = 32
+ _vit_num_heads = 16
+
+ def get_vision_config(self) -> dict[str, Any] | None:
+ # Returns a new dict built from global_config["vision_config"] plus the hard-coded ViT-H
+ # sizes above and an "image_size" entry; returns None when "vision_config" is absent.
+ # RADIO config doesn't have standard ViT parameters, so they need to be constructed manually
+ vision_config = self.global_config.get("vision_config")
+ if vision_config is None:
+ return None
+ # Add ViT-H parameters
+ # Keys listed after **vision_config override same-named keys coming from the checkpoint.
+ vision_config = {
+ **vision_config,
+ "hidden_size": self._vit_hidden_size,
+ "intermediate_size": self._vit_intermediate_size,
+ "num_hidden_layers": self._vit_num_layers,
+ "num_attention_heads": self._vit_num_heads,
+ # Falls back to 512 when the checkpoint does not set "force_image_size".
+ "image_size": self.global_config.get("force_image_size", 512),
+ }
+ return vision_config
+
+ def set_gguf_parameters(self):
+ # Writes the projector type, layernorm eps, GELU flag and projector scale factor on top of
+ # whatever the base class writes.
+ # Normalization defaults are injected before calling the base implementation, presumably
+ # because the base class reads image_mean/image_std from preprocessor_config.
+ # NOTE(review): the fallback values look like the standard ImageNet mean/std - confirm they
+ # match the model's actual preprocessing.
+ if "image_mean" not in self.preprocessor_config:
+ self.preprocessor_config["image_mean"] = [0.485, 0.456, 0.406]
+ if "image_std" not in self.preprocessor_config:
+ self.preprocessor_config["image_std"] = [0.229, 0.224, 0.225]
+
+ super().set_gguf_parameters()
+ hparams = self.global_config
+ self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.NEMOTRON_V2_VL)
+ self.gguf_writer.add_vision_attention_layernorm_eps(1e-6)
+ self.gguf_writer.add_vision_use_gelu(True)
+ # Scale factor is the inverse of "downsample_ratio" (default 0.5 -> 2).
+ # int() truncates, so a ratio whose inverse is not a whole number is rounded down.
+ downsample_ratio = hparams.get("downsample_ratio", 0.5)
+ self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
+
+ def tensor_force_quant(self, name, new_name, bid, n_dims):
+ # Forces F32 for position-embedding tensors (matched by substring on the mapped name);
+ # every other tensor uses the base-class decision.
+ if ".position_embd." in new_name or "pos_embed" in new_name:
+ return gguf.GGMLQuantizationType.F32
+ return super().tensor_force_quant(name, new_name, bid, n_dims)
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # Generator: yields (new_name, tensor) pairs produced by the base class for RADIO encoder
+ # and "mlp1." tensors, after adapting the position embedding and the patch embedder.
+ # Tensors with any other name prefix are dropped without yielding anything.
+ # "input_conditioner" tensors are skipped entirely.
+ if "input_conditioner" in name:
+ return
+
+ # RADIO's pos_embed doesn't have .weight suffix, but clip.cpp expects it
+ if "patch_generator.pos_embed" in name:
+ if not name.endswith(".weight"):
+ name += ".weight"
+ # Downsample position embeddings for fixed 512x512 image size
+ # The target size actually comes from "force_image_size" (512 is only the default), and
+ # the resize runs whenever the stored grid side differs from the target side.
+ import torch.nn.functional as F
+ n_embd = self.hparams["hidden_size"]
+ image_size = self.global_config.get("force_image_size", 512)
+ patch_size = self.hparams["patch_size"]
+ # The "# 32" / "# 128" annotations below describe the expected default case only.
+ target_patches_per_side = image_size // patch_size # 32
+ # assumes data_torch is (1, side*side, n_embd) with a square grid - the reshape below
+ # fails otherwise; TODO confirm no extra (e.g. cls) positions are stored in this tensor
+ max_patches_per_side = int((data_torch.shape[1]) ** 0.5) # 128
+ if target_patches_per_side != max_patches_per_side:
+ # Reshape to grid, interpolate, flatten back
+ data_torch = data_torch.reshape(1, max_patches_per_side, max_patches_per_side, n_embd)
+ # .float() converts to float32 for interpolation; the result is not cast back afterwards.
+ data_torch = data_torch.permute(0, 3, 1, 2).float() # [1, n_embd, 128, 128]
+ data_torch = F.interpolate(data_torch, size=(target_patches_per_side, target_patches_per_side),
+ mode='bilinear', align_corners=True)
+ data_torch = data_torch.permute(0, 2, 3, 1) # [1, 32, 32, n_embd]
+ data_torch = data_torch.reshape(1, target_patches_per_side * target_patches_per_side, n_embd)
+
+ # Reshape linear patch embedding to conv2d format for ggml_conv_2d
+ # From [n_embd, patch_size*patch_size*3] to [n_embd, 3, patch_size, patch_size]
+ # NOTE(review): this is a plain reshape, so it presumes the flattened input axis is already
+ # ordered (channel, row, col) - confirm against RADIO's patch flattening order.
+ if "patch_generator.embedder" in name:
+ patch_size = self.hparams["patch_size"]
+ n_embd = self.hparams["hidden_size"]
+ data_torch = data_torch.reshape(n_embd, 3, patch_size, patch_size)
+
+ # Only RADIO encoder tensors and "mlp1." tensors are passed on for name mapping.
+ if name.startswith("vision_model.radio_model.model.") or name.startswith("mlp1."):
+ yield from super().modify_tensors(data_torch, name, bid)
+
+
@ModelBase.register("WavTokenizerDec")
class WavTokenizerDecModel(TextModel):
model_arch = gguf.MODEL_ARCH.WAVTOKENIZER_DEC
if hparams is None:
with open(dir_model / "config.json", "r", encoding="utf-8") as f:
hparams = json.load(f)
+ if "llm_config" in hparams:
+ hparams["text_config"] = hparams["llm_config"]
super().__init__(dir_model, *args, hparams=hparams, **kwargs)
self.d_model = self.find_hparam(["hidden_size", "d_model", "dim"])
self.d_inner = self.find_hparam(["mamba_d_ssm", "intermediate_size", "d_inner"], optional=True) or 2 * self.d_model
self.gguf_writer.add_add_bos_token(True)
def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # Skip vision model and projector tensors of VLM checkpoints (e.g., Nemotron Nano 12B v2 VL); they are handled by mmproj
+ if name.startswith(("vision_model.", "mlp1.")):
+ return
+
+ # Strip language_model. prefix for VLM models (e.g., Nemotron Nano 12B v2 VL)
+ if name.startswith("language_model."):
+ name = name[len("language_model."):]
+
if self.is_moe and bid is not None:
if name.endswith("mixer.gate.e_score_correction_bias"):
new_name = name.replace("e_score_correction_bias", "e_score_correction.bias")
MUSIC_FLAMINGO = "musicflamingo" # audio
GLM4V = "glm4v"
YOUTUVL = "youtuvl"
+ NEMOTRON_V2_VL = "nemotron_v2_vl"
# Items here are (block size, type size)
"model.vision_tower.embeddings.cls_token", # Intern-S1
"vision_model.class_embedding", # llama 4
"model.vision.patch_embedding.cls_embedding", # cogvlm
+ "vision_model.radio_model.model.patch_generator.cls_token.token", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_EMBD_PATCH: (
"vision_tower.patch_embed.proj", # kimi-vl
"model.vision.patch_embedding.proj", # cogvlm
"siglip2.vision_model.embeddings.patch_embedding",
+ "vision_model.radio_model.model.patch_generator.embedder", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_EMBD_NORM: (
"visual.pos_embed", # qwen3vl
"model.vision.patch_embedding.position_embedding", # cogvlm
"visual.embeddings.position_embedding", # glm4v
+ "vision_model.radio_model.model.patch_generator.pos_embed", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_ATTN_QKV: (
"visual.blocks.{bid}.attn.qkv", # qwen3vl
"model.vision.transformer.layers.{bid}.attention.query_key_value", # cogvlm
- "vision_tower.encoder.blocks.{bid}.wqkv" # Kimi-K2.5
+ "vision_tower.encoder.blocks.{bid}.wqkv", # Kimi-K2.5
+ "vision_model.radio_model.model.blocks.{bid}.attn.qkv", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_ATTN_Q: (
"vision_tower.encoder.blocks.{bid}.norm0", # kimi-vl (norm0/norm1)
"model.vision.transformer.layers.{bid}.input_layernorm", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.layer_norm1",
+ "vision_model.radio_model.model.blocks.{bid}.norm1", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_ATTN_O: (
"vision_tower.encoder.blocks.{bid}.wo", # kimi-vl
"model.vision.transformer.layers.{bid}.attention.dense", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.self_attn.out_proj", # youtuvl
+ "vision_model.radio_model.model.blocks.{bid}.attn.proj", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_POST_ATTN_NORM: (
"vision_tower.encoder.blocks.{bid}.norm1", # kimi-vl (norm0/norm1)
"model.vision.transformer.layers.{bid}.post_attention_layernorm", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.layer_norm2",
+ "vision_model.radio_model.model.blocks.{bid}.norm2", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_FFN_UP: (
"vision_tower.encoder.blocks.{bid}.mlp.fc0", # kimi-vl (fc0/fc1)
"model.vision.transformer.layers.{bid}.mlp.fc1", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc1",
+ "vision_model.radio_model.model.blocks.{bid}.mlp.fc1", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_ENC_FFN_GATE: (
"vision_tower.encoder.blocks.{bid}.mlp.fc1", # kimi-vl (fc0/fc1)
"model.vision.transformer.layers.{bid}.mlp.fc2", # cogvlm
"siglip2.vision_model.encoder.layers.{bid}.mlp.fc2",
+ "vision_model.radio_model.model.blocks.{bid}.mlp.fc2", # Nemotron Nano v2 VL
),
MODEL_TENSOR.V_LAYER_SCALE_1: (
models/internvl.cpp
models/kimivl.cpp
models/kimik25.cpp
+ models/nemotron-v2-vl.cpp
models/llama4.cpp
models/llava.cpp
models/minicpmv.cpp
PROJECTOR_TYPE_GLM4V,
PROJECTOR_TYPE_YOUTUVL,
PROJECTOR_TYPE_KIMIK25,
+ PROJECTOR_TYPE_NEMOTRON_V2_VL,
PROJECTOR_TYPE_UNKNOWN,
};
{ PROJECTOR_TYPE_GLM4V, "glm4v"},
{ PROJECTOR_TYPE_YOUTUVL, "youtuvl"},
{ PROJECTOR_TYPE_KIMIK25, "kimik25"},
+ { PROJECTOR_TYPE_NEMOTRON_V2_VL, "nemotron_v2_vl"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {
FFN_GELU_ERF,
FFN_SILU,
FFN_GELU_QUICK,
+ FFN_RELU_SQR,
};
enum norm_type {
cur = ggml_gelu_quick(ctx0, cur);
cb(cur, "ffn_gelu_quick", il);
} break;
+ case FFN_RELU_SQR:
+ {
+ cur = ggml_relu(ctx0, cur);
+ cur = ggml_sqr(ctx0, cur);
+ cb(cur, "ffn_relu_sqr", il);
+ } break;
}
if (down) {
{
builder = std::make_unique<clip_graph_internvl>(ctx, img);
} break;
+ case PROJECTOR_TYPE_NEMOTRON_V2_VL:
+ {
+ builder = std::make_unique<clip_graph_nemotron_v2_vl>(ctx, img);
+ } break;
case PROJECTOR_TYPE_LLAMA4:
{
builder = std::make_unique<clip_graph_llama4>(ctx, img);
}
} break;
case PROJECTOR_TYPE_INTERNVL:
+ case PROJECTOR_TYPE_NEMOTRON_V2_VL:
{
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
} break;
model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
model.mm_3_b = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "bias"));
} break;
+ case PROJECTOR_TYPE_NEMOTRON_V2_VL:
+ {
+ model.mm_0_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 0, "weight"));
+ model.mm_1_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 1, "weight"));
+ model.mm_3_w = get_tensor(string_format(TN_MVLM_PROJ_MLP, 3, "weight"));
+ } break;
case PROJECTOR_TYPE_GLMA:
{
model.conv1d_1_w = get_tensor(string_format(TN_CONV1D, 1, "weight"));
case PROJECTOR_TYPE_GLM_EDGE:
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution
+ case PROJECTOR_TYPE_NEMOTRON_V2_VL:
{
clip_image_u8 resized_image;
int sz = params.image_size;
case PROJECTOR_TYPE_GEMMA3:
case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_INTERNVL:
+ case PROJECTOR_TYPE_NEMOTRON_V2_VL:
case PROJECTOR_TYPE_LLAMA4:
{
// both X and Y are downscaled by the scale factor
case PROJECTOR_TYPE_GEMMA3NV:
case PROJECTOR_TYPE_IDEFICS3:
case PROJECTOR_TYPE_INTERNVL:
+ case PROJECTOR_TYPE_NEMOTRON_V2_VL:
case PROJECTOR_TYPE_QWEN2A:
case PROJECTOR_TYPE_GLMA:
case PROJECTOR_TYPE_ULTRAVOX:
case PROJECTOR_TYPE_MUSIC_FLAMINGO:
return ctx->model.mm_2_w->ne[1];
case PROJECTOR_TYPE_INTERNVL:
+ case PROJECTOR_TYPE_NEMOTRON_V2_VL:
return ctx->model.mm_3_w->ne[1];
case PROJECTOR_TYPE_LLAMA4:
return ctx->model.mm_model_proj->ne[1];
ggml_cgraph * build() override;
};
+// Graph builder selected for PROJECTOR_TYPE_NEMOTRON_V2_VL.
+// Only declares the constructor (forwarding to clip_graph) and the build() override;
+// build() is implemented out of line.
+struct clip_graph_nemotron_v2_vl : clip_graph {
+ clip_graph_nemotron_v2_vl(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
+ ggml_cgraph * build() override;
+};
+
struct clip_graph_llama4 : clip_graph {
clip_graph_llama4(clip_ctx * ctx, const clip_image_f32 & img) : clip_graph(ctx, img) {}
ggml_cgraph * build() override;
--- /dev/null
+#include "models.h"
+
+// Builds the compute graph for the Nemotron Nano v2 VL vision path:
+// input embedding + position embedding -> prepend class_embedding rows -> ViT ->
+// drop the prepended rows -> patch merge -> RMS norm + 2-layer FFN projector.
+// Returns the graph `gf` with the projector output as its final expanded node.
+ggml_cgraph * clip_graph_nemotron_v2_vl::build() {
+ // Both tensors are required by this graph; abort early if the model file lacks them.
+ GGML_ASSERT(model.class_embedding != nullptr);
+ GGML_ASSERT(model.position_embeddings != nullptr);
+
+ // class_embedding may hold several rows (ne[1]); all of them are prepended to the patch
+ // sequence and removed again after the ViT (see the view below).
+ const int n_registers = model.class_embedding->ne[1];
+ const int n_pos = n_patches + n_registers;
+
+ // presumably the patch embeddings of the input image - build_inp() is not visible here
+ ggml_tensor * inp = build_inp();
+
+ // add position embeddings (pre-downsampled during GGUF conversion for fixed 512x512 input)
+ inp = ggml_add(ctx0, inp, model.position_embeddings);
+ cb(inp, "inp_pos", -1);
+
+ // Concatenate along dim 1 with class_embedding first, so the extra tokens precede the
+ // patches and carry no position embedding (it was added to the patches only).
+ inp = ggml_concat(ctx0, model.class_embedding, inp, 1);
+
+ // The two trailing nullptr arguments are left unset; their meaning is defined by build_vit.
+ ggml_tensor * cur = build_vit(inp, n_pos, NORM_TYPE_NORMAL, hparams.ffn_op, nullptr, nullptr);
+
+ // Keep only the n_patches patch rows: the byte offset skips the first n_registers rows.
+ // assumes `cur` is laid out as contiguous rows of n_embd elements - TODO confirm
+ cur = ggml_view_2d(ctx0, cur,
+ n_embd, n_patches,
+ ggml_row_size(cur->type, n_embd),
+ n_registers * ggml_row_size(cur->type, n_embd));
+
+ // n_merge is read from the projector scale factor key for this projector type;
+ // presumably neighbouring patches are merged by that factor in both directions.
+ cur = build_patch_merge_permute(cur, model.hparams.n_merge);
+
+ // Projector: RMS norm weighted by mm_0_w (eps 1e-6), then an FFN using mm_1_w and mm_3_w
+ // with the FFN_RELU_SQR activation; every bias/gate argument is nullptr.
+ // presumably the build_ffn argument order is (up, up_b, gate, gate_b, down, down_b) - verify
+ {
+ cur = build_norm(cur, model.mm_0_w, nullptr, NORM_TYPE_RMS, 1e-6, -1);
+ cur = build_ffn(cur, model.mm_1_w, nullptr, nullptr, nullptr, model.mm_3_w, nullptr, FFN_RELU_SQR, -1);
+ }
+
+ ggml_build_forward_expand(gf, cur);
+
+ return gf;
+}