        return [(self.map_tensor_name(name), data_torch)]
+
+@ModelBase.register("JanusForConditionalGeneration")
+class JanusProModel(LlamaModel):
+ model_arch = gguf.MODEL_ARCH.LLAMA # reuse Llama arch
+
+ def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+ # Skip vision, aligner, and generation tensors
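+        # (vision and aligner weights are exported separately via
+        # JanusProVisionModel below; the VQ image-generation stack is not
+        # needed for image understanding)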
+        skip_prefixes = (
+            'model.vision_model.',
+            'model.aligner.',
+            'model.vqmodel.',
+            'model.generation_embeddings.',
+            'model.generation_aligner.',
+            'model.generation_head.',
+        )
+        if name.startswith(skip_prefixes):
+            return []
+
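+        # the HF checkpoint may prefix the LM weights with 'model.language_model.'
+        # or 'language_model.'; strip it so the names match the plain Llama layout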
+        if name.startswith('model.language_model.'):
+            name = name.replace('model.language_model.', 'model.')
+        elif name.startswith('language_model.'):
+            name = name.replace('language_model.', '')
+
+        return super().modify_tensors(data_torch, name, bid)
+
+
+@ModelBase.register("JanusForConditionalGeneration")
+class JanusProVisionModel(MmprojModel):
+ def __init__(self, *args, **kwargs):
+ super().__init__(*args, **kwargs)
+ assert self.hparams_vision is not None
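+        # Janus' vision config gives the FFN width as a multiplier (mlp_ratio)
+        # rather than an absolute intermediate_size, so derive it here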
+ if "intermediate_size" not in self.hparams_vision:
+ mlp_ratio = self.hparams_vision.get("mlp_ratio")
+ hidden_size = self.hparams_vision.get("hidden_size")
+ if mlp_ratio is not None and hidden_size is not None:
+ self.hparams_vision["intermediate_size"] = int(round(hidden_size * mlp_ratio))
+
+    def set_gguf_parameters(self):
+        super().set_gguf_parameters()
+        assert self.hparams_vision is not None
+
+        self.gguf_writer.add_clip_projector_type(gguf.VisionProjectorType.JANUS_PRO)
+
+        self.gguf_writer.add_vision_attention_layernorm_eps(self.hparams_vision.get("layer_norm_eps", 1e-6))
+
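+        # use_gelu / use_silu select the projector's ffn_op on the C++ side
+        # (cf. hparams.ffn_op in the Janus Pro branch of build_siglip)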
+        hidden_act = str(self.hparams_vision.get("hidden_act", "")).lower()
+        if hidden_act == "gelu":
+            self.gguf_writer.add_vision_use_gelu(True)
+        elif hidden_act == "silu":
+            self.gguf_writer.add_vision_use_silu(True)
+
+    def _map_aligner_tensor(self, data_torch: Tensor, name: str) -> Iterable[tuple[str, Tensor]]:
+        """Map aligner tensors to projector format"""
+ suffix = ".bias" if name.endswith(".bias") else ".weight"
+
+ if name.startswith("model.aligner."):
+ local_name = name[len("model.aligner."):]
+ elif name.startswith("aligner."):
+ local_name = name[len("aligner."):]
+ else:
+ raise ValueError(f"Unsupported Janus aligner prefix: {name}")
+
+ if local_name.startswith("fc1."):
+ mm_index = 0
+ elif local_name.startswith("hidden_layers."):
+ parts = local_name.split(".", 2)
+ if len(parts) < 3:
+ raise ValueError(f"Unexpected Janus aligner tensor name: {name}")
+ mm_index = int(parts[1]) + 1
+ else:
+ raise ValueError(f"Unsupported Janus aligner tensor: {name}")
+
+ tensor_name = self.format_tensor_name(gguf.MODEL_TENSOR.V_MMPROJ, mm_index, suffix=suffix)
+ return [(tensor_name, data_torch)]
+
+    def modify_tensors(self, data_torch: Tensor, name: str, bid: int | None) -> Iterable[tuple[str, Tensor]]:
+        del bid  # unused
+
+        # Skip language model tensors as they will be handled by `JanusProModel`
+        if name.startswith(('model.language_model.', 'language_model.')):
+            return []
+
+        # Skip generation-related components
+        skip_generation_prefixes = (
+            'model.vqmodel.',
+            'vqmodel.',
+            'model.generation_embeddings.',
+            'generation_embeddings.',
+            'model.generation_aligner.',
+            'generation_aligner.',
+            'model.generation_head.',
+            'generation_head.',
+        )
+        if name.startswith(skip_generation_prefixes):
+            return []
+
+        # Handle aligner tensors
+        if name.startswith(('model.aligner.', 'aligner.')):
+            return list(self._map_aligner_tensor(data_torch, name))
+
+        # Handle vision tensors
+        if name.startswith(('model.vision_model.', 'vision_model.')):
+            return [(self.map_tensor_name(name), data_torch)]
+
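+        # anything not matched above is intentionally dropped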
+        return []
+
+
###### CONVERSION LOGIC ######
KIMIVL = "kimivl"
LIGHTONOCR = "lightonocr"
COGVLM = "cogvlm"
+ JANUS_PRO = "janus_pro"
# Items here are (block size, type size)
"model.mm_projector.mlp.mlp.{bid}",
"vision_model.vision_adapter.mlp.fc{bid}", # llama 4
"mlp1.{bid}", # InternVL
+ "model.aligner.fc1.hidden_layers.{bid}", # Janus Pro
),
    MODEL_TENSOR.V_ENC_OUTPUT: (
        "model.vision_tower.encoder.layer.{bid}.attention.projection_layer", # Intern-S1
        "vpm.encoder.layers.{bid}.self_attn.out_proj",
        "model.vision_model.encoder.layers.{bid}.self_attn.out_proj", # SmolVLM
+        "model.vision_model.encoder.layers.{bid}.self_attn.projection_layer", # Janus Pro
        "vision_model.model.layers.{bid}.self_attn.o_proj", # llama4
        "vision_tower.transformer.layers.{bid}.attention.o_proj", # pixtral-hf
        "vision_encoder.transformer.layers.{bid}.attention.wo", # pixtral
    PROJECTOR_TYPE_KIMIVL,
    PROJECTOR_TYPE_LIGHTONOCR,
    PROJECTOR_TYPE_COGVLM,
+    PROJECTOR_TYPE_JANUS_PRO,
    PROJECTOR_TYPE_UNKNOWN,
};
    { PROJECTOR_TYPE_KIMIVL,    "kimivl"},
    { PROJECTOR_TYPE_LIGHTONOCR,"lightonocr"},
    { PROJECTOR_TYPE_COGVLM,    "cogvlm"},
+    { PROJECTOR_TYPE_JANUS_PRO, "janus_pro"},
};
static projector_type clip_projector_type_from_string(const std::string & str) {
            cur = ggml_gelu(ctx0, cur);
            cur = ggml_mul_mat(ctx0, model.mm_2_w, cur);
            cur = ggml_add(ctx0, cur, model.mm_2_b);
+
+        } else if (ctx->proj_type() == PROJECTOR_TYPE_JANUS_PRO) {
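+            // aligner = two-layer MLP: mm.0 (fc1) -> activation -> mm.1;
+            // ffn_op comes from the use_gelu/use_silu flag set at conversion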
+            cur = build_ffn(cur,
+                    model.mm_0_w, model.mm_0_b,
+                    nullptr, nullptr,
+                    model.mm_1_w, model.mm_1_b,
+                    hparams.ffn_op,
+                    -1);
+
        } else {
            GGML_ABORT("SigLIP: Unsupported projector type");
        }
        return gf;
    }
-
    // whisper encoder with custom projector
    ggml_cgraph * build_whisper_enc() {
        const int n_frames = img.nx;
            {
                res = graph.build_kimivl();
            } break;
+        case PROJECTOR_TYPE_JANUS_PRO:
+            {
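+                // reuse the SigLIP graph; only the projector differs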
+                res = graph.build_siglip();
+            } break;
        case PROJECTOR_TYPE_COGVLM:
            {
                res = graph.build_cogvlm();
                model.mm_boi = get_tensor(TN_TOK_BOI);
                model.mm_eoi = get_tensor(TN_TOK_EOI);
            } break;
+        case PROJECTOR_TYPE_JANUS_PRO:
+            {
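+                // TN_LLAVA_PROJ expands to "mm.%d.%s": mm.0 is the aligner's
+                // fc1, mm.1 its hidden layer (see _map_aligner_tensor in the
+                // conversion script)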
+                model.mm_0_w = get_tensor(string_format(TN_LLAVA_PROJ, 0, "weight"));
+                model.mm_0_b = get_tensor(string_format(TN_LLAVA_PROJ, 0, "bias"));
+                model.mm_1_w = get_tensor(string_format(TN_LLAVA_PROJ, 1, "weight"));
+                model.mm_1_b = get_tensor(string_format(TN_LLAVA_PROJ, 1, "bias"));
+            } break;
        default:
            GGML_ASSERT(false && "unknown projector type");
    }
                res_imgs->entries.push_back(std::move(img_f32));
            } break;
+        case PROJECTOR_TYPE_JANUS_PRO:
+            {
+                // Janus Pro preprocessing: pad to a square with gray (127,127,127),
+                // then resize to 384x384
+                const std::array<uint8_t, 3> pad_color = {127, 127, 127};
+                clip_image_u8 resized_image;
+                int sz = params.image_size;
+                img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
+                clip_image_f32_ptr img_f32(clip_image_f32_init());
+                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
+                res_imgs->entries.push_back(std::move(img_f32));
+            } break;
+
        case PROJECTOR_TYPE_PIXTRAL:
        case PROJECTOR_TYPE_LIGHTONOCR:
            {
    switch (proj) {
        case PROJECTOR_TYPE_MLP:
        case PROJECTOR_TYPE_MLP_NORM:
+        case PROJECTOR_TYPE_JANUS_PRO:
            {
                // do nothing
            } break;
        case PROJECTOR_TYPE_ULTRAVOX:
        case PROJECTOR_TYPE_LFM2:
        case PROJECTOR_TYPE_VOXTRAL:
+        case PROJECTOR_TYPE_JANUS_PRO:
        case PROJECTOR_TYPE_COGVLM:
            {
                // do nothing
            return ctx->model.mm_model_mlp_3_w->ne[1];
        case PROJECTOR_TYPE_QWEN2VL:
        case PROJECTOR_TYPE_QWEN25VL:
+        case PROJECTOR_TYPE_JANUS_PRO:
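+            // ne[0] of the last projector bias = LM embedding width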
+            return ctx->model.mm_1_b->ne[0];
        case PROJECTOR_TYPE_QWEN3VL:
            // main path + deepstack paths