// legacy
bool has_llava_projector = false;
int minicpmv_version = 0;
+ int32_t minicpmv_query_num = 0; // MiniCPM-V query number
};
struct clip_layer {
int n_embd = clip_n_mmproj_embd(ctx);
const int d_head = 128;
int n_head = n_embd/d_head;
- int num_query = 96;
- if (ctx->model.hparams.minicpmv_version == 2) {
- // MiniCPM-V 2.5
- num_query = 96;
- } else if (ctx->model.hparams.minicpmv_version == 3) {
- // MiniCPM-V 2.6
- num_query = 64;
- } else if (ctx->model.hparams.minicpmv_version == 4) {
- // MiniCPM-o 2.6
- num_query = 64;
- } else if (ctx->model.hparams.minicpmv_version == 5) {
- // MiniCPM-V 4.0
- num_query = 64;
- }
-
+ // Use actual config value if available, otherwise fall back to hardcoded values
+ int num_query = ctx->model.hparams.minicpmv_query_num;
ggml_tensor * Q = ggml_add(ctx0,
ggml_mul_mat(ctx0, model.mm_model_attn_q_w, q),
model.mm_model_attn_q_b);
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
-
+ get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false);
+ if (hparams.minicpmv_query_num == 0) {
+ // Fallback to hardcoded values for legacy models
+ if (hparams.minicpmv_version == 3) {
+ hparams.minicpmv_query_num = 64;
+ } else if (hparams.minicpmv_version == 4) {
+ hparams.minicpmv_query_num = 64;
+ } else if (hparams.minicpmv_version == 5) {
+ hparams.minicpmv_query_num = 64;
+ } else {
+ hparams.minicpmv_query_num = 96;
+ }
+ }
} else if (is_audio) {
get_u32(KEY_A_NUM_MEL_BINS, hparams.n_mel_bins);
} break;
case PROJECTOR_TYPE_MINICPMV:
{
- if (params.minicpmv_version == 2) {
- // MiniCPM-V 2.5
- n_patches_sq = 96;
- } else if (params.minicpmv_version == 3) {
- // MiniCPM-V 2.6
- n_patches_sq = 64;
- } else if (params.minicpmv_version == 4) {
- // MiniCPM-o 2.6
- n_patches_sq = 64;
- } else if (params.minicpmv_version == 5) {
- // MiniCPM-V 4.0
- n_patches_sq = 64;
+ // Use actual config value if available, otherwise fall back to hardcoded values
+ if (params.minicpmv_query_num > 0) {
+ n_patches_sq = params.minicpmv_query_num;
} else {
- GGML_ABORT("Unknown minicpmv version");
+ // Fallback to hardcoded values for legacy models
+ if (params.minicpmv_version == 2) {
+ n_patches_sq = 96;
+ } else if (params.minicpmv_version == 3) {
+ n_patches_sq = 64;
+ } else if (params.minicpmv_version == 4) {
+ n_patches_sq = 64;
+ } else if (params.minicpmv_version == 5) {
+ // MiniCPM-V 4.0
+ n_patches_sq = 64;
+ } else {
+ GGML_ABORT("Unknown minicpmv version");
+ }
}
} break;
case PROJECTOR_TYPE_QWEN2VL:
}
int clip_n_mmproj_embd(const struct clip_ctx * ctx) {
- const auto & hparams = ctx->model.hparams;
switch (ctx->model.proj_type) {
case PROJECTOR_TYPE_LDP:
return ctx->model.mm_model_block_1_block_2_1_b->ne[0];
case PROJECTOR_TYPE_MLP_NORM:
return ctx->model.mm_3_b->ne[0];
case PROJECTOR_TYPE_MINICPMV:
- if (hparams.minicpmv_version == 2) {
- // MiniCPM-V 2.5
- return 4096;
- } else if (hparams.minicpmv_version == 3) {
- // MiniCPM-V 2.6
- return 3584;
- } else if (hparams.minicpmv_version == 4) {
- // MiniCPM-o 2.6
- return 3584;
- } else if (hparams.minicpmv_version == 5) {
- // MiniCPM-V 4.0
- return 2560;
- }
- GGML_ABORT("Unknown minicpmv version");
+ return ctx->model.mm_model_proj->ne[0];
case PROJECTOR_TYPE_GLM_EDGE:
return ctx->model.mm_model_mlp_3_w->ne[1];
case PROJECTOR_TYPE_QWEN2VL:
# output in the same directory as the model if output_dir is None
dir_model = args.model_dir
+# Read config.json to get actual model configuration
+config_path = os.path.join(dir_model, "config.json")
+model_config = {}
+if os.path.isfile(config_path):
+ with open(config_path, "r", encoding="utf-8") as f:
+ model_config = json.load(f)
+ print(f"Loaded config from {config_path}")
+else:
+ print(f"Warning: config.json not found at {config_path}")
+
# If minicpmv_projector is not specified but the default path exists, use the default path
if args.minicpmv_projector is None:
default_projector_path = os.path.join(dir_model, "minicpmv.projector")
# processor = CLIPProcessor.from_pretrained(dir_model)
minicpmv_version = args.minicpmv_version
-emb_dim = 4096
-block_count = 26
-if minicpmv_version == 1: # MiniCPM-V 2.0
- emb_dim = 2304
- block_count = 26
-elif minicpmv_version == 2: # MiniCPM-V 2.5
- emb_dim = 4096
- block_count = 27
-elif minicpmv_version == 3: # MiniCPM-V 2.6
- emb_dim = 3584
- block_count = 27
-elif minicpmv_version == 4: # MiniCPM-o 2.6
- emb_dim = 3584
- block_count = 27
-elif minicpmv_version == 5: # MiniCPM-V 4.0
- emb_dim = 2560
- block_count = 27
-
-default_vision_config = {
- "hidden_size": 1152,
- "image_size": 980,
- "intermediate_size": 4304,
- "model_type": "idefics2",
- "num_attention_heads": 16,
- "num_hidden_layers": 27,
- "patch_size": 14,
+
+# Use actual config values instead of hardcoded ones
+if model_config:
+ # For the projector/resampler, use the main model's hidden_size
+ emb_dim = model_config.get("hidden_size", 1536)
+
+ # For the vision model, use vision_config values
+ vision_config_dict = model_config.get("vision_config", {})
+ default_vision_config = {
+ "hidden_size": vision_config_dict.get("hidden_size", 1152),
+ "image_size": vision_config_dict.get("image_size", 980),
+ "intermediate_size": vision_config_dict.get("intermediate_size", 4304),
+ "model_type": vision_config_dict.get("model_type", "siglip"),
+ "num_attention_heads": vision_config_dict.get("num_attention_heads", 16),
+ "num_hidden_layers": vision_config_dict.get("num_hidden_layers", 27),
+ "patch_size": vision_config_dict.get("patch_size", 14),
}
+ # Use vision model's num_hidden_layers for block_count
+ block_count = vision_config_dict.get("num_hidden_layers", 27)
+
+ print(f"Using config values: emb_dim={emb_dim}, block_count={block_count}")
+ print(f"Vision config: {default_vision_config}")
+else:
+ # Fallback to original hardcoded logic if config.json not found
+ emb_dim = 4096
+ block_count = 26
+ if minicpmv_version == 1:
+ emb_dim = 2304
+ block_count = 26
+ elif minicpmv_version == 2:
+ emb_dim = 4096
+ block_count = 27
+ elif minicpmv_version == 3:
+ emb_dim = 3584
+ block_count = 27
+ elif minicpmv_version == 4:
+ emb_dim = 3584
+ block_count = 27
+ elif minicpmv_version == 5:
+ emb_dim = 2560
+ block_count = 27
+
+ default_vision_config = {
+ "hidden_size": 1152,
+ "image_size": 980,
+ "intermediate_size": 4304,
+ "model_type": "idefics2",
+ "num_attention_heads": 16,
+ "num_hidden_layers": 27,
+ "patch_size": 14,
+ }
+
vision_config = Idefics2VisionConfig(**default_vision_config)
model = Idefics2VisionTransformer(vision_config)
-if minicpmv_version == 3:
+if minicpmv_version == 3 or (model_config and model_config.get("vision_config", {}).get("model_type") == "siglip"):
vision_config = SiglipVisionConfig(**default_vision_config)
model = SiglipVisionTransformer(vision_config)
elif minicpmv_version == 4:
fout.add_description("two-tower CLIP model")
if has_vision_encoder:
- # vision_model hparams
- fout.add_uint32("clip.vision.image_size", 448)
- fout.add_uint32("clip.vision.patch_size", 14)
- fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), 1152)
- fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), 4304)
+ # vision_model hparams - use actual config values
+ vision_image_size = model_config.get("image_size", 448) if model_config else 448
+ vision_patch_size = default_vision_config.get("patch_size", 14)
+ vision_hidden_size = default_vision_config.get("hidden_size", 1152)
+ vision_intermediate_size = default_vision_config.get("intermediate_size", 4304)
+ vision_attention_heads = default_vision_config.get("num_attention_heads", 16)
+
+ fout.add_uint32("clip.vision.image_size", vision_image_size)
+ fout.add_uint32("clip.vision.patch_size", vision_patch_size)
+ fout.add_uint32(add_key_str(KEY_EMBEDDING_LENGTH, VISION), vision_hidden_size)
+ fout.add_uint32(add_key_str(KEY_FEED_FORWARD_LENGTH, VISION), vision_intermediate_size)
fout.add_uint32("clip.vision.projection_dim", 0)
- fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), 16)
+ fout.add_uint32(add_key_str(KEY_ATTENTION_HEAD_COUNT, VISION), vision_attention_heads)
fout.add_float32(add_key_str(KEY_ATTENTION_LAYERNORM_EPS, VISION), 1e-6)
fout.add_uint32(add_key_str(KEY_BLOCK_COUNT, VISION), block_count)
+ # Add MiniCPM-V specific parameters
+ query_num = model_config.get("query_num", 0) if model_config else 0
+ resampler_emb_dim = model_config.get("hidden_size", 0) if model_config else 0
+ fout.add_uint32("clip.minicpmv_query_num", query_num)
+
if processor is not None:
image_mean = processor.image_processor.image_mean if args.image_mean is None or args.image_mean == default_image_mean else args.image_mean
image_std = processor.image_processor.image_std if args.image_std is None or args.image_std == default_image_std else args.image_std