#define KEY_LAYER_NORM_EPS "clip.%s.attention.layer_norm_epsilon" // contains a %s format placeholder; presumably filled with the modality prefix - TODO confirm
// vision-specific
+#define KEY_VISION_PROJ_TYPE "clip.vision.projector_type" // for models with mixed modalities: vision projector type, read only when the default projector-type key yields an empty string
#define KEY_IMAGE_SIZE "clip.vision.image_size"
#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
#define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_MINICPMV_QUERY_NUM "clip.minicpmv_query_num"
// audio-specific
+#define KEY_AUDIO_PROJ_TYPE "clip.audio.projector_type" // for models with mixed modalities: audio projector type, read only when the default projector-type key yields an empty string
#define KEY_A_NUM_MEL_BINS "clip.audio.num_mel_bins"
#define KEY_A_PROJ_STACK_FACTOR "clip.audio.projector.stack_factor"
// projector type
std::string proj_type;
{
+ // default key
get_string(KEY_PROJ_TYPE, proj_type, false);
- if (!proj_type.empty()) {
- model.proj_type = clip_projector_type_from_string(proj_type);
+
+ // for models with mixed modalities
+ if (proj_type.empty()) {
+ if (modality == CLIP_MODALITY_VISION) {
+ get_string(KEY_VISION_PROJ_TYPE, proj_type, false);
+ } else if (modality == CLIP_MODALITY_AUDIO) {
+ get_string(KEY_AUDIO_PROJ_TYPE, proj_type, false);
+ } else {
+ GGML_ABORT("unknown modality");
+ }
}
+
+ model.proj_type = clip_projector_type_from_string(proj_type);
+
if (model.proj_type == PROJECTOR_TYPE_UNKNOWN) {
throw std::runtime_error(string_format("%s: unknown projector type: %s\n", __func__, proj_type.c_str()));
}
- // correct arch for multimodal models
+ // correct arch for multimodal models (legacy method)
if (model.proj_type == PROJECTOR_TYPE_QWEN25O) {
model.proj_type = modality == CLIP_MODALITY_VISION
? PROJECTOR_TYPE_QWEN25VL