@ModelBase.register("InternVisionModel")
class InternVisionModel(MmprojModel):
+
+    # Dynamic high-resolution tile-count bounds, read from the HF config keys
+    # "min_dynamic_patch" / "max_dynamic_patch" in __init__.
+    # 0 means the key was absent (older InternVL checkpoints do not ship it).
+    min_dynamic_tiles: int = 0
+    max_dynamic_tiles: int = 0
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        # default of 0 flags "not present" so set_gguf_parameters can skip the write
+        self.min_dynamic_tiles = self.global_config.get("min_dynamic_patch", 0)
+        self.max_dynamic_tiles = self.global_config.get("max_dynamic_patch", 0)
+
    def set_gguf_parameters(self):
        assert self.hparams_vision is not None
        if isinstance(self.hparams_vision['image_size'], list):
            downsample_ratio = self.global_config.get("downsample_ratio")
            assert downsample_ratio is not None
            self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
+        # older models may not have min/max_dynamic_patch in config; only emit
+        # the preproc tile-count KVs when the source config provided them — the
+        # C++ loader supplies its own defaults when these keys are missing
+        if self.min_dynamic_tiles > 0:
+            self.gguf_writer.add_vision_preproc_min_tiles(self.min_dynamic_tiles)
+        if self.max_dynamic_tiles > 0:
+            self.gguf_writer.add_vision_preproc_max_tiles(self.max_dynamic_tiles)
def tensor_force_quant(self, name, new_name, bid, n_dims):
if ".position_embd." in new_name:
IMAGE_SIZE = "clip.vision.image_size"
IMAGE_MIN_PIXELS = "clip.vision.image_min_pixels"
IMAGE_MAX_PIXELS = "clip.vision.image_max_pixels"
+ PREPROC_MIN_TILES = "clip.vision.preproc_min_tiles"
+ PREPROC_MAX_TILES = "clip.vision.preproc_max_tiles"
PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size"
PATCH_SIZE = "clip.vision.patch_size"
EMBEDDING_LENGTH = "clip.vision.embedding_length"
def add_vision_min_pixels(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.IMAGE_MIN_PIXELS, value)
+    def add_vision_preproc_max_tiles(self, value: int) -> None:
+        """Write clip.vision.preproc_max_tiles (upper bound on dynamic-HR tiles)."""
+        self.add_uint32(Keys.ClipVision.PREPROC_MAX_TILES, value)
+
+    def add_vision_preproc_min_tiles(self, value: int) -> None:
+        """Write clip.vision.preproc_min_tiles (lower bound on dynamic-HR tiles)."""
+        self.add_uint32(Keys.ClipVision.PREPROC_MIN_TILES, value)
+
def add_vision_preproc_image_size(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.PREPROC_IMAGE_SIZE, value)
#define KEY_IMAGE_SIZE "clip.vision.image_size"
#define KEY_IMAGE_MIN_PIXELS "clip.vision.image_min_pixels"
#define KEY_IMAGE_MAX_PIXELS "clip.vision.image_max_pixels"
+#define KEY_PREPROC_MIN_TILES "clip.vision.preproc_min_tiles"
+#define KEY_PREPROC_MAX_TILES "clip.vision.preproc_max_tiles"
#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
#define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
int32_t image_max_pixels = -1;
int32_t n_merge = 0; // number of patch merges **per-side**
+ int32_t preproc_min_tiles = 0;
+ int32_t preproc_max_tiles = 0;
+
float image_mean[3];
float image_std[3];
}
} break;
        case PROJECTOR_TYPE_INTERNVL:
+            {
+                // older InternVL conversions do not store the min/max tile
+                // counts; pre-seed the reference defaults (1..12 tiles) so the
+                // optional reads below can leave them untouched
+                hparams.preproc_min_tiles = 1;
+                hparams.preproc_max_tiles = 12;
+                get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
+                get_u32(KEY_PREPROC_MIN_TILES, hparams.preproc_min_tiles, false);
+                get_u32(KEY_PREPROC_MAX_TILES, hparams.preproc_max_tiles, false);
+                // sanity-check the bounds before deriving resolution candidates
+                GGML_ASSERT(hparams.preproc_min_tiles <= hparams.preproc_max_tiles && hparams.preproc_max_tiles < INT32_MAX);
+                set_internvl_dhr_res_candidates(model);
+            } break;
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
{
get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
}
}
}
+
+    static void set_internvl_dhr_res_candidates(clip_model & model) {
+        // populate hparams.image_res_candidates with the InternVL dynamic
+        // high-resolution grid sizes: a x b tiles of image_size pixels each,
+        // keeping the tile product a*b within [min_num, max_num]
+        auto & hparams = model.hparams;
+        int min_num = hparams.preproc_min_tiles;
+        int max_num = hparams.preproc_max_tiles;
+        if (min_num < 1) {
+            return; // avoid divide by 0
+        }
+        // NOTE(review): `a` starts at min_num and `b` is clamped to
+        // [min_num, max_num]; when min_num > 1 this drops grids such as 2x1
+        // whose tile count is still in range — confirm intended. Harmless for
+        // the default min_num == 1, where both clamps are no-ops.
+        for (int a = min_num; a <= max_num; ++a) {
+            int b_lo = (min_num + a - 1) / a; // ceil(min_num / a)
+            int b_hi = max_num / a;           // floor(max_num / a)
+            b_lo = std::max(b_lo, min_num);
+            b_hi = std::min(b_hi, max_num);
+            for (int b = b_lo; b <= b_hi; ++b) {
+                hparams.image_res_candidates.push_back(clip_image_size {
+                    a*hparams.image_size,
+                    b*hparams.image_size,
+                });
+            }
+        }
+    }
};
struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params) {
return res;
}
-static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
+// cut `img` into slices per `inst`; `overview_first` selects whether the
+// overview (whole image resized to overview_size) is placed before the
+// slices (true, the previous behavior) or after them (false — used by the
+// InternVL preprocessing path, which passes overview_first = false)
+static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst, bool overview_first = true) {
    std::vector<clip_image_u8_ptr> output;
    // resize to overview size
    clip_image_u8_ptr resized_img(clip_image_u8_init());
    img_tool::resize(*img, *resized_img, inst.overview_size, inst.interpolation_overview,
                     inst.padding_overview, inst.pad_color_overview);
-    output.push_back(std::move(resized_img));
+    if (overview_first) {
+        output.push_back(std::move(resized_img));
+    }
    if (inst.slices.empty()) {
        // no slices, just return the resized image
+        // (resized_img was not moved above on this path, so the move is safe)
+        if (!overview_first) {
+            output.push_back(std::move(resized_img));
+        }
        return output;
    }
        output.push_back(std::move(img_slice));
    }
+    // overview goes after the slices when requested
+    if (!overview_first) {
+        output.push_back(std::move(resized_img));
+    }
+
    return output;
}
res_imgs->grid_x = instructions.grid_size.width;
res_imgs->grid_y = instructions.grid_size.height;
} break;
+        case PROJECTOR_TYPE_INTERNVL: // support dynamic high-resolution
+            {
+                // candidates are filled by set_internvl_dhr_res_candidates at
+                // load time; slice with overview_first = false so the overview
+                // image ends up last, then normalize every resulting image
+                GGML_ASSERT(!params.image_res_candidates.empty());
+                auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
+                std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst, false);
+                for (size_t i = 0; i < imgs.size(); ++i) {
+                    clip_image_f32_ptr res(clip_image_f32_init());
+                    normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
+                    res_imgs->entries.push_back(std::move(res));
+                }
+            } break;
case PROJECTOR_TYPE_GLM_EDGE:
case PROJECTOR_TYPE_GEMMA3:
- case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution
case PROJECTOR_TYPE_NEMOTRON_V2_VL:
{
clip_image_u8 resized_image;
LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
return 1;
}
+    auto proj_type = clip_get_projector_type(ctx_clip);
    int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
    ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
    bool ok = false;
    if (clip_is_llava(ctx_clip)
        || clip_is_minicpmv(ctx_clip)
-        || clip_is_glm(ctx_clip)) {
+        || clip_is_glm(ctx_clip)
+        // InternVL now yields a multi-image batch (slices + overview) which,
+        // like the projectors above, must be encoded one entry at a time
+        || proj_type == PROJECTOR_TYPE_INTERNVL) {
        // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
        const auto & entries = image_tokens->batch_f32.entries;
        for (size_t i = 0; i < entries.size(); i++) {