]> git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
mtmd: Add dynamic high-resolution image preprocessing for InternVL model (#20847)
authorbssrdf <redacted>
Mon, 23 Mar 2026 00:06:30 +0000 (20:06 -0400)
committerGitHub <redacted>
Mon, 23 Mar 2026 00:06:30 +0000 (01:06 +0100)
* added support for internvl's dynamic high-resolution (Qianfan-OCR needed)

* add min/max dynamic patch to gguf meta

* clean up

* simplified handling of min/max dynamic patch

* reuse llava_uhd logic for slicing images

* provide default values for older models

* flake8

* prevent writing 0 value to gguf

* remove duplicated resolution candidates with a better algorithm

* fix indentation

* format

* add protection from divide by zero

* change to 0 to be safe

---------

Co-authored-by: Xuan Son Nguyen <redacted>
convert_hf_to_gguf.py
gguf-py/gguf/constants.py
gguf-py/gguf/gguf_writer.py
tools/mtmd/clip-impl.h
tools/mtmd/clip-model.h
tools/mtmd/clip.cpp
tools/mtmd/mtmd.cpp

index dba190b48065f2904b10580a3cb0fc038c461b67..0cd47645d3a60683278e91252ab5ad54f684936e 100755 (executable)
@@ -4273,6 +4273,16 @@ class Qwen25OmniModel(Qwen2VLVisionModel):
 
 @ModelBase.register("InternVisionModel")
 class InternVisionModel(MmprojModel):
+
+    min_dynamic_tiles: int = 0
+    max_dynamic_tiles: int = 0
+
+    def __init__(self, *args, **kwargs):
+        super().__init__(*args, **kwargs)
+        assert self.hparams_vision is not None
+        self.min_dynamic_tiles = self.global_config.get("min_dynamic_patch", 0)
+        self.max_dynamic_tiles = self.global_config.get("max_dynamic_patch", 0)
+
     def set_gguf_parameters(self):
         assert self.hparams_vision is not None
         if isinstance(self.hparams_vision['image_size'], list):
@@ -4295,6 +4305,11 @@ class InternVisionModel(MmprojModel):
         downsample_ratio = self.global_config.get("downsample_ratio")
         assert downsample_ratio is not None
         self.gguf_writer.add_vision_projector_scale_factor(int(1.0 / downsample_ratio))
+        # older models may not have min/max_dynamic_patch in config
+        if self.min_dynamic_tiles > 0:
+            self.gguf_writer.add_vision_preproc_min_tiles(self.min_dynamic_tiles)
+        if self.max_dynamic_tiles > 0:
+            self.gguf_writer.add_vision_preproc_max_tiles(self.max_dynamic_tiles)
 
     def tensor_force_quant(self, name, new_name, bid, n_dims):
         if ".position_embd." in new_name:
index c5f92c77004916fcc861bee93b1a31d54c0a4dc5..9383644abfc751b0d311529aba145fa9ed67bd56 100644 (file)
@@ -301,6 +301,8 @@ class Keys:
         IMAGE_SIZE          = "clip.vision.image_size"
         IMAGE_MIN_PIXELS    = "clip.vision.image_min_pixels"
         IMAGE_MAX_PIXELS    = "clip.vision.image_max_pixels"
+        PREPROC_MIN_TILES   = "clip.vision.preproc_min_tiles"
+        PREPROC_MAX_TILES   = "clip.vision.preproc_max_tiles"
         PREPROC_IMAGE_SIZE  = "clip.vision.preproc_image_size"
         PATCH_SIZE          = "clip.vision.patch_size"
         EMBEDDING_LENGTH    = "clip.vision.embedding_length"
index 5f653d386d0ea651ca86a80e950a698027b37a7a..010dfeea1c222142c499ca749a67d699bc859abc 100644 (file)
@@ -1156,6 +1156,12 @@ class GGUFWriter:
     def add_vision_min_pixels(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.IMAGE_MIN_PIXELS, value)
 
+    def add_vision_preproc_max_tiles(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.PREPROC_MAX_TILES, value)
+
+    def add_vision_preproc_min_tiles(self, value: int) -> None:
+        self.add_uint32(Keys.ClipVision.PREPROC_MIN_TILES, value)
+
     def add_vision_preproc_image_size(self, value: int) -> None:
         self.add_uint32(Keys.ClipVision.PREPROC_IMAGE_SIZE, value)
 
index 3eb66f91454c4b6ae6fc7bc9527a49d9a3eaeb5e..bf55cec7efc4b400fd5156deee7dbfcab5802834 100644 (file)
@@ -38,6 +38,8 @@
 #define KEY_IMAGE_SIZE          "clip.vision.image_size"
 #define KEY_IMAGE_MIN_PIXELS    "clip.vision.image_min_pixels"
 #define KEY_IMAGE_MAX_PIXELS    "clip.vision.image_max_pixels"
+#define KEY_PREPROC_MIN_TILES   "clip.vision.preproc_min_tiles"
+#define KEY_PREPROC_MAX_TILES   "clip.vision.preproc_max_tiles"
 #define KEY_PREPROC_IMAGE_SIZE  "clip.vision.preproc_image_size"
 #define KEY_PATCH_SIZE          "clip.vision.patch_size"
 #define KEY_IMAGE_MEAN          "clip.vision.image_mean"
index eeb8da58e0829160c1e36ac00266a5977bdc5805..265a17130f3e36715e5bc1fdadc3a2d7ebc9483f 100644 (file)
@@ -42,6 +42,9 @@ struct clip_hparams {
     int32_t image_max_pixels = -1;
     int32_t n_merge = 0; // number of patch merges **per-side**
 
+    int32_t preproc_min_tiles = 0;
+    int32_t preproc_max_tiles = 0;
+
     float image_mean[3];
     float image_std[3];
 
index 5fcc7c5b592151ca6d25e3f548235f2ab32e9702..a47f1f495d2f195e234e81cb0b76d819e069d5db 100644 (file)
@@ -1138,6 +1138,16 @@ struct clip_model_loader {
                         }
                     } break;
                 case PROJECTOR_TYPE_INTERNVL:
+                    {
+                        // older version of internvl doesn't have min/max tiles, we need to provide default values for them to avoid issues
+                        hparams.preproc_min_tiles = 1;
+                        hparams.preproc_max_tiles = 12;
+                        get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
+                        get_u32(KEY_PREPROC_MIN_TILES, hparams.preproc_min_tiles, false);
+                        get_u32(KEY_PREPROC_MAX_TILES, hparams.preproc_max_tiles, false);
+                        GGML_ASSERT(hparams.preproc_min_tiles <= hparams.preproc_max_tiles && hparams.preproc_max_tiles < INT32_MAX);
+                        set_internvl_dhr_res_candidates(model);
+                    } break;
                 case PROJECTOR_TYPE_NEMOTRON_V2_VL:
                     {
                         get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
@@ -2188,6 +2198,27 @@ struct clip_model_loader {
             }
         }
     }
+
+    static void set_internvl_dhr_res_candidates(clip_model & model) {
+        auto & hparams = model.hparams;
+        int min_num = hparams.preproc_min_tiles;
+        int max_num = hparams.preproc_max_tiles;
+        if (min_num < 1) {
+           return; // avoid  divide by 0
+        }
+        for (int a = min_num; a <= max_num; ++a) {
+            int b_lo = (min_num + a - 1) / a;
+            int b_hi = max_num / a;
+            b_lo = std::max(b_lo, min_num);
+            b_hi = std::min(b_hi, max_num);
+            for (int b = b_lo; b <= b_hi; ++b) {
+                hparams.image_res_candidates.push_back(clip_image_size {
+                    a*hparams.image_size,
+                    b*hparams.image_size,
+                });
+            }
+        }
+    }
 };
 
 struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params) {
@@ -2734,17 +2765,22 @@ struct llava_uhd {
         return res;
     }
 
-    static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst) {
+    static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst, bool overview_first = true) {
         std::vector<clip_image_u8_ptr> output;
 
         // resize to overview size
         clip_image_u8_ptr resized_img(clip_image_u8_init());
         img_tool::resize(*img, *resized_img, inst.overview_size, inst.interpolation_overview,
                          inst.padding_overview, inst.pad_color_overview);
-        output.push_back(std::move(resized_img));
+        if (overview_first) {
+            output.push_back(std::move(resized_img));
+        }
 
         if (inst.slices.empty()) {
             // no slices, just return the resized image
+            if (!overview_first) {
+                output.push_back(std::move(resized_img));
+            }
             return output;
         }
 
@@ -2765,6 +2801,10 @@ struct llava_uhd {
             output.push_back(std::move(img_slice));
         }
 
+        if (!overview_first) {
+            output.push_back(std::move(resized_img));
+        }
+
         return output;
     }
 
@@ -3149,10 +3189,20 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
                 res_imgs->grid_x = instructions.grid_size.width;
                 res_imgs->grid_y = instructions.grid_size.height;
             } break;
+        case PROJECTOR_TYPE_INTERNVL: // support dynamic high-resolution
+            {
+                GGML_ASSERT(!params.image_res_candidates.empty());
+                auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
+                std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst, false);
 
+                for (size_t i = 0; i < imgs.size(); ++i) {
+                    clip_image_f32_ptr res(clip_image_f32_init());
+                    normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
+                    res_imgs->entries.push_back(std::move(res));
+                }
+            } break;
         case PROJECTOR_TYPE_GLM_EDGE:
         case PROJECTOR_TYPE_GEMMA3:
-        case PROJECTOR_TYPE_INTERNVL: // TODO @ngxson : support dynamic resolution
         case PROJECTOR_TYPE_NEMOTRON_V2_VL:
             {
                 clip_image_u8 resized_image;
index f66c07345ed3dc424069dca612aaa7abcf4092c3..456ce7b73c82556ed45164b802010e9247a3ef41 100644 (file)
@@ -851,13 +851,15 @@ int32_t mtmd_encode(mtmd_context * ctx, const mtmd_image_tokens * image_tokens)
         LOG_ERR("%s: this API does not support non-vision input, please use mtmd_encode_chunk instead\n", __func__);
         return 1;
     }
+    auto proj_type = clip_get_projector_type(ctx_clip);
     int n_mmproj_embd = clip_n_mmproj_embd(ctx_clip);
     ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
     bool ok = false;
 
     if (clip_is_llava(ctx_clip)
         || clip_is_minicpmv(ctx_clip)
-        || clip_is_glm(ctx_clip)) {
+        || clip_is_glm(ctx_clip)
+        || proj_type == PROJECTOR_TYPE_INTERNVL) {
         // TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
         const auto & entries = image_tokens->batch_f32.entries;
         for (size_t i = 0; i < entries.size(); i++) {