mtmd : refactor llava-uhd preprocessing logic (#14247)

author Xuan-Son Nguyen <redacted>

Wed, 18 Jun 2025 08:43:57 +0000 (10:43 +0200)

committer GitHub <redacted>

Wed, 18 Jun 2025 08:43:57 +0000 (10:43 +0200)
author Xuan-Son Nguyen <redacted>
Wed, 18 Jun 2025 08:43:57 +0000 (10:43 +0200)
committer GitHub <redacted>
Wed, 18 Jun 2025 08:43:57 +0000 (10:43 +0200)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp

index c25bacc17769b2564c84cf9155143097772582bf..30283d6f1f032e70a2fe3a4797fedc0566ae6c5b 100644 (file)
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -187,7 +187,7 @@ struct clip_hparams {
      float eps = 1e-6;
      float rope_theta = 0.0;
  
-    std::vector<int32_t> image_grid_pinpoints;
+    std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
      int32_t image_crop_resolution;
      std::unordered_set<int32_t> vision_feature_layer;
      int32_t attn_window_size = 0;
@@ -2109,8 +2109,7 @@ struct clip_model_loader {
              if (is_vision) {
                  get_u32(KEY_IMAGE_SIZE, hparams.image_size);
                  get_u32(KEY_PATCH_SIZE, hparams.patch_size);
-                get_u32(KEY_IMAGE_CROP_RESOLUTION,    hparams.image_crop_resolution, false);
-                get_arr_int(KEY_IMAGE_GRID_PINPOINTS, hparams.image_grid_pinpoints, false);
+                get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
                  get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
  
              } else if (is_audio) {
@@ -2120,6 +2119,20 @@ struct clip_model_loader {
                  GGML_ASSERT(false && "unknown modality");
              }
  
+            // for pinpoints, we need to convert it into a list of resolution candidates
+            {
+                std::vector<int> pinpoints;
+                get_arr_int(KEY_IMAGE_GRID_PINPOINTS, pinpoints, false);
+                // step over complete pairs only: an odd-length array must not make
+                // pinpoints[i+1] read past the end (the unpaired value is ignored)
+                for (size_t i = 0; i + 1 < pinpoints.size(); i += 2) {
+                    hparams.image_res_candidates.push_back({
+                        pinpoints[i],
+                        pinpoints[i+1],
+                    });
+                }
+            }
+
              // default warmup value
              hparams.warmup_image_size = hparams.image_size;
  
@@ -2231,16 +2244,7 @@ struct clip_model_loader {
                      {
                          hparams.rope_theta = 10000.0f;
                          get_u32(KEY_PROJ_SCALE_FACTOR, hparams.proj_scale_factor);
-
-                        // borrowed from llava-1.6
-                        const int isize = hparams.image_size;
-                        hparams.image_grid_pinpoints = {
-                            isize,   isize*2, // 336, 672
-                            isize*2, isize,   // 672, 336
-                            isize*2, isize*2, // 672, 672
-                            isize*3, isize,   // 1008, 336
-                            isize,   isize*3, // 336, 1008
-                        };
+                        set_llava_uhd_res_candidates(model, 3);
                      } break;
                  case PROJECTOR_TYPE_ULTRAVOX:
                  case PROJECTOR_TYPE_QWEN2A:
@@ -2674,6 +2678,21 @@ struct clip_model_loader {
              output[i] = values[i];
          }
      }
+
+    // Replace hparams.image_res_candidates with the llava-uhd grid of resolutions:
+    // every (x * image_size, y * image_size) for x, y in [1, max_patches_per_side],
+    // except the 1x1 entry.
+    //
+    // The list is cleared first so that this function really "sets" the candidates:
+    // the code it replaces assigned the pinpoint list outright, whereas appending
+    // would keep (and mix in) any entries already loaded from KEY_IMAGE_GRID_PINPOINTS.
+    void set_llava_uhd_res_candidates(clip_model & model, const int max_patches_per_side) {
+        auto & hparams = model.hparams;
+        hparams.image_res_candidates.clear();
+        for (int x = 1; x <= max_patches_per_side; x++) {
+            for (int y = 1; y <= max_patches_per_side; y++) {
+                if (x == 1 && y == 1) {
+                    continue; // 1x1 is a single tile of image_size; it is not added as a candidate
+                }
+                hparams.image_res_candidates.push_back(clip_image_size{
+                    x*hparams.image_size,
+                    y*hparams.image_size,
+                });
+            }
+        }
+    }
  };
  
  struct clip_init_result clip_init(const char * fname, struct clip_context_params ctx_params) {
@@ -3028,36 +3047,41 @@ struct llava_uhd {
          bool padding_refined = false;  // if true, refine image will be padded to the grid size (e.g. llava-1.6)
      };
  
-    static int get_max_slices(struct clip_ctx * ctx) {
-        if (clip_is_minicpmv(ctx)) {
-            return 9;
-        }
-        return 0;
-    }
-
      static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
          slice_instructions res;
          const int patch_size      = clip_get_patch_size(ctx);
          const int slice_size      = clip_get_image_size(ctx);
-        const int max_slice_nums  = get_max_slices(ctx);
          const int original_width  = original_size.width;
          const int original_height = original_size.height;
-        const float log_ratio = log((float)original_width / original_height);
-        const float ratio = (float)original_width * original_height / (slice_size * slice_size);
-        const int multiple = fmin(ceil(ratio), max_slice_nums);
-        const bool has_slices = (multiple > 1);
-        const bool has_pinpoints = !ctx->model.hparams.image_grid_pinpoints.empty();
+
+        const bool has_slices    = original_size.width > slice_size || original_size.height > slice_size;
+        const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();
+
+        if (!has_slices) {
+            // skip slicing logic
+            res.overview_size = clip_image_size{slice_size, slice_size};
+            res.refined_size  = clip_image_size{0, 0};
+            res.grid_size     = clip_image_size{0, 0};
+
+            return res;
+        }
  
          if (has_pinpoints) {
              // has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
              auto refine_size = llava_uhd::select_best_resolution(
-                ctx->model.hparams.image_grid_pinpoints,
-                original_size);
+                original_size,
+                ctx->model.hparams.image_res_candidates);
              res.overview_size   = clip_image_size{slice_size, slice_size};
              res.refined_size    = refine_size;
              res.grid_size       = clip_image_size{0, 0};
              res.padding_refined = true;
  
+            LOG_DBG("%s: using pinpoints for slicing\n", __func__);
+            LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n",
+                    __func__, original_width, original_height,
+                    res.overview_size.width, res.overview_size.height,
+                    res.refined_size.width,  res.refined_size.height);
+
              for (int y = 0; y < refine_size.height; y += slice_size) {
                  for (int x = 0; x < refine_size.width; x += slice_size) {
                      slice_coordinates slice;
@@ -3066,13 +3090,16 @@ struct llava_uhd {
                      slice.size.width  = std::min(slice_size, refine_size.width  - x);
                      slice.size.height = std::min(slice_size, refine_size.height - y);
                      res.slices.push_back(slice);
-                    if (x == 0) {
-                        res.grid_size.width++;
-                    }
+                    LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
+                            __func__, (int)res.slices.size() - 1,
+                            slice.x, slice.y, slice.size.width, slice.size.height);
                  }
-                res.grid_size.height++;
              }
  
+            res.grid_size.height = refine_size.height / slice_size;
+            res.grid_size.width  = refine_size.width  / slice_size;
+            LOG_DBG("%s: grid size: %d x %d\n", __func__, res.grid_size.width, res.grid_size.height);
+
              return res;
          }
  
@@ -3081,17 +3108,23 @@ struct llava_uhd {
          auto best_size    = get_best_resize(original_size, slice_size, patch_size, !has_slices);
          res.overview_size = best_size;
  
-        if (!has_slices) {
-            // skip slicing logic
-            res.refined_size = clip_image_size{0, 0};
-            res.grid_size    = clip_image_size{0, 0};
+        {
+            const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it
+            const float log_ratio = log((float)original_width / original_height);
+            const float ratio = (float)original_width * original_height / (slice_size * slice_size);
+            const int multiple = fmin(ceil(ratio), max_slice_nums);
  
-        } else {
              auto best_grid   = get_best_grid(max_slice_nums, multiple, log_ratio);
              auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
              res.grid_size    = best_grid;
              res.refined_size = refine_size;
  
+            LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
+                    __func__, original_width, original_height,
+                    res.overview_size.width, res.overview_size.height,
+                    res.refined_size.width, res.refined_size.height,
+                    res.grid_size.width, res.grid_size.height);
+
              int width  = refine_size.width;
              int height = refine_size.height;
              int grid_x = int(width  / best_grid.width);
@@ -3108,7 +3141,9 @@ struct llava_uhd {
                      slice.size.width  = grid_x;
                      slice.size.height = grid_y;
                      res.slices.push_back(slice);
-                    // LOG_INF("slice %d: %d %d %d %d\n", ic, patches_i, patches_j, grid_x, grid_y);
+                    LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
+                            __func__, (int)res.slices.size() - 1,
+                            slice.x, slice.y, slice.size.width, slice.size.height);
                  }
              }
          }
@@ -3166,48 +3201,55 @@ private:
          return res;
      }
  
+    // Scale `orig` by a single factor so that it fits inside `target_max` while
+    // keeping its aspect ratio. The factor is the smaller of the per-axis ratios
+    // target/orig; the scaled dimensions are truncated to int.
+    static clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max) {
+        const float fit_w = static_cast<float>(target_max.width)  / orig.width;
+        const float fit_h = static_cast<float>(target_max.height) / orig.height;
+        // same selection as std::min(fit_w, fit_h)
+        const float factor = (fit_h < fit_w) ? fit_h : fit_w;
+
+        clip_image_size fitted;
+        fitted.width  = static_cast<int>(orig.width  * factor);
+        fitted.height = static_cast<int>(orig.height * factor);
+        return fitted;
+    }
+
      /**
       * Selects the best resolution from a list of possible resolutions based on the original size.
       *
+     * For example, when given a list of resolutions:
+     *  - 100x100
+     *  - 200x100
+     *  - 100x200
+     *  - 200x200
+     *
+     * And an input image of size 111x200, then 200x200 is the best fit: it is the only
+     * candidate that holds the image without downscaling, so it keeps the highest
+     * effective resolution. Wasted area only breaks ties between candidates with the
+     * same effective resolution.
+     *
       * @param original_size The original size of the image
       * @param possible_resolutions A list of possible resolutions
       * @return The best fit resolution
+     *
+     * NOTE(review): when possible_resolutions is empty the loop never assigns best_fit,
+     * so the default-constructed value is returned as-is - confirm that callers never
+     * pass an empty list.
       */
      static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) {
-        int original_width = original_size.width;
-        int original_height = original_size.height;
          clip_image_size best_fit;
+        int min_wasted_area = std::numeric_limits<int>::max();
          int max_effective_resolution = 0;
-        int min_wasted_resolution = std::numeric_limits<int>::max();
-
-        for (const auto & resolution : possible_resolutions) {
-            int width  = resolution.width;
-            int height = resolution.height;
-            float scale = std::min(static_cast<float>(width) / original_width, static_cast<float>(height) / original_height);
-            int downscaled_width  = static_cast<int>(original_width * scale);
-            int downscaled_height = static_cast<int>(original_height * scale);
-            int effective_resolution = std::min(downscaled_width * downscaled_height, original_width * original_height);
-            int wasted_resolution = (width * height) - effective_resolution;
-            // LOG_INF("resolution: %d %d, scale: %f, downscaled: %d %d, effective: %d, wasted: %d\n", width, height, scale, downscaled_width, downscaled_height, effective_resolution, wasted_resolution);
-            if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_resolution < min_wasted_resolution)) {
+
+        for (const clip_image_size & candidate : possible_resolutions) {
+            auto target_size = resize_maintain_aspect_ratio(original_size, candidate);
+            int effective_resolution = std::min(
+                target_size.width * target_size.height,
+                original_size.width * original_size.height);
+            int wasted_area = (candidate.width * candidate.height) - effective_resolution;
+
+            if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_area < min_wasted_area)) {
                  max_effective_resolution = effective_resolution;
-                min_wasted_resolution = wasted_resolution;
-                best_fit = resolution;
+                min_wasted_area = wasted_area;
+                best_fit = candidate;
              }
+
+            LOG_DBG("%s: candidate: %d x %d, target: %d x %d, wasted: %d, effective: %d\n", __func__, candidate.width, candidate.height, target_size.width, target_size.height, wasted_area, effective_resolution);
          }
  
          return best_fit;
      }
  
-    // used by llava 1.6 with custom list of pinpoints
-    static clip_image_size select_best_resolution(const std::vector<int32_t> & pinpoints, const clip_image_size & original_size) {
-        std::vector<clip_image_size> possible_resolutions; // TODO @ngxson : construct this inside hparams, not here
-        for (size_t i = 0; i < pinpoints.size(); i += 2) {
-            possible_resolutions.push_back(clip_image_size{pinpoints[i], pinpoints[i+1]});
-        }
-        return select_best_resolution(original_size, possible_resolutions);
-    }
-
      static int ensure_divide(int length, int patch_size) {
          return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
      }
@@ -3331,7 +3373,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
          return true;
  
      } else if (ctx->proj_type() == PROJECTOR_TYPE_LLAMA4) {
-        GGML_ASSERT(!params.image_grid_pinpoints.empty());
+        GGML_ASSERT(!params.image_res_candidates.empty());
          auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
          std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
  
@@ -3371,7 +3413,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
          res_imgs->entries.push_back(std::move(res));
          return true;
  
-    } else if (!params.image_grid_pinpoints.empty()) {
+    } else if (!params.image_res_candidates.empty()) {
          // "spatial_unpad" with "anyres" processing for llava-1.6
          auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
          std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
@@ -3431,17 +3473,6 @@ const char * clip_patch_merge_type(const struct clip_ctx * ctx) {
      return ctx->model.hparams.mm_patch_merge_type == PATCH_MERGE_SPATIAL_UNPAD ? "spatial_unpad" : "flat";
  }
  
-const int32_t * clip_image_grid(const struct clip_ctx * ctx) {
-    if (ctx->model.hparams.image_grid_pinpoints.size()) {
-        return &ctx->model.hparams.image_grid_pinpoints.front();
-    }
-    return nullptr;
-}
-
-size_t get_clip_image_grid_size(const struct clip_ctx * ctx) {
-    return ctx->model.hparams.image_grid_pinpoints.size();
-}
-
  int clip_n_output_tokens_x(const struct clip_ctx * ctx, struct clip_image_f32 * img) {
      const auto & params = ctx->model.hparams;
      const int n_total = clip_n_output_tokens(ctx, img);
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h

index cb2eb261fe2e8b5171ca026ca498cacd0d565e26..08f3efb7b1dafaff7aced494a1dc462e034c7904 100644 (file)
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -46,9 +46,6 @@ int32_t clip_get_hidden_size(const struct clip_ctx * ctx);
  // TODO: should be enum, not string
  const char * clip_patch_merge_type(const struct clip_ctx * ctx);
  
-const int32_t * clip_image_grid(const struct clip_ctx * ctx);
-size_t get_clip_image_grid_size(const struct clip_ctx * ctx);
-
  int clip_n_output_tokens(const struct clip_ctx * ctx, struct clip_image_f32 * img);
  
  // for M-RoPE, this will be the number of token positions in X and Y directions
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp

index 8573f11437f1bc6f97affd3861ac75060c4d011c..e3829738338c3dfa88a2cc21d715d2f45b4a4737 100644 (file)
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -501,7 +501,10 @@ struct mtmd_tokenizer {
                  || ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
                  || ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
              ) {
+                const int n_col = batch_f32.grid_x;
+                const int n_row = batch_f32.grid_y;
                  // split batch into chunks of single images
+                // NOTE: batch_f32 will be invalidated after this call
                  auto chunks = split_batch_to_chunk(std::move(batch_f32), bitmap->id);
                  GGML_ASSERT(chunks.size() > 0);
  
@@ -521,8 +524,7 @@ struct mtmd_tokenizer {
  
                  // add slices (or tiles)
                  if (!chunks.empty()) {
-                    const int n_col = batch_f32.grid_x;
-                    const int n_row = batch_f32.grid_y;
+                    GGML_ASSERT((int)chunks.size() == n_row * n_col);
                      if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
                          add_text({ctx->tok_slices_start});
                      }
author	Xuan-Son Nguyen <redacted>
	Wed, 18 Jun 2025 08:43:57 +0000 (10:43 +0200)
committer	GitHub <redacted>
	Wed, 18 Jun 2025 08:43:57 +0000 (10:43 +0200)
tools/mtmd/clip.cpp		patch \| blob \| history
tools/mtmd/clip.h		patch \| blob \| history
tools/mtmd/mtmd.cpp		patch \| blob \| history