mtmd : fix idefics3 preprocessing (#16806)

author Xuan-Son Nguyen <redacted>

Mon, 27 Oct 2025 22:12:16 +0000 (23:12 +0100)

committer GitHub <redacted>

Mon, 27 Oct 2025 22:12:16 +0000 (23:12 +0100)
author Xuan-Son Nguyen <redacted>
Mon, 27 Oct 2025 22:12:16 +0000 (23:12 +0100)
committer GitHub <redacted>
Mon, 27 Oct 2025 22:12:16 +0000 (23:12 +0100)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp

index 08167625302c63ccc339e67eee9bb82937d06f93..b44f0a3a28ad2842b437e809a48c167909af7eb4 100644 (file)
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -171,7 +171,7 @@ struct clip_hparams {
      int32_t n_head;
      int32_t n_layer;
      // idefics3
-    int32_t preproc_image_size = 0;
+    int32_t preproc_image_size = 0; // aka max_dimension
      int32_t proj_scale_factor = 0;
  
      float image_mean[3];
@@ -3221,8 +3221,8 @@ struct image_manipulation {
              return {0, 0};
          }
  
-        float scale = std::min(1.0f, std::min(static_cast<float>(max_dimension) / inp_size.width,
-                                              static_cast<float>(max_dimension) / inp_size.height));
+        float scale = std::min(static_cast<float>(max_dimension) / inp_size.width,
+                               static_cast<float>(max_dimension) / inp_size.height);
  
          float target_width_f  = static_cast<float>(inp_size.width)  * scale;
          float target_height_f = static_cast<float>(inp_size.height) * scale;
@@ -3385,7 +3385,7 @@ struct llava_uhd {
  
          // resize to overview size
          clip_image_u8_ptr resized_img(clip_image_u8_init());
-        image_manipulation::bicubic_resize(*img, *resized_img, inst.overview_size.width, inst.overview_size.height);
+        image_manipulation::resize_and_pad_image(*img, *resized_img, inst.overview_size);
          output.push_back(std::move(resized_img));
          if (inst.slices.empty()) {
              // no slices, just return the resized image
@@ -3587,6 +3587,9 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
          // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
          const clip_image_size refined_size = image_manipulation::calc_size_preserved_ratio(
              original_size, params.image_size, params.preproc_image_size);
+        // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n",
+        //         __func__, original_size.width, original_size.height,
+        //         refined_size.width, refined_size.height);
  
          llava_uhd::slice_instructions instructions;
          instructions.overview_size = clip_image_size{params.image_size, params.image_size};
@@ -3597,6 +3600,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
          };
          for (int y = 0; y < refined_size.height; y += params.image_size) {
              for (int x = 0; x < refined_size.width; x += params.image_size) {
+                // LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y);
                  instructions.slices.push_back(llava_uhd::slice_coordinates{
                      /* x    */x,
                      /* y    */y,
diff --git a/tools/mtmd/tests.sh b/tools/mtmd/tests.sh

index 5e33d127649a0bbd1993bf919a9ed9924b7c74b4..c2270746360ec35eed3f0a6c69d86a92feccce36 100755 (executable)
--- a/tools/mtmd/tests.sh
+++ b/tools/mtmd/tests.sh
@@ -139,7 +139,10 @@ for i in "${!arr_hf[@]}"; do
  
      echo "$output" > $SCRIPT_DIR/output/$bin-$(echo "$hf" | tr '/' '-').log
  
-    if echo "$output" | grep -iq "new york"; then
+    # either contains "new york" or both "men" and "walk"
+    if echo "$output" | grep -iq "new york" \
+            || (echo "$output" | grep -iq "men" && echo "$output" | grep -iq "walk")
+    then
          result="$prefix \033[32mOK\033[0m:   $bin $hf"
      else
          result="$prefix \033[31mFAIL\033[0m: $bin $hf"
author	Xuan-Son Nguyen <redacted>
	Mon, 27 Oct 2025 22:12:16 +0000 (23:12 +0100)
committer	GitHub <redacted>
	Mon, 27 Oct 2025 22:12:16 +0000 (23:12 +0100)
tools/mtmd/clip.cpp		patch \| blob \| history
tools/mtmd/tests.sh		patch \| blob \| history