mtmd: refactor image preprocessing (#21031)

author Xuan-Son Nguyen <redacted>

Thu, 26 Mar 2026 18:49:20 +0000 (19:49 +0100)

committer GitHub <redacted>

Thu, 26 Mar 2026 18:49:20 +0000 (19:49 +0100)
author Xuan-Son Nguyen <redacted>
Thu, 26 Mar 2026 18:49:20 +0000 (19:49 +0100)
committer GitHub <redacted>
Thu, 26 Mar 2026 18:49:20 +0000 (19:49 +0100)
diff --git a/tools/mtmd/CMakeLists.txt b/tools/mtmd/CMakeLists.txt

index 71ad149ad3b01b910de432c221b62ca1330ab6dc..b3cf15f9ecc6b12edbf2a4ac78ea284163cf7a24 100644 (file)
--- a/tools/mtmd/CMakeLists.txt
+++ b/tools/mtmd/CMakeLists.txt
@@ -5,6 +5,7 @@ find_package(Threads REQUIRED)
  add_library(mtmd
              mtmd.cpp
              mtmd-audio.cpp
+            mtmd-image.cpp
              mtmd.h
              mtmd-helper.cpp
              mtmd-helper.h
diff --git a/tools/mtmd/clip-impl.h b/tools/mtmd/clip-impl.h

index 4bf34e65bcdd7530445f18c55879258f3c5eb1ff..011d76bcf68b6c219d0769c69805d264132132a8 100644 (file)
--- a/tools/mtmd/clip-impl.h
+++ b/tools/mtmd/clip-impl.h
@@ -51,7 +51,6 @@
  
  #define KEY_MM_PATCH_MERGE_TYPE    "clip.vision.mm_patch_merge_type"
  #define KEY_IMAGE_GRID_PINPOINTS   "clip.vision.image_grid_pinpoints"
-#define KEY_IMAGE_CROP_RESOLUTION  "clip.vision.image_crop_resolution"
  #define KEY_WIN_ATTN_PATTERN       "clip.vision.n_wa_pattern"
  #define KEY_WIN_ATTN_LAYER_INDEXES "clip.vision.wa_layer_indexes"
  #define KEY_ATTN_WINDOW_SIZE       "clip.vision.window_size"
diff --git a/tools/mtmd/clip-model.h b/tools/mtmd/clip-model.h

index e9c454fe69ab8cdacddd35219aaf15a1517714db..a73e9ba38b2820ea237a17e2b6dae3b6b472421a 100644 (file)
--- a/tools/mtmd/clip-model.h
+++ b/tools/mtmd/clip-model.h
@@ -28,6 +28,13 @@ enum patch_merge_type {
      PATCH_MERGE_SPATIAL_UNPAD,
  };
  
+enum resize_algo {
+    RESIZE_ALGO_BILINEAR, // stretch to target resolution
+    RESIZE_ALGO_BICUBIC, // center-crop when aspect ratio doesn't match
+    RESIZE_ALGO_BICUBIC_PILLOW,
+    // RESIZE_ALGO_LANCZOS, // TODO
+};
+
  struct clip_hparams {
      int32_t image_size = 0;
      int32_t patch_size = 0;
@@ -37,13 +44,26 @@ struct clip_hparams {
      int32_t n_head = 0;
      int32_t n_layer = 0;
      // idefics3
+    int32_t n_merge = 0; // number of patch merges **per-side**
+
+    // for preprocessor
      int32_t image_longest_edge = 0;
      int32_t image_min_pixels = -1;
      int32_t image_max_pixels = -1;
-    int32_t n_merge = 0; // number of patch merges **per-side**
+    resize_algo image_resize_algo = RESIZE_ALGO_BICUBIC;
+    bool image_resize_pad = true; // if false, center-crop will be applied when resizing
+    std::array<uint8_t, 3> image_pad_color = {0, 0, 0};
  
+    // (preprocessor) for llava-uhd style models
+    std::vector<clip_image_size> image_res_candidates;
      int32_t preproc_min_tiles = 0;
      int32_t preproc_max_tiles = 0;
+    resize_algo image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
+    resize_algo image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
+    bool image_pad_rf = true;  // if true, refined image will be padded (e.g. llava-1.6)
+    bool image_pad_ov = false; // if true, overview image will be padded (e.g. llava-1.6)
+    std::array<uint8_t, 3> image_pad_color_rf = {0, 0, 0}; // padding color for refined image
+    std::array<uint8_t, 3> image_pad_color_ov = {0, 0, 0}; // padding color for overview image
  
      float image_mean[3];
      float image_std[3];
@@ -60,8 +80,6 @@ struct clip_hparams {
      float eps = 1e-6;
      float rope_theta = 0.0;
  
-    std::vector<clip_image_size> image_res_candidates; // for llava-uhd style models
-    int32_t image_crop_resolution;
      std::unordered_set<int32_t> vision_feature_layer;
      int32_t attn_window_size = 0;
      int32_t n_wa_pattern = 0;
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp

index b7237d66168eb4b934f634606d11f87e4c8ff823..fd1cb0dfea4ec04249798327ccaf9af7072586d4 100644 (file)
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1029,7 +1029,6 @@ struct clip_model_loader {
              if (is_vision) {
                  get_u32(KEY_IMAGE_SIZE, hparams.image_size);
                  get_u32(KEY_PATCH_SIZE, hparams.patch_size);
-                get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
                  get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
                  get_u32(KEY_MINICPMV_QUERY_NUM, hparams.minicpmv_query_num, false);
                  if (hparams.minicpmv_query_num == 0) {
@@ -1075,11 +1074,6 @@ struct clip_model_loader {
              // default warmup value
              hparams.warmup_image_size = hparams.image_size;
  
-            hparams.has_llava_projector = model.proj_type == PROJECTOR_TYPE_MLP
-                                       || model.proj_type == PROJECTOR_TYPE_MLP_NORM
-                                       || model.proj_type == PROJECTOR_TYPE_LDP
-                                       || model.proj_type == PROJECTOR_TYPE_LDPV2;
-
              {
                  bool use_gelu = false;
                  bool use_silu = false;
@@ -1135,14 +1129,41 @@ struct clip_model_loader {
  
              // model-specific params
              switch (model.proj_type) {
+                case PROJECTOR_TYPE_MLP:
+                case PROJECTOR_TYPE_MLP_NORM:
+                case PROJECTOR_TYPE_LDP:
+                case PROJECTOR_TYPE_LDPV2:
+                case PROJECTOR_TYPE_COGVLM:
+                    {
+                        hparams.has_llava_projector = model.proj_type != PROJECTOR_TYPE_COGVLM;
+                        hparams.image_pad_color     = {122, 116, 104};
+                        if (!hparams.image_res_candidates.empty()) {
+                            hparams.image_resize_pad  = true;
+                            hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
+                        } else {
+                            // llava-1.6 default params; NOTE(review): this branch runs when image_res_candidates is EMPTY, yet it sets the llava-uhd (*_rf / *_ov) fields - confirm the condition is not inverted
+                            hparams.image_pad_ov         = false;
+                            hparams.image_pad_rf         = true;
+                            hparams.image_pad_color_rf   = {122, 116, 104};
+                            hparams.image_resize_algo_rf = RESIZE_ALGO_BICUBIC;
+                            hparams.image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
+                        }
+                    } break;
+                case PROJECTOR_TYPE_GLM_EDGE:
+                    {
+                        hparams.image_resize_pad  = true;
+                        hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
+                    } break;
                  case PROJECTOR_TYPE_MINICPMV:
                      {
+                        // use default llava-uhd preprocessing params
                          if (hparams.minicpmv_version == 0) {
                              hparams.minicpmv_version = 2; // default to 2 if not set
                          }
                      } break;
                  case PROJECTOR_TYPE_INTERNVL:
                      {
+                        // use default llava-uhd preprocessing params
                          // older version of internvl doesn't have min/max tiles, we need to provide default values for them to avoid issues
                          hparams.preproc_min_tiles = 1;
                          hparams.preproc_max_tiles = 12;
@@ -1158,11 +1179,15 @@ struct clip_model_loader {
                      } break;
                  case PROJECTOR_TYPE_IDEFICS3:
                      {
+                        // use default llava-uhd preprocessing params
                          get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                          get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.image_longest_edge, false);
                      } break;
                  case PROJECTOR_TYPE_LFM2:
                      {
+                        hparams.image_resize_algo    = RESIZE_ALGO_BILINEAR;
+                        hparams.image_resize_algo_rf = RESIZE_ALGO_BILINEAR;
+                        hparams.image_resize_algo_ov = RESIZE_ALGO_BILINEAR;
                          get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                          // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
                          hparams.set_limit_image_tokens(64, 256);
@@ -1170,6 +1195,7 @@ struct clip_model_loader {
                  case PROJECTOR_TYPE_PHI4:
                      {
                          hparams.n_merge = 1;
+                        hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
                          get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
                          get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
                          hparams.set_warmup_n_tokens(16*16);
@@ -1179,6 +1205,7 @@ struct clip_model_loader {
                          // ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
                          // TODO: verify the image_min_tokens
                          hparams.n_merge = 1; // the original pixtral does not use patch merging
+                        hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
                          hparams.rope_theta = 10000.0f;
                          get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                          hparams.set_limit_image_tokens(8, 1024);
@@ -1187,6 +1214,7 @@ struct clip_model_loader {
                  case PROJECTOR_TYPE_LIGHTONOCR:
                      {
                          hparams.n_merge = 1;
+                        hparams.image_resize_algo = RESIZE_ALGO_BICUBIC;
                          hparams.rope_theta = 10000.0f;
                          get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                          hparams.image_longest_edge = hparams.image_size;
@@ -1195,6 +1223,7 @@ struct clip_model_loader {
                      } break;
                  case PROJECTOR_TYPE_KIMIVL:
                      {
+                        hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
                          hparams.rope_theta = 10000.0f;
                          get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                          // TODO: check kimivl preprocessor for exact values
@@ -1203,6 +1232,7 @@ struct clip_model_loader {
                      } break;
                  case PROJECTOR_TYPE_KIMIK25:
                      {
+                        hparams.image_resize_algo = RESIZE_ALGO_BICUBIC;
                          hparams.rope_theta = 10000.0f;
                          get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
  
@@ -1222,6 +1252,7 @@ struct clip_model_loader {
                          // default value (used by all model sizes in gemma 3 family)
                          // number of patches for each **side** is reduced by a factor of 4
                          hparams.n_merge = 4;
+                        hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
                          // test model (tinygemma3) has a different value, we optionally read it
                          get_u32(KEY_PROJ_SCALE_FACTOR, hparams.n_merge, false);
                      } break;
@@ -1238,6 +1269,7 @@ struct clip_model_loader {
                  case PROJECTOR_TYPE_QWEN3VL:
                      {
                          hparams.n_merge = 2; // default value for Qwen 2 and 2.5
+                        hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
                          get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                          get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
                          // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
@@ -1253,6 +1285,8 @@ struct clip_model_loader {
                  case PROJECTOR_TYPE_YOUTUVL:
                      {
                          hparams.n_merge = 2;
+                        hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
+                        hparams.image_resize_pad  = false;
                          get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                          get_u32(KEY_ATTN_WINDOW_SIZE, hparams.attn_window_size, true);
                          std::vector<int> wa_layer_indexes_vec;
@@ -1268,6 +1302,7 @@ struct clip_model_loader {
                      {
                          hparams.rope_theta = 10000.0f;
                          hparams.n_merge = 2; // default value for GLM4-V
+                        hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
                          get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                          hparams.set_limit_image_tokens(8, 4096);
                          hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
@@ -1301,6 +1336,7 @@ struct clip_model_loader {
                  case PROJECTOR_TYPE_PADDLEOCR:
                      {
                          hparams.n_merge = 2;
+                        hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
                          get_u32(KEY_IMAGE_MIN_PIXELS, hparams.image_min_pixels);
                          get_u32(KEY_IMAGE_MAX_PIXELS, hparams.image_max_pixels);
  
@@ -1311,6 +1347,10 @@ struct clip_model_loader {
                          hparams.patch_size = 16;
                          hparams.image_size = 1024;
                          hparams.warmup_image_size = 1024;
+                        hparams.image_resize_algo = RESIZE_ALGO_BICUBIC_PILLOW;
+                        hparams.image_pad_color[0] = static_cast<uint8_t>(hparams.image_mean[0] * 255.0f); // image_mean is on the 0..1 scale (it is subtracted from pixel/255), so scale to 0..255 before narrowing
+                        hparams.image_pad_color[1] = static_cast<uint8_t>(hparams.image_mean[1] * 255.0f); // a bare float -> uint8_t assignment would truncate every channel to 0
+                        hparams.image_pad_color[2] = static_cast<uint8_t>(hparams.image_mean[2] * 255.0f);
  
                          get_u32(KEY_SAM_N_BLOCK, hparams.sam_n_layer, true);
                          get_u32(KEY_SAM_N_HEAD, hparams.sam_n_head, true);
@@ -1326,8 +1366,13 @@ struct clip_model_loader {
                          hparams.audio_window_len       = 400;
                          hparams.audio_hop_len          = 160;
                      } break;
+                case PROJECTOR_TYPE_JANUS_PRO:
+                    {
+                        hparams.image_pad_color   = {127, 127, 127};
+                        hparams.image_resize_algo = RESIZE_ALGO_BILINEAR;
+                    } break;
                  default:
-                    break;
+                    throw std::runtime_error(string_format("%s: unknown vision projector type %s\n", __func__, proj_type.c_str()));
              }
  
              // sanity check
@@ -2385,1397 +2430,6 @@ void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny
      memcpy(img->buf.data(), rgb_pixels, img->buf.size());
  }
  
-// Normalize image to float32 - careful with pytorch .to(model.device, dtype=torch.float16) - this sometimes reduces precision (32>16>32), sometimes not
-static void normalize_image_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
-    dst.nx = src.nx;
-    dst.ny = src.ny;
-    dst.buf.resize(src.buf.size());
-
-    // TODO @ngxson : seems like this could be done more efficiently on cgraph
-    for (size_t i = 0; i < src.buf.size(); ++i) {
-        int c = i % 3; // rgb
-        dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c];
-    }
-}
-
-// set of tools to manipulate images
-// in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv
-struct img_tool {
-    enum resize_algo {
-        RESIZE_ALGO_BILINEAR,
-        RESIZE_ALGO_BICUBIC,
-        RESIZE_ALGO_BICUBIC_PILLOW,
-        // RESIZE_ALGO_LANCZOS, // TODO
-    };
-
-    static void resize(
-            const clip_image_u8 & src,
-            clip_image_u8 & dst,
-            const clip_image_size & target_resolution,
-            resize_algo algo,
-            bool add_padding = true, // TODO: define the behavior for add_padding = false
-            std::array<uint8_t, 3> pad_color = {0, 0, 0}) {
-        dst.nx = target_resolution.width;
-        dst.ny = target_resolution.height;
-        dst.buf.resize(3 * dst.nx * dst.ny);
-
-        if (dst.nx == src.nx && dst.ny == src.ny) {
-            // no resize needed, simple copy
-            dst.buf = src.buf;
-            return;
-        }
-
-        if (!add_padding) {
-            // direct resize
-            switch (algo) {
-                case RESIZE_ALGO_BILINEAR:
-                    resize_bilinear(src, dst, target_resolution.width, target_resolution.height);
-                    break;
-                case RESIZE_ALGO_BICUBIC:
-                    resize_bicubic(src, dst, target_resolution.width, target_resolution.height);
-                    break;
-                case RESIZE_ALGO_BICUBIC_PILLOW:
-                    resize_bicubic_pillow(src, dst, target_resolution.width, target_resolution.height);
-                    break;
-                default:
-                    throw std::runtime_error("Unsupported resize algorithm");
-            }
-        } else {
-            // resize with padding
-            clip_image_u8 resized_image;
-            float scale_w = static_cast<float>(target_resolution.width) / src.nx;
-            float scale_h = static_cast<float>(target_resolution.height) / src.ny;
-            float scale = std::min(scale_w, scale_h);
-            int new_width  = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);
-            int new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);
-
-            switch (algo) {
-                case RESIZE_ALGO_BILINEAR:
-                    resize_bilinear(src, resized_image, new_width, new_height);
-                    break;
-                case RESIZE_ALGO_BICUBIC:
-                    resize_bicubic(src, resized_image, new_width, new_height);
-                    break;
-                case RESIZE_ALGO_BICUBIC_PILLOW:
-                    resize_bicubic_pillow(src, resized_image, new_width, new_height);
-                    break;
-                default:
-                    throw std::runtime_error("Unsupported resize algorithm");
-            }
-
-            // fill dst with pad_color
-            fill(dst, pad_color);
-
-            int offset_x = (target_resolution.width  - new_width)  / 2;
-            int offset_y = (target_resolution.height - new_height) / 2;
-
-            composite(dst, resized_image, offset_x, offset_y);
-        }
-    }
-
-    static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
-        dst.nx = w;
-        dst.ny = h;
-        dst.buf.resize(3 * w * h);
-
-        for (int i = 0; i < h; ++i) {
-            for (int j = 0; j < w; ++j) {
-                int src_idx = 3 * ((y + i)*image.nx + (x + j));
-                int dst_idx = 3 * (i*w + j);
-                dst.buf[dst_idx]     = image.buf[src_idx];
-                dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
-                dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
-            }
-        }
-    }
-
-    // calculate the size of the **resized** image, while preserving the aspect ratio
-    // the calculated size will be aligned to the nearest multiple of align_size
-    // if H or W size is larger than longest_edge, it will be resized to longest_edge
-    static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int longest_edge) {
-        GGML_ASSERT(align_size > 0);
-        if (inp_size.width <= 0 || inp_size.height <= 0 || longest_edge <= 0) {
-            return {0, 0};
-        }
-
-        float scale = std::min(static_cast<float>(longest_edge) / inp_size.width,
-                               static_cast<float>(longest_edge) / inp_size.height);
-
-        float target_width_f  = static_cast<float>(inp_size.width)  * scale;
-        float target_height_f = static_cast<float>(inp_size.height) * scale;
-
-        auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
-        int aligned_width  = ceil_by_factor(target_width_f);
-        int aligned_height = ceil_by_factor(target_height_f);
-
-        return {aligned_width, aligned_height};
-    }
-
-    // calculate the size of the **resized** image, while preserving the aspect ratio
-    // the calculated size will have min_pixels <= W*H <= max_pixels
-    // this is referred as "smart_resize" in transformers code
-    static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) {
-        GGML_ASSERT(align_size > 0);
-        const int width  = inp_size.width;
-        const int height = inp_size.height;
-
-        auto round_by_factor = [f = align_size](float x) { return static_cast<int>(std::round(x / static_cast<float>(f))) * f; };
-        auto ceil_by_factor  = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
-        auto floor_by_factor = [f = align_size](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; };
-
-        // always align up first
-        int h_bar = std::max(align_size, round_by_factor(height));
-        int w_bar = std::max(align_size, round_by_factor(width));
-
-        if (h_bar * w_bar > max_pixels) {
-            const auto beta = std::sqrt(static_cast<float>(height * width) / max_pixels);
-            h_bar = std::max(align_size, floor_by_factor(height / beta));
-            w_bar = std::max(align_size, floor_by_factor(width  / beta));
-        } else if (h_bar * w_bar < min_pixels) {
-            const auto beta = std::sqrt(static_cast<float>(min_pixels) / (height * width));
-            h_bar = ceil_by_factor(height * beta);
-            w_bar = ceil_by_factor(width * beta);
-        }
-
-        return {w_bar, h_bar};
-    }
-
-    // draw src image into dst image at offset (offset_x, offset_y)
-    static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
-        for (int y = 0; y < src.ny; ++y) {
-            for (int x = 0; x < src.nx; ++x) {
-                int dx = x + offset_x;
-                int dy = y + offset_y;
-                // skip pixels that would be out of bounds in the destination
-                if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) {
-                    continue;
-                }
-                size_t dst_idx = 3 * (static_cast<size_t>(dy) * dst.nx + static_cast<size_t>(dx));
-                size_t src_idx = 3 * (static_cast<size_t>(y) * src.nx + static_cast<size_t>(x));
-                dst.buf[dst_idx + 0] = src.buf[src_idx + 0];
-                dst.buf[dst_idx + 1] = src.buf[src_idx + 1];
-                dst.buf[dst_idx + 2] = src.buf[src_idx + 2];
-            }
-        }
-    }
-
-    // fill the image with a solid color
-    static void fill(clip_image_u8 & img, const std::array<uint8_t, 3> & color) {
-        for (size_t i = 0; i < img.buf.size(); i += 3) {
-            img.buf[i]     = color[0];
-            img.buf[i + 1] = color[1];
-            img.buf[i + 2] = color[2];
-        }
-    }
-
-private:
-    // Bilinear resize function
-    static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
-        dst.nx = target_width;
-        dst.ny = target_height;
-        dst.buf.resize(3 * target_width * target_height);
-
-        float x_ratio = static_cast<float>(src.nx - 1) / target_width;
-        float y_ratio = static_cast<float>(src.ny - 1) / target_height;
-
-        for (int y = 0; y < target_height; y++) {
-            for (int x = 0; x < target_width; x++) {
-                float px = x_ratio * x;
-                float py = y_ratio * y;
-                int x_floor = static_cast<int>(px);
-                int y_floor = static_cast<int>(py);
-                float x_lerp = px - x_floor;
-                float y_lerp = py - y_floor;
-
-                for (int c = 0; c < 3; c++) {
-                    float top = lerp(
-                        static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
-                        static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
-                        x_lerp
-                    );
-                    float bottom = lerp(
-                        static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
-                        static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
-                        x_lerp
-                    );
-                    dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
-                }
-            }
-        }
-    }
-
-    // Bicubic resize function
-    // part of image will be cropped if the aspect ratio is different
-    static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
-        const int nx = img.nx;
-        const int ny = img.ny;
-
-        dst.nx = target_width;
-        dst.ny = target_height;
-        dst.buf.resize(3 * target_width * target_height);
-
-        float Cc;
-        float C[5] = {};
-        float d0, d2, d3, a0, a1, a2, a3;
-        int i, j, k, jj;
-        int x, y;
-        float dx, dy;
-        float tx, ty;
-
-        tx = (float)nx / (float)target_width;
-        ty = (float)ny / (float)target_height;
-
-        // Bicubic interpolation; adapted from ViT.cpp, inspired from :
-        //    -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
-        //    -> https://en.wikipedia.org/wiki/Bicubic_interpolation
-
-        for (i = 0; i < target_height; i++) {
-            for (j = 0; j < target_width; j++) {
-                x = (int)(tx * j);
-                y = (int)(ty * i);
-
-                dx = tx * j - x;
-                dy = ty * i - y;
-
-                for (k = 0; k < 3; k++) {
-                    for (jj = 0; jj <= 3; jj++) {
-                        d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
-                        d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
-                        d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
-                        a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
-
-                        a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
-                        a2 =  1.0 / 2 * d0 +      1.0 / 2 * d2;
-                        a3 = -1.0 / 6 * d0 -      1.0 / 2 * d2 + 1.0 / 6 * d3;
-
-                        C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
-
-                        d0 = C[0] - C[1];
-                        d2 = C[2] - C[1];
-                        d3 = C[3] - C[1];
-                        a0 = C[1];
-                        a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
-                        a2 =  1.0 / 2 * d0 +      1.0 / 2 * d2;
-                        a3 = -1.0 / 6 * d0 -      1.0 / 2 * d2 + 1.0 / 6 * d3;
-                        Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
-
-                        const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
-                        dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
-                    }
-                }
-            }
-        }
-
-        return true;
-    }
-
-    // Bicubic resize function using Pillow's ImagingResample algorithm
-    // Adapted from https://github.com/python-pillow/Pillow/blob/main/src/libImaging/Resample.c
-    //
-    // Key Difference with resize_bicubic:
-    // 1. Uses separable filtering: horizontal pass followed by vertical pass
-    // 2. Pre-computes normalized filter coefficients for each output pixel
-    // 3. Applies convolution using fixed-point integer arithmetic for performance
-    static bool resize_bicubic_pillow(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
-        // Fixed-point precision: 22 bits = 32 (int32_t) - 8 (uint8_t pixels) - 2 (headroom for accumulation)
-        // This allows encoding fractional weights as integers: weight * 2^22
-        const int PRECISION_BITS = 32 - 8 - 2;
-
-        // Bicubic filter function with a = -0.5 (Note that GGML/PyTorch takes a = -0.75)
-        // Returns filter weight for distance x from pixel center
-        // Support: [-2, 2], meaning the filter influences pixels within 2 units of distance
-        auto bicubic_filter = [](double x) -> double {
-            constexpr double a = -0.5;
-            if (x < 0.0) {
-                x = -x;
-            }
-            if (x < 1.0) {
-                return ((a + 2.0) * x - (a + 3.0)) * x * x + 1;
-            }
-            if (x < 2.0) {
-                return (((x - 5) * x + 8) * x - 4) * a;
-            }
-            return 0.0;  // Zero outside [-2, 2]
-        };
-
-        // Filter support radius: bicubic extends 2 pixels in each direction
-        constexpr double filter_support = 2.0;
-
-        // Clipping function for 8-bit values
-        auto clip8 = [](int val) -> uint8_t {
-            if (val < 0) return 0;
-            if (val > 255) return 255;
-            return static_cast<uint8_t>(val);
-        };
-
-        // Precompute filter coefficients for ONE dimension (horizontal or vertical)
-        //
-        // Parameters:
-        //   inSize  - Number of pixels in input dimension (e.g., src_width or src_height)
-        //   outSize - Number of pixels in output dimension (e.g., target_width or target_height)
-        //   bounds  - [OUTPUT] Array of size outSize*2 storing input pixel ranges:
-        //             bounds[xx*2+0] = first input pixel index for output pixel xx (xmin)
-        //             bounds[xx*2+1] = number of input pixels for output pixel xx (xcnt)
-        //   weights - [OUTPUT] Array of size outSize*ksize storing fixed-point filter weights:
-        //             kk[xx*ksize + x] = weight for input pixel x contributing to output pixel xx
-        //
-        // Returns: kernel size (ksize) - number of input pixels that contribute to each output pixel
-        auto precompute_weights = [&](int inSize, int outSize,
-                                     std::vector<int> & bounds, std::vector<int32_t> & weights) -> int {
-            double support, scale, filterscale;
-            double center, ww, ss;
-            int xx, x, ksize, xmin, xmax, xcnt;
-
-            // Calculate scaling factor: ratio of input range to output size
-            filterscale = scale = (double)inSize / outSize;
-            // For upsampling (scale < 1), keep filterscale = 1 to maintain filter sharpness
-            // For downsampling (scale > 1), widen filter to prevent aliasing
-            if (filterscale < 1.0) {
-                filterscale = 1.0;
-            }
-
-            // Determine filter support radius and kernel size
-            support = filter_support * filterscale;  // Widen filter when downsampling
-            ksize = static_cast<int>(std::ceil(support)) * 2 + 1;  // Total pixels in kernel
-
-            std::vector<double> pre_weights(outSize * ksize);  // Temporary weights
-            bounds.resize(outSize * 2);
-
-            // For each output pixel, compute its filter coefficients
-            for (xx = 0; xx < outSize; xx++) {
-                // Calculate the center position in input space (pixel-center convention: +0.5)
-                center = (xx + 0.5) * scale;
-                ww = 0.0;  // Sum of weights for normalization
-                ss = 1.0 / filterscale;  // Scale factor for filter function
-
-                // Determine the range of input pixels that contribute to this output pixel
-                xmin = static_cast<int>(center - support + 0.5);
-                if (xmin < 0) {
-                    xmin = 0;
-                }
-
-                xmax = static_cast<int>(center + support + 0.5);
-                if (xmax > inSize) {
-                    xmax = inSize;
-                }
-
-                xcnt = xmax - xmin;
-
-                // Compute filter weights for each contributing input pixel
-                for (x = 0; x < xcnt; x++) {
-                    // Distance from input pixel center to output pixel center in input space
-                    double w = bicubic_filter((x + xmin - center + 0.5) * ss);
-                    pre_weights[xx * ksize + x] = w;
-                    ww += w;  // Accumulate for normalization
-                }
-
-                // Normalize weights to sum to 1.0 (preserves brightness)
-                for (x = 0; x < xcnt; x++) {
-                    if (ww != 0.0) {
-                        pre_weights[xx * ksize + x] /= ww;
-                    }
-                }
-
-                // Zero-pad remaining kernel positions
-                for (; x < ksize; x++) {
-                    pre_weights[xx * ksize + x] = 0;
-                }
-
-                // Store input pixel range for this output pixel
-                bounds[xx * 2 + 0] = xmin;
-                bounds[xx * 2 + 1] = xcnt;
-            }
-
-            // Convert floating-point coefficients to fixed-point integers
-            // Formula: int32 = round(float * 2^PRECISION_BITS)
-            weights.resize(outSize * ksize);
-            for (int i = 0; i < outSize * ksize; i++) {
-                if (pre_weights[i] < 0) {
-                    weights[i] = static_cast<int32_t>(-0.5 + pre_weights[i] * (1 << PRECISION_BITS));
-                } else {
-                    weights[i] = static_cast<int32_t>(0.5 + pre_weights[i] * (1 << PRECISION_BITS));
-                }
-            }
-
-            return ksize;
-        };
-
-        // Horizontal resampling pass
-        // Resizes width from imIn.nx to imOut.nx, preserving height
-        auto resample_horizontal = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut,
-                                       int ksize, const std::vector<int> & bounds, const std::vector<int32_t> & weights) {
-            imOut.ny = imIn.ny;
-            imOut.buf.resize(3 * imOut.nx * imOut.ny);
-
-            // Process each row independently
-            for (int yy = 0; yy < imOut.ny; yy++) {
-                // For each output pixel in this row
-                for (int xx = 0; xx < imOut.nx; xx++) {
-                    // Get the range of input pixels and filter coefficients
-                    int xmin = bounds[xx * 2 + 0];  // First input pixel index
-                    int xcnt = bounds[xx * 2 + 1];  // Number of input pixels
-
-                    // Initialize accumulators for RGB channels with rounding bias (0.5 in fixed-point)
-                    int32_t ss0 = 1 << (PRECISION_BITS - 1);
-                    int32_t ss1 = 1 << (PRECISION_BITS - 1);
-                    int32_t ss2 = 1 << (PRECISION_BITS - 1);
-
-                    // Convolve: sum weighted input pixels
-                    for (int x = 0; x < xcnt; x++) {
-                        int src_idx = ((yy * imIn.nx) + (x + xmin)) * 3;
-                        ss0 += static_cast<uint8_t>(imIn.buf[src_idx + 0]) * weights[xx * ksize + x];  // R channel
-                        ss1 += static_cast<uint8_t>(imIn.buf[src_idx + 1]) * weights[xx * ksize + x];  // G channel
-                        ss2 += static_cast<uint8_t>(imIn.buf[src_idx + 2]) * weights[xx * ksize + x];  // B channel
-                    }
-
-                    // Convert back from fixed-point (divide by 2^PRECISION_BITS) and clamp to [0,255]
-                    int dst_idx = (yy * imOut.nx + xx) * 3;
-                    imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS);
-                    imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS);
-                    imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS);
-                }
-            }
-        };
-
-        // Vertical resampling pass
-        // Resizes height from imIn.ny to imOut.ny, preserving width
-        auto resample_vertical = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut,
-                                     int ksize, const std::vector<int> & bounds, const std::vector<int32_t> & weight) {
-            imOut.nx = imIn.nx;
-            imOut.buf.resize(3 * imOut.nx * imOut.ny);
-
-            // For each output row
-            for (int yy = 0; yy < imOut.ny; yy++) {
-                // Get the range of input rows and filter coefficients
-                int ymin = bounds[yy * 2 + 0];  // First input row index
-                int ycnt = bounds[yy * 2 + 1];  // Number of input rows
-
-                // Process each column in this output row
-                for (int xx = 0; xx < imOut.nx; xx++) {
-                    // Initialize accumulators for RGB channels with rounding bias
-                    int32_t ss0 = 1 << (PRECISION_BITS - 1);
-                    int32_t ss1 = 1 << (PRECISION_BITS - 1);
-                    int32_t ss2 = 1 << (PRECISION_BITS - 1);
-
-                    // Convolve: sum weighted input pixels vertically
-                    for (int y = 0; y < ycnt; y++) {
-                        int src_idx = ((y + ymin) * imIn.nx + xx) * 3;
-                        ss0 += static_cast<uint8_t>(imIn.buf[src_idx + 0]) * weight[yy * ksize + y];  // R channel
-                        ss1 += static_cast<uint8_t>(imIn.buf[src_idx + 1]) * weight[yy * ksize + y];  // G channel
-                        ss2 += static_cast<uint8_t>(imIn.buf[src_idx + 2]) * weight[yy * ksize + y];  // B channel
-                    }
-
-                    // Convert back from fixed-point and clamp to [0,255]
-                    int dst_idx = (yy * imOut.nx + xx) * 3;
-                    imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS);
-                    imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS);
-                    imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS);
-                }
-            }
-        };
-
-        // Main resampling logic using separable two-pass approach
-        const int src_width = img.nx;
-        const int src_height = img.ny;
-
-        dst.nx = target_width;
-        dst.ny = target_height;
-
-        bool need_horizontal = (target_width != src_width);
-        bool need_vertical = (target_height != src_height);
-
-        // Precompute filter coefficients for both dimensions
-        std::vector<int> bounds_horiz, bounds_vert;
-        std::vector<int32_t> weights_horiz, weights_vert;
-        int ksize_horiz = 0, ksize_vert = 0;
-
-        if (need_horizontal) {
-            ksize_horiz = precompute_weights(src_width, target_width, bounds_horiz, weights_horiz);
-        }
-
-        if (need_vertical) {
-            ksize_vert = precompute_weights(src_height, target_height, bounds_vert, weights_vert);
-        }
-
-        // Perform two-pass resampling
-        if (need_horizontal && need_vertical) {
-            // Both horizontal and vertical
-            clip_image_u8 temp;
-            temp.nx = target_width;
-            resample_horizontal(img, temp, ksize_horiz, bounds_horiz, weights_horiz);
-            resample_vertical(temp, dst, ksize_vert, bounds_vert, weights_vert);
-        } else if (need_horizontal) {
-            // Only horizontal
-            resample_horizontal(img, dst, ksize_horiz, bounds_horiz, weights_horiz);
-        } else if (need_vertical) {
-            // Only vertical
-            resample_vertical(img, dst, ksize_vert, bounds_vert, weights_vert);
-        } else {
-            // No resizing needed - direct copy
-            dst.buf = img.buf;
-        }
-
-        return true;
-    }
-
-    static inline int clip(int x, int lower, int upper) {
-        return std::max(lower, std::min(x, upper));
-    }
-
-    // Linear interpolation between two points
-    static inline float lerp(float s, float e, float t) {
-        return s + (e - s) * t;
-    }
-};
-
-/**
- * implementation of LLaVA-UHD:
- *  - https://arxiv.org/pdf/2403.11703
- *  - https://github.com/thunlp/LLaVA-UHD
- *  - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
- *
- * overview:
- *   - an image always have a single overview (downscaled image)
- *   - an image can have 0 or multiple slices, depending on the image size
- *   - each slice can then be considered as a separate image
- *
- * for example:
- *
- * [overview] --> [slice 1] --> [slice 2]
- *           |                |
- *           +--> [slice 3] --> [slice 4]
- */
-struct llava_uhd {
-    struct slice_coordinates {
-        int x;
-        int y;
-        clip_image_size size;
-    };
-
-    struct slice_instructions {
-        clip_image_size overview_size; // size of downscaled image
-        clip_image_size refined_size;  // size of image right before slicing (must be multiple of slice size)
-        clip_image_size grid_size;     // grid_size.width * grid_size.height = number of slices
-        std::vector<slice_coordinates> slices;
-
-        img_tool::resize_algo interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR;
-        bool padding_overview = false;  // if true, refine image will be padded to the grid size (e.g. llava-1.6)
-        std::array<uint8_t, 3> pad_color_overview = {0, 0, 0};
-
-        img_tool::resize_algo interpolation_refined = img_tool::RESIZE_ALGO_BICUBIC;
-        bool padding_refined = false;  // if true, refine image will be padded to the grid size (e.g. llava-1.6)
-        std::array<uint8_t, 3> pad_color_refined = {0, 0, 0};
-    };
-
-    static slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
-        slice_instructions res;
-        const int patch_size      = clip_get_patch_size(ctx);
-        const int slice_size      = clip_get_image_size(ctx);
-        const int original_width  = original_size.width;
-        const int original_height = original_size.height;
-
-        const bool has_slices    = original_size.width > slice_size || original_size.height > slice_size;
-        const bool has_pinpoints = !ctx->model.hparams.image_res_candidates.empty();
-
-        if (!has_slices) {
-            // skip slicing logic
-            res.overview_size = clip_image_size{slice_size, slice_size};
-            res.refined_size  = clip_image_size{0, 0};
-            res.grid_size     = clip_image_size{0, 0};
-
-            return res;
-        }
-
-        if (has_pinpoints) {
-            // has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
-            auto refine_size = llava_uhd::select_best_resolution(
-                original_size,
-                ctx->model.hparams.image_res_candidates);
-            res.overview_size         = clip_image_size{slice_size, slice_size};
-            res.refined_size          = refine_size;
-            res.grid_size             = clip_image_size{0, 0};
-            res.padding_refined       = true;
-            res.interpolation_refined = img_tool::RESIZE_ALGO_BILINEAR;  // preserve old behavior when padding
-
-            LOG_DBG("%s: using pinpoints for slicing\n", __func__);
-            LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n",
-                    __func__, original_width, original_height,
-                    res.overview_size.width, res.overview_size.height,
-                    res.refined_size.width,  res.refined_size.height);
-
-            for (int y = 0; y < refine_size.height; y += slice_size) {
-                for (int x = 0; x < refine_size.width; x += slice_size) {
-                    slice_coordinates slice;
-                    slice.x = x;
-                    slice.y = y;
-                    slice.size.width  = std::min(slice_size, refine_size.width  - x);
-                    slice.size.height = std::min(slice_size, refine_size.height - y);
-                    res.slices.push_back(slice);
-                    LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
-                            __func__, (int)res.slices.size() - 1,
-                            slice.x, slice.y, slice.size.width, slice.size.height);
-                }
-            }
-
-            res.grid_size.height = refine_size.height / slice_size;
-            res.grid_size.width  = refine_size.width  / slice_size;
-            LOG_DBG("%s: grid size: %d x %d\n", __func__, res.grid_size.width, res.grid_size.height);
-
-            return res;
-        }
-
-        // no pinpoints, dynamically calculate the grid size (e.g. minicpmv)
-
-        auto best_size    = get_best_resize(original_size, slice_size, patch_size, !has_slices);
-        res.overview_size = best_size;
-
-        {
-            const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it
-            const float log_ratio = log((float)original_width / original_height);
-            const float ratio = (float)original_width * original_height / (slice_size * slice_size);
-            const int multiple = fmin(ceil(ratio), max_slice_nums);
-
-            auto best_grid   = get_best_grid(max_slice_nums, multiple, log_ratio);
-            auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
-            res.grid_size    = best_grid;
-            res.refined_size = refine_size;
-
-            LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
-                    __func__, original_width, original_height,
-                    res.overview_size.width, res.overview_size.height,
-                    res.refined_size.width, res.refined_size.height,
-                    res.grid_size.width, res.grid_size.height);
-
-            int width  = refine_size.width;
-            int height = refine_size.height;
-            int grid_x = int(width  / best_grid.width);
-            int grid_y = int(height / best_grid.height);
-            for (int patches_y = 0,                    ic = 0;
-                    patches_y < refine_size.height && ic < best_grid.height;
-                    patches_y += grid_y,              ic += 1) {
-                for (int patches_x = 0,                   jc = 0;
-                        patches_x < refine_size.width && jc < best_grid.width;
-                        patches_x += grid_x,             jc += 1) {
-                    slice_coordinates slice;
-                    slice.x = patches_x;
-                    slice.y = patches_y;
-                    slice.size.width  = grid_x;
-                    slice.size.height = grid_y;
-                    res.slices.push_back(slice);
-                    LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
-                            __func__, (int)res.slices.size() - 1,
-                            slice.x, slice.y, slice.size.width, slice.size.height);
-                }
-            }
-        }
-
-        return res;
-    }
-
-    static std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 * img, const slice_instructions & inst, bool overview_first = true) {
-        std::vector<clip_image_u8_ptr> output;
-
-        // resize to overview size
-        clip_image_u8_ptr resized_img(clip_image_u8_init());
-        img_tool::resize(*img, *resized_img, inst.overview_size, inst.interpolation_overview,
-                         inst.padding_overview, inst.pad_color_overview);
-        if (overview_first) {
-            output.push_back(std::move(resized_img));
-        }
-
-        if (inst.slices.empty()) {
-            // no slices, just return the resized image
-            if (!overview_first) {
-                output.push_back(std::move(resized_img));
-            }
-            return output;
-        }
-
-        // resize to refined size
-        clip_image_u8_ptr refined_img(clip_image_u8_init());
-        img_tool::resize(*img, *refined_img, inst.refined_size, inst.interpolation_refined,
-                         inst.padding_refined, inst.pad_color_refined);
-
-        // create slices
-        for (const auto & slice : inst.slices) {
-            int x = slice.x;
-            int y = slice.y;
-            int w = slice.size.width;
-            int h = slice.size.height;
-
-            clip_image_u8_ptr img_slice(clip_image_u8_init());
-            img_tool::crop(*refined_img, *img_slice, x, y, w, h);
-            output.push_back(std::move(img_slice));
-        }
-
-        if (!overview_first) {
-            output.push_back(std::move(resized_img));
-        }
-
-        return output;
-    }
-
-private:
-    static clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false) {
-        int width  = original_size.width;
-        int height = original_size.height;
-        if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
-            float r = static_cast<float>(width) / height;
-            height  = static_cast<int>(scale_resolution / std::sqrt(r));
-            width   = static_cast<int>(height * r);
-        }
-        clip_image_size res;
-        res.width  = ensure_divide(width,  patch_size);
-        res.height = ensure_divide(height, patch_size);
-        return res;
-    }
-
-    static clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max) {
-        float scale_width  = static_cast<float>(target_max.width)  / orig.width;
-        float scale_height = static_cast<float>(target_max.height) / orig.height;
-        float scale = std::min(scale_width, scale_height);
-        return clip_image_size{
-            static_cast<int>(orig.width  * scale),
-            static_cast<int>(orig.height * scale),
-        };
-    }
-
-    /**
-     * Selects the best resolution from a list of possible resolutions based on the original size.
-     *
-     * For example, when given a list of resolutions:
-     *  - 100x100
-     *  - 200x100
-     *  - 100x200
-     *  - 200x200
-     *
-     * And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution).
-     *
-     * @param original_size The original size of the image
-     * @param possible_resolutions A list of possible resolutions
-     * @return The best fit resolution
-     */
-    static clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) {
-        clip_image_size best_fit;
-        int min_wasted_area = std::numeric_limits<int>::max();
-        int max_effective_resolution = 0;
-
-        for (const clip_image_size & candidate : possible_resolutions) {
-            auto target_size = resize_maintain_aspect_ratio(original_size, candidate);
-            int effective_resolution = std::min(
-                target_size.width * target_size.height,
-                original_size.width * original_size.height);
-            int wasted_area = (candidate.width * candidate.height) - effective_resolution;
-
-            if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_area < min_wasted_area)) {
-                max_effective_resolution = effective_resolution;
-                min_wasted_area = wasted_area;
-                best_fit = candidate;
-            }
-
-            LOG_DBG("%s: candidate: %d x %d, target: %d x %d, wasted: %d, effective: %d\n", __func__, candidate.width, candidate.height, target_size.width, target_size.height, wasted_area, effective_resolution);
-        }
-
-        return best_fit;
-    }
-
-    static int ensure_divide(int length, int patch_size) {
-        return std::max(static_cast<int>(std::round(static_cast<float>(length) / patch_size) * patch_size), patch_size);
-    }
-
-    static clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false) {
-        int width  = original_size.width;
-        int height = original_size.height;
-        int grid_x = grid.width;
-        int grid_y = grid.height;
-
-        int refine_width  = ensure_divide(width, grid_x);
-        int refine_height = ensure_divide(height, grid_y);
-
-        clip_image_size grid_size;
-        grid_size.width  = refine_width  / grid_x;
-        grid_size.height = refine_height / grid_y;
-
-        auto best_grid_size  = get_best_resize(grid_size, scale_resolution, patch_size, allow_upscale);
-        int best_grid_width  = best_grid_size.width;
-        int best_grid_height = best_grid_size.height;
-
-        clip_image_size refine_size;
-        refine_size.width  = best_grid_width  * grid_x;
-        refine_size.height = best_grid_height * grid_y;
-        return refine_size;
-    }
-
-    static clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
-        std::vector<int> candidate_split_grids_nums;
-        for (int i : {multiple - 1, multiple, multiple + 1}) {
-            if (i == 1 || i > max_slice_nums) {
-                continue;
-            }
-            candidate_split_grids_nums.push_back(i);
-        }
-
-        std::vector<clip_image_size> candidate_grids;
-        for (int split_grids_nums : candidate_split_grids_nums) {
-            int m = 1;
-            while (m <= split_grids_nums) {
-                if (split_grids_nums % m == 0) {
-                    candidate_grids.push_back(clip_image_size{m, split_grids_nums / m});
-                }
-                ++m;
-            }
-        }
-
-        clip_image_size best_grid{1, 1};
-        float min_error = std::numeric_limits<float>::infinity();
-        for (const auto& grid : candidate_grids) {
-            float error = std::abs(log_ratio - std::log(1.0 * grid.width / grid.height));
-            if (error < min_error) {
-                best_grid = grid;
-                min_error = error;
-            }
-        }
-        return best_grid;
-    }
-};
-
-// ref: https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py
-// some of the logic is similar to llava_uhd, but with different hyperparameters and some logic is unique (e.g. grid layout)
-struct lfm2_vl_image_processor {
-    // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
-    static constexpr int   min_tiles            = 2;
-    static constexpr int   max_tiles            = 10;
-    static constexpr float max_pixels_tolerance = 2.0f;
-    static constexpr int   tile_size            = 512;
-
-    static llava_uhd::slice_instructions get_slice_instructions(struct clip_ctx * ctx, const clip_image_size & original_size) {
-        llava_uhd::slice_instructions inst;
-        const auto & params  = ctx->model.hparams;
-        const int align_size = params.patch_size * params.n_merge;
-
-        inst.interpolation_overview = img_tool::RESIZE_ALGO_BILINEAR;
-        inst.interpolation_refined  = img_tool::RESIZE_ALGO_BILINEAR;
-        inst.overview_size          = img_tool::calc_size_preserved_ratio(original_size, align_size, params.image_min_pixels, params.image_max_pixels);
-
-        // tile if either dimension exceeds tile_size with tolerance
-        const bool needs_tiling = original_size.width > tile_size * max_pixels_tolerance || original_size.height > tile_size * max_pixels_tolerance;
-
-        if (!needs_tiling) {
-            inst.refined_size = clip_image_size{0, 0};
-            inst.grid_size    = clip_image_size{0, 0};
-            return inst;
-        }
-
-        const clip_image_size grid = get_grid_layout(original_size.height, original_size.width);
-
-        inst.grid_size    = grid;
-        inst.refined_size = clip_image_size{tile_size * grid.width, tile_size * grid.height};
-
-        LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
-                __func__,
-                original_size.width, original_size.height,
-                inst.overview_size.width, inst.overview_size.height,
-                inst.refined_size.width, inst.refined_size.height,
-                grid.width, grid.height);
-
-        for (int row = 0; row < grid.height; row++) {
-            for (int col = 0; col < grid.width; col++) {
-                llava_uhd::slice_coordinates slice;
-                slice.x    = col * tile_size;
-                slice.y    = row * tile_size;
-                slice.size = clip_image_size{tile_size, tile_size};
-                inst.slices.push_back(slice);
-                LOG_DBG("%s: slice %d: x=%d, y=%d, size=%d x %d\n",
-                        __func__, (int)inst.slices.size() - 1,
-                        slice.x, slice.y, slice.size.width, slice.size.height);
-            }
-        }
-
-        return inst;
-    }
-
-private:
-    static clip_image_size find_closest_aspect_ratio(
-            float aspect_ratio,
-            const std::vector<clip_image_size> & target_ratios,
-            int width, int height) {
-        float best_ratio_diff = std::numeric_limits<float>::max();
-        clip_image_size best_ratio = {1, 1};
-        const float area = static_cast<float>(width * height);
-
-        for (const auto & ratio : target_ratios) {
-            const float target_aspect_ratio = static_cast<float>(ratio.width) / ratio.height;
-            const float ratio_diff = std::abs(aspect_ratio - target_aspect_ratio);
-            if (ratio_diff < best_ratio_diff) {
-                best_ratio_diff = ratio_diff;
-                best_ratio = ratio;
-            } else if (ratio_diff == best_ratio_diff) {
-                const float target_area = static_cast<float>(tile_size * tile_size * ratio.width * ratio.height);
-                if (area > 0.5f * target_area) {
-                    best_ratio = ratio;
-                }
-            }
-        }
-        return best_ratio;
-    }
-
-    static std::vector<clip_image_size> get_target_ratios() {
-        std::vector<clip_image_size> ratios;
-        for (int n = min_tiles; n <= max_tiles; n++) {
-            for (int w = 1; w <= n; w++) {
-                for (int h = 1; h <= n; h++) {
-                    if (w * h >= min_tiles && w * h <= max_tiles) {
-                        bool found = false;
-                        for (const auto & r : ratios) {
-                            if (r.width == w && r.height == h) {
-                                found = true;
-                                break;
-                            }
-                        }
-                        if (!found) {
-                            ratios.push_back({w, h});
-                        }
-                    }
-                }
-            }
-        }
-        std::sort(ratios.begin(), ratios.end(), [](const clip_image_size & a, const clip_image_size & b) {
-            return a.width * a.height < b.width * b.height;
-        });
-        return ratios;
-    }
-
-    static clip_image_size get_grid_layout(int height, int width) {
-        const float aspect_ratio = static_cast<float>(width) / height;
-        const auto ratios = get_target_ratios();
-        return find_closest_aspect_ratio(aspect_ratio, ratios, width, height);
-    }
-};
-
-// returns the normalized float tensor for llava-1.5, for spatial_unpad with anyres processing for llava-1.6 it returns the normalized image patch tensors as a vector
-// res_imgs memory is being allocated here, previous allocations will be freed if found
-bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, struct clip_image_f32_batch * res_imgs) {
-    clip_image_size original_size{img->nx, img->ny};
-    auto & params = ctx->model.hparams;
-
-    switch (ctx->proj_type()) {
-        case PROJECTOR_TYPE_MINICPMV:
-            {
-                auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
-                std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
-
-                for (size_t i = 0; i < imgs.size(); ++i) {
-                    // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
-                    clip_image_f32_ptr res(clip_image_f32_init());
-                    normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
-                    res_imgs->entries.push_back(std::move(res));
-                }
-
-                res_imgs->grid_x = inst.grid_size.width;
-                res_imgs->grid_y = inst.grid_size.height;
-            } break;
-
-        case PROJECTOR_TYPE_QWEN2VL:
-        case PROJECTOR_TYPE_QWEN25VL:
-        case PROJECTOR_TYPE_QWEN3VL:
-        case PROJECTOR_TYPE_GLM4V:
-        case PROJECTOR_TYPE_PADDLEOCR:
-            {
-                GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
-                clip_image_u8 resized;
-                const clip_image_size new_size = img_tool::calc_size_preserved_ratio(
-                    original_size,
-                    params.patch_size * 2,
-                    params.image_min_pixels,
-                    params.image_max_pixels);
-                img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
-                // clip_image_save_to_bmp(resized, "preproc.bmp");
-                clip_image_f32_ptr img_f32(clip_image_f32_init());
-                // clip_image_f32_ptr res(clip_image_f32_init());
-                normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
-                // res_imgs->data[0] = *res;
-                res_imgs->entries.push_back(std::move(img_f32));
-            } break;
-        case PROJECTOR_TYPE_YOUTUVL:
-            {
-                const int patch_size = params.patch_size;  // typically 16
-                const int merge_size = params.n_merge;      // typically 2
-                const int align_size = patch_size * merge_size;  // 32
-
-                const int max_num_patches = params.image_max_pixels > 0 ?
-                    params.image_max_pixels / (patch_size * patch_size) : 256;
-
-                // Linear search for optimal scale to fit within max_num_patches
-                float scale = 1.0f;
-                int target_height = original_size.height;
-                int target_width = original_size.width;
-
-                auto get_scaled_image_size = [align_size](float scale, int size) -> int {
-                    float scaled_size = size * scale;
-                    // Round up to nearest multiple of align_size
-                    int aligned = static_cast<int>(std::ceil(scaled_size / align_size)) * align_size;
-                    // Ensure at least one patch
-                    return std::max(align_size, aligned);
-                };
-
-                // Linear search with 0.02 step size
-                while (scale > 0.0f) {
-                    target_height = get_scaled_image_size(scale, original_size.height);
-                    target_width = get_scaled_image_size(scale, original_size.width);
-
-                    int num_patches_h = target_height / patch_size;
-                    int num_patches_w = target_width / patch_size;
-                    int num_patches = num_patches_h * num_patches_w;
-
-                    if (num_patches > max_num_patches) {
-                        scale -= 0.02f;
-                    } else {
-                        break;
-                    }
-                }
-
-                clip_image_size new_size = {target_width, target_height};
-
-                // Resize the image
-                clip_image_u8 resized;
-                img_tool::resize(*img, resized, new_size, img_tool::RESIZE_ALGO_BILINEAR, false);
-
-                // Normalize to float32
-                clip_image_f32_ptr img_f32(clip_image_f32_init());
-                normalize_image_u8_to_f32(resized, *img_f32, params.image_mean, params.image_std);
-
-                // Add to results
-                res_imgs->entries.push_back(std::move(img_f32));
-            } break;
-
-        case PROJECTOR_TYPE_IDEFICS3:
-            {
-                // The refined size has two steps:
-                // 1. Resize w/ aspect-ratio preserving such that the longer side is
-                //      the preprocessor longest size
-                // 2. Resize w/out preserving aspect ratio such that both sides are
-                //      multiples of image_size (always rounding up)
-                //
-                // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
-                const clip_image_size refined_size = img_tool::calc_size_preserved_ratio(
-                    original_size, params.image_size, params.image_longest_edge);
-                // LOG_INF("%s: original size: %d x %d, refined size: %d x %d\n",
-                //         __func__, original_size.width, original_size.height,
-                //         refined_size.width, refined_size.height);
-
-                llava_uhd::slice_instructions instructions;
-                instructions.overview_size = clip_image_size{params.image_size, params.image_size};
-                instructions.refined_size = refined_size;
-                instructions.grid_size = clip_image_size{
-                    static_cast<int>(std::ceil(static_cast<float>(refined_size.width) / params.image_size)),
-                    static_cast<int>(std::ceil(static_cast<float>(refined_size.height) / params.image_size)),
-                };
-                for (int y = 0; y < refined_size.height; y += params.image_size) {
-                    for (int x = 0; x < refined_size.width; x += params.image_size) {
-                        // LOG_INF("%s: adding slice at x=%d, y=%d\n", __func__, x, y);
-                        instructions.slices.push_back(llava_uhd::slice_coordinates{
-                            /* x    */x,
-                            /* y    */y,
-                            /* size */clip_image_size{
-                                std::min(params.image_size, refined_size.width - x),
-                                std::min(params.image_size, refined_size.height - y)
-                            }
-                        });
-                    }
-                }
-                auto imgs = llava_uhd::slice_image(img, instructions);
-
-                // cast and normalize to f32
-                for (size_t i = 0; i < imgs.size(); ++i) {
-                    // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
-                    clip_image_f32_ptr res(clip_image_f32_init());
-                    normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
-                    res_imgs->entries.push_back(std::move(res));
-                }
-
-                res_imgs->grid_x = instructions.grid_size.width;
-                res_imgs->grid_y = instructions.grid_size.height;
-            } break;
-        case PROJECTOR_TYPE_INTERNVL: // support dynamic high-resolution
-            {
-                GGML_ASSERT(!params.image_res_candidates.empty());
-                auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
-                std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst, false);
-
-                for (size_t i = 0; i < imgs.size(); ++i) {
-                    clip_image_f32_ptr res(clip_image_f32_init());
-                    normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
-                    res_imgs->entries.push_back(std::move(res));
-                }
-            } break;
-        case PROJECTOR_TYPE_GLM_EDGE:
-        case PROJECTOR_TYPE_GEMMA3:
-        case PROJECTOR_TYPE_NEMOTRON_V2_VL:
-            {
-                clip_image_u8 resized_image;
-                int sz = params.image_size;
-                img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR);
-                clip_image_f32_ptr img_f32(clip_image_f32_init());
-                //clip_image_save_to_bmp(resized_image, "resized.bmp");
-                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
-                res_imgs->entries.push_back(std::move(img_f32));
-            } break;
-
-        case PROJECTOR_TYPE_GEMMA3NV:
-            {
-                clip_image_u8 resized_image;
-                int sz = params.image_size;
-                img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, false);
-                clip_image_f32_ptr img_f32(clip_image_f32_init());
-                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
-                res_imgs->entries.push_back(std::move(img_f32));
-            } break;
-
-        case PROJECTOR_TYPE_JANUS_PRO:
-            {
-                // Janus Pro preprocessing: pad to square with gray(127), resize to 384x384
-                const std::array<uint8_t, 3> pad_color = {127, 127, 127};
-                clip_image_u8 resized_image;
-                int sz = params.image_size;
-                img_tool::resize(*img, resized_image, {sz, sz}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
-                clip_image_f32_ptr img_f32(clip_image_f32_init());
-                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
-                res_imgs->entries.push_back(std::move(img_f32));
-            } break;
-
-        case PROJECTOR_TYPE_PHI4:
-        case PROJECTOR_TYPE_PIXTRAL:
-            {
-                GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
-                clip_image_u8 resized_image;
-                // the original pixtral model doesn't have n_merge
-                const int cur_merge = params.n_merge == 0 ? 1 : params.n_merge;
-                const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
-                    original_size,
-                    params.patch_size * cur_merge,
-                    params.image_min_pixels,
-                    params.image_max_pixels);
-                img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BILINEAR);
-                clip_image_f32_ptr img_f32(clip_image_f32_init());
-                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
-                res_imgs->entries.push_back(std::move(img_f32));
-            } break;
-        case PROJECTOR_TYPE_LIGHTONOCR:
-            {
-                GGML_ASSERT(params.image_longest_edge > 0);
-                clip_image_u8 resized_image;
-                const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
-                    original_size,
-                    params.patch_size * params.n_merge,
-                    params.image_longest_edge);
-                img_tool::resize(*img, resized_image, target_size, img_tool::RESIZE_ALGO_BICUBIC);
-                clip_image_f32_ptr img_f32(clip_image_f32_init());
-                normalize_image_u8_to_f32(resized_image, *img_f32, params.image_mean, params.image_std);
-                res_imgs->entries.push_back(std::move(img_f32));
-            } break;
-
-        case PROJECTOR_TYPE_LLAMA4:
-            {
-                GGML_ASSERT(!params.image_res_candidates.empty());
-                auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
-                std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
-
-                for (size_t i = 0; i < imgs.size(); ++i) {
-                    clip_image_f32_ptr res(clip_image_f32_init());
-                    normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
-                    res_imgs->entries.push_back(std::move(res));
-                }
-
-                res_imgs->grid_x = inst.grid_size.width;
-                res_imgs->grid_y = inst.grid_size.height;
-            } break;
-
-        case PROJECTOR_TYPE_LFM2:
-            {
-                auto const inst = lfm2_vl_image_processor::get_slice_instructions(ctx, original_size);
-                std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
-
-                for (size_t i = 0; i < imgs.size(); ++i) {
-                    clip_image_f32_ptr res(clip_image_f32_init());
-                    normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
-                    res_imgs->entries.push_back(std::move(res));
-                }
-
-                res_imgs->grid_x = inst.grid_size.width;
-                res_imgs->grid_y = inst.grid_size.height;
-            } break;
-
-        case PROJECTOR_TYPE_KIMIVL:
-            {
-                GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
-                const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
-                    original_size,
-                    params.patch_size * params.n_merge,
-                    params.image_min_pixels,
-                    params.image_max_pixels);
-                const std::array<uint8_t, 3> pad_color = {122, 116, 104};
-
-                clip_image_u8 resized_img;
-                img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
-                clip_image_f32_ptr res(clip_image_f32_init());
-                normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
-                res_imgs->entries.push_back(std::move(res));
-            } break;
-
-        case PROJECTOR_TYPE_KIMIK25:
-            {
-                GGML_ASSERT(params.image_min_pixels > 0 && params.image_max_pixels > 0);
-                const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
-                    original_size,
-                    params.patch_size * params.n_merge,
-                    params.image_min_pixels,
-                    params.image_max_pixels);
-                const std::array<uint8_t, 3> pad_color = {0, 0, 0};
-
-                clip_image_u8 resized_img;
-                img_tool::resize(*img, resized_img, target_size, img_tool::RESIZE_ALGO_BICUBIC, true, pad_color);
-                clip_image_f32_ptr res(clip_image_f32_init());
-                normalize_image_u8_to_f32(resized_img, *res, params.image_mean, params.image_std);
-                res_imgs->entries.push_back(std::move(res));
-            } break;
-
-        case PROJECTOR_TYPE_MLP:
-        case PROJECTOR_TYPE_MLP_NORM:
-        case PROJECTOR_TYPE_LDP:
-        case PROJECTOR_TYPE_LDPV2:
-        case PROJECTOR_TYPE_COGVLM: // TODO @ngxson : is this correct for cogvlm?
-            {
-                // TODO @ngxson : refactor the code below to avoid duplicated logic
-
-                // the logic below is to pad the shorter side to the longer side with a background color: rgb(122, 116, 104)
-                // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
-
-                clip_image_u8_ptr temp(clip_image_u8_init()); // we will keep the input image data here temporarily
-
-                // The model config actually contains all we need to decide on how to preprocess, here we automatically switch to the new llava-1.6 preprocessing
-                if (params.image_res_candidates.empty()) { // pad_to_square
-                    // for llava-1.5, we resize image to a square, and pad the shorter side with a background color
-                    // see https://github.com/haotian-liu/LLaVA/blob/e854a2bf85118c504f6f16bf5c3c7c92f8fa8c6b/llava/conversation.py#L113-L156
-                    const int longer_side = std::max(img->nx, img->ny);
-                    temp->nx = longer_side;
-                    temp->ny = longer_side;
-                    temp->buf.resize(3 * longer_side * longer_side);
-
-                    // background color in RGB from LLaVA (this is the mean rgb color * 255)
-                    const std::array<uint8_t, 3> pad_color = {122, 116, 104};
-
-                    // resize the image to the target_size
-                    img_tool::resize(*img, *temp, clip_image_size{params.image_size, params.image_size}, img_tool::RESIZE_ALGO_BILINEAR, true, pad_color);
-
-                    clip_image_f32_ptr res(clip_image_f32_init());
-                    normalize_image_u8_to_f32(*temp, *res, params.image_mean, params.image_std);
-                    res_imgs->entries.push_back(std::move(res));
-
-                } else {
-                    // "spatial_unpad" with "anyres" processing for llava-1.6
-                    auto const inst = llava_uhd::get_slice_instructions(ctx, original_size);
-                    std::vector<clip_image_u8_ptr> imgs = llava_uhd::slice_image(img, inst);
-
-                    for (size_t i = 0; i < imgs.size(); ++i) {
-                        // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
-                        clip_image_f32_ptr res(clip_image_f32_init());
-                        normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
-                        res_imgs->entries.push_back(std::move(res));
-                    }
-                }
-            } break;
-        case PROJECTOR_TYPE_DEEPSEEKOCR:
-            {
-                const std::vector native_resolutions = {
-                    /*512 tiny , 640 small, */ 1024 /* base */, 1280 /* large */
-                };
-                // original image size
-                const int orig_w = original_size.width;
-                const int orig_h = original_size.height;
-                const int orig_area = orig_h * orig_w;
-                std::array<uint8_t, 3u> color;
-
-                for (int i = 0; i < 3; i++) {
-                    color[i] = static_cast<unsigned char>(params.image_mean[i] * 255.0f);
-                }
-
-                size_t mode_i = 0;
-                int min_diff = orig_area;
-
-                for (size_t i = 0; i < native_resolutions.size(); i++) {
-                    int r = native_resolutions[i];
-                    if (std::abs(orig_area - r * r) < min_diff) {
-                        mode_i = i;
-                        min_diff = std::abs(orig_area - r * r);
-                    }
-                }
-
-                /* Native Resolution (Base/Large) */
-                const int image_size = native_resolutions[mode_i];
-
-                // Resize maintaining an aspect ratio, then pad to square
-                float scale = std::min(
-                    static_cast<float>(image_size) / orig_w,
-                    static_cast<float>(image_size) / orig_h
-                );
-                int new_w = static_cast<int>(orig_w * scale);
-                int new_h = static_cast<int>(orig_h * scale);
-
-                clip_image_u8_ptr scaled_img(clip_image_u8_init());
-                img_tool::resize(*img, *scaled_img, clip_image_size{new_w, new_h},
-                                img_tool::RESIZE_ALGO_BICUBIC_PILLOW, true, color);
-
-                // Use mean color for padding
-                unsigned char pad_r = static_cast<unsigned char>(params.image_mean[0] * 255.0f);
-                unsigned char pad_g = static_cast<unsigned char>(params.image_mean[1] * 255.0f);
-                unsigned char pad_b = static_cast<unsigned char>(params.image_mean[2] * 255.0f);
-
-                // Pad to image_size × image_size (center padding)
-                clip_image_u8_ptr padded_img(clip_image_u8_init());
-                padded_img->nx = image_size;
-                padded_img->ny = image_size;
-                padded_img->buf.resize(image_size * image_size * 3); // black padding
-
-                // Fill with mean color
-                for (int i = 0; i < image_size * image_size; ++i)
-                {
-                    padded_img->buf[i * 3 + 0] = pad_r;
-                    padded_img->buf[i * 3 + 1] = pad_g;
-                    padded_img->buf[i * 3 + 2] = pad_b;
-                }
-
-                // Calculate padding offsets (center the image)
-                int pad_x = (image_size - new_w) / 2;
-                int pad_y = (image_size - new_h) / 2;
-
-                // Copy scaled image into padded canvas
-                for (int y = 0; y < new_h; ++y){
-                    for (int x = 0; x < new_w; ++x){
-                        int src_idx = (y * new_w + x) * 3;
-                        int dst_idx = ((y + pad_y) * image_size + (x + pad_x)) * 3;
-                        padded_img->buf[dst_idx + 0] = scaled_img->buf[src_idx + 0];
-                        padded_img->buf[dst_idx + 1] = scaled_img->buf[src_idx + 1];
-                        padded_img->buf[dst_idx + 2] = scaled_img->buf[src_idx + 2];
-                    }
-                }
-
-                // Normalize and output
-                clip_image_f32_ptr res(clip_image_f32_init());
-                normalize_image_u8_to_f32(*padded_img, *res, params.image_mean, params.image_std);
-                res_imgs->entries.push_back(std::move(res));
-
-                res_imgs->grid_x = 1;
-                res_imgs->grid_y = 1;
-            } break;
-
-        default:
-            LOG_ERR("%s: unsupported projector type %d\n", __func__, ctx->proj_type());
-            return false;
-    }
-
-    return true;
-}
-
  ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx) {
      return ctx->model.image_newline;
  }
diff --git a/tools/mtmd/clip.h b/tools/mtmd/clip.h

index 71b58484d6ba4ed95ae23ba8360328bc37d50ec9..a859b38658d37b5598288e9caddc8795e2efccb7 100644 (file)
--- a/tools/mtmd/clip.h
+++ b/tools/mtmd/clip.h
@@ -97,9 +97,6 @@ struct clip_image_f32 * clip_image_f32_get_img(const struct clip_image_f32_batch
   */
  void clip_build_img_from_pixels(const unsigned char * rgb_pixels, int nx, int ny, struct clip_image_u8 * img);
  
-/** preprocess img and store the result in res_imgs, pad_to_square may be overridden to false depending on model configuration */
-bool clip_image_preprocess(struct clip_ctx * ctx, const struct clip_image_u8 * img, struct clip_image_f32_batch * res_imgs );
-
  struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ctx);
  
  bool clip_image_encode      (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp

new file mode 100644 (file)

index 0000000..b446437
--- /dev/null
+++ b/tools/mtmd/mtmd-image.cpp
@@ -0,0 +1,1166 @@
+#include "mtmd-image.h"
+
+#include <algorithm>
+#include <cmath>
+#include <vector>
+
+//
+// base implementation
+//
+
+void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]) {
+    dst.nx = src.nx; // dst keeps the dimensions of src; only the sample type/value changes
+    dst.ny = src.ny;
+    dst.buf.resize(src.buf.size()); // one float per u8 sample, same indexing as src
+
+    // TODO @ngxson : seems like this could be done more efficiently on cgraph
+    for (size_t i = 0; i < src.buf.size(); ++i) {
+        int c = i % 3; // rgb: channel of sample i, selects mean[c] / std[c]
+        dst.buf[i] = (static_cast<float>(src.buf[i]) / 255.0f - mean[c]) / std[c]; // scale by 1/255, then normalize
+    }
+}
+
+void mtmd_image_preprocessor::img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst) {
+    dst.nx = src.nx; // dst keeps the dimensions of src
+    dst.ny = src.ny;
+    dst.buf.resize(src.buf.size()); // one float per u8 sample, same indexing as src
+
+    for (size_t i = 0; i < src.buf.size(); ++i) {
+        dst.buf[i] = static_cast<float>(src.buf[i]); // plain cast: no 1/255 scaling and no mean/std normalization
+    }
+}
+
+// set of tools to manipulate images
+// in the future, we can have HW acceleration by allowing this struct to access 3rd party lib like imagick or opencv
+struct img_tool {
+    static void resize(
+            const clip_image_u8 & src,
+            clip_image_u8 & dst,
+            const clip_image_size & target_resolution,
+            resize_algo algo,
+            bool add_padding = true, // TODO: define the behavior for add_padding = false (currently: src is resized directly to target_resolution)
+            std::array<uint8_t, 3> pad_color = {0, 0, 0}) { // pad_color is only used when add_padding is true
+        dst.nx = target_resolution.width;
+        dst.ny = target_resolution.height;
+        dst.buf.resize(3 * dst.nx * dst.ny);
+
+        if (dst.nx == src.nx && dst.ny == src.ny) {
+            // no resize needed, simple copy
+            dst.buf = src.buf;
+            return;
+        }
+
+        if (!add_padding) {
+            // direct resize: the selected algorithm writes straight into dst at the target size
+            switch (algo) {
+                case RESIZE_ALGO_BILINEAR:
+                    resize_bilinear(src, dst, target_resolution.width, target_resolution.height);
+                    break;
+                case RESIZE_ALGO_BICUBIC:
+                    resize_bicubic(src, dst, target_resolution.width, target_resolution.height);
+                    break;
+                case RESIZE_ALGO_BICUBIC_PILLOW:
+                    resize_bicubic_pillow(src, dst, target_resolution.width, target_resolution.height);
+                    break;
+                default:
+                    throw std::runtime_error("Unsupported resize algorithm");
+            }
+        } else {
+            // resize with padding: scale src uniformly into a temporary, then center it on a pad_color canvas
+            clip_image_u8 resized_image;
+            float scale_w = static_cast<float>(target_resolution.width) / src.nx;
+            float scale_h = static_cast<float>(target_resolution.height) / src.ny;
+            float scale = std::min(scale_w, scale_h); // the smaller ratio, so both scaled sides fit inside the target
+            int new_width  = std::min(static_cast<int>(std::ceil(src.nx * scale)), target_resolution.width);  // clamp: never larger than the target
+            int new_height = std::min(static_cast<int>(std::ceil(src.ny * scale)), target_resolution.height);
+
+            switch (algo) {
+                case RESIZE_ALGO_BILINEAR:
+                    resize_bilinear(src, resized_image, new_width, new_height);
+                    break;
+                case RESIZE_ALGO_BICUBIC:
+                    resize_bicubic(src, resized_image, new_width, new_height);
+                    break;
+                case RESIZE_ALGO_BICUBIC_PILLOW:
+                    resize_bicubic_pillow(src, resized_image, new_width, new_height);
+                    break;
+                default:
+                    throw std::runtime_error("Unsupported resize algorithm");
+            }
+
+            // fill dst with pad_color
+            fill(dst, pad_color);
+
+            int offset_x = (target_resolution.width  - new_width)  / 2; // split the leftover space evenly: centers the image
+            int offset_y = (target_resolution.height - new_height) / 2;
+
+            composite(dst, resized_image, offset_x, offset_y);
+        }
+    }
+
+    static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) { // copy the w x h rectangle at (x, y) of image into dst
+        dst.nx = w;
+        dst.ny = h;
+        dst.buf.resize(3 * w * h);
+
+        for (int i = 0; i < h; ++i) {
+            for (int j = 0; j < w; ++j) {
+                int src_idx = 3 * ((y + i)*image.nx + (x + j)); // NOTE: not bounds-checked against image.nx / image.ny here
+                int dst_idx = 3 * (i*w + j);
+                dst.buf[dst_idx]     = image.buf[src_idx];
+                dst.buf[dst_idx + 1] = image.buf[src_idx + 1];
+                dst.buf[dst_idx + 2] = image.buf[src_idx + 2];
+            }
+        }
+    }
+
+    // calculate the size of the **resized** image, while preserving the aspect ratio
+    // the longer side is scaled to longest_edge (up or down), then both sides are rounded UP to a multiple of align_size
+    // returns {0, 0} if the input width/height or longest_edge is not positive
+    static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int longest_edge) {
+        GGML_ASSERT(align_size > 0);
+        if (inp_size.width <= 0 || inp_size.height <= 0 || longest_edge <= 0) {
+            return {0, 0};
+        }
+
+        float scale = std::min(static_cast<float>(longest_edge) / inp_size.width,
+                               static_cast<float>(longest_edge) / inp_size.height); // the smaller ratio belongs to the longer side
+
+        float target_width_f  = static_cast<float>(inp_size.width)  * scale;
+        float target_height_f = static_cast<float>(inp_size.height) * scale;
+
+        auto ceil_by_factor = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
+        int aligned_width  = ceil_by_factor(target_width_f); // rounding up can make the result exceed longest_edge
+        int aligned_height = ceil_by_factor(target_height_f);
+
+        return {aligned_width, aligned_height};
+    }
+
+    // calculate the size of the **resized** image, while preserving the aspect ratio
+    // the calculated size will have min_pixels <= W*H <= max_pixels
+    // this is referred as "smart_resize" in transformers code
+    static clip_image_size calc_size_preserved_ratio(const clip_image_size & inp_size, const int align_size, const int min_pixels, const int max_pixels) {
+        GGML_ASSERT(align_size > 0);
+        const int width  = inp_size.width;
+        const int height = inp_size.height;
+
+        auto round_by_factor = [f = align_size](float x) { return static_cast<int>(std::round(x / static_cast<float>(f))) * f; };
+        auto ceil_by_factor  = [f = align_size](float x) { return static_cast<int>(std::ceil(x / static_cast<float>(f))) * f; };
+        auto floor_by_factor = [f = align_size](float x) { return static_cast<int>(std::floor(x / static_cast<float>(f))) * f; };
+
+        // always align up first
+        int h_bar = std::max(align_size, round_by_factor(height));
+        int w_bar = std::max(align_size, round_by_factor(width));
+
+        if (h_bar * w_bar > max_pixels) {
+            const auto beta = std::sqrt(static_cast<float>(height * width) / max_pixels);
+            h_bar = std::max(align_size, floor_by_factor(height / beta));
+            w_bar = std::max(align_size, floor_by_factor(width  / beta));
+        } else if (h_bar * w_bar < min_pixels) {
+            const auto beta = std::sqrt(static_cast<float>(min_pixels) / (height * width));
+            h_bar = ceil_by_factor(height * beta);
+            w_bar = ceil_by_factor(width * beta);
+        }
+
+        return {w_bar, h_bar};
+    }
+
+    // draw src image into dst image at offset (offset_x, offset_y); dst keeps its size, src pixels overwrite dst pixels
+    static void composite(clip_image_u8 & dst, const clip_image_u8 & src, int offset_x, int offset_y) {
+        for (int y = 0; y < src.ny; ++y) {
+            for (int x = 0; x < src.nx; ++x) {
+                int dx = x + offset_x; // destination coordinates of source pixel (x, y)
+                int dy = y + offset_y;
+                // skip pixels that would be out of bounds in the destination
+                if (dx < 0 || dy < 0 || dx >= dst.nx || dy >= dst.ny) {
+                    continue;
+                }
+                size_t dst_idx = 3 * (static_cast<size_t>(dy) * dst.nx + static_cast<size_t>(dx)); // 3 bytes per pixel, rows of dst.nx pixels
+                size_t src_idx = 3 * (static_cast<size_t>(y) * src.nx + static_cast<size_t>(x));
+                dst.buf[dst_idx + 0] = src.buf[src_idx + 0];
+                dst.buf[dst_idx + 1] = src.buf[src_idx + 1];
+                dst.buf[dst_idx + 2] = src.buf[src_idx + 2];
+            }
+        }
+    }
+
+    // fill the image with a solid color (img.buf must already be sized; its dimensions are not changed here)
+    static void fill(clip_image_u8 & img, const std::array<uint8_t, 3> & color) {
+        for (size_t i = 0; i < img.buf.size(); i += 3) { // steps one pixel (3 bytes) at a time; assumes buf.size() is a multiple of 3
+            img.buf[i]     = color[0];
+            img.buf[i + 1] = color[1];
+            img.buf[i + 2] = color[2];
+        }
+    }
+
+private:
+    // Bilinear resize function
+    static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
+        dst.nx = target_width;
+        dst.ny = target_height;
+        dst.buf.resize(3 * target_width * target_height);
+
+        float x_ratio = static_cast<float>(src.nx - 1) / target_width;
+        float y_ratio = static_cast<float>(src.ny - 1) / target_height;
+
+        for (int y = 0; y < target_height; y++) {
+            for (int x = 0; x < target_width; x++) {
+                float px = x_ratio * x;
+                float py = y_ratio * y;
+                int x_floor = static_cast<int>(px);
+                int y_floor = static_cast<int>(py);
+                float x_lerp = px - x_floor;
+                float y_lerp = py - y_floor;
+
+                for (int c = 0; c < 3; c++) {
+                    float top = lerp(
+                        static_cast<float>(src.buf[3 * (y_floor * src.nx + x_floor) + c]),
+                        static_cast<float>(src.buf[3 * (y_floor * src.nx + (x_floor + 1)) + c]),
+                        x_lerp
+                    );
+                    float bottom = lerp(
+                        static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + x_floor) + c]),
+                        static_cast<float>(src.buf[3 * ((y_floor + 1) * src.nx + (x_floor + 1)) + c]),
+                        x_lerp
+                    );
+                    dst.buf[3 * (y * target_width + x) + c] = static_cast<uint8_t>(lerp(top, bottom, y_lerp));
+                }
+            }
+        }
+    }
+
+    // Bicubic resize function
+    // part of image will be cropped if the aspect ratio is different
+    static bool resize_bicubic(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
+        const int nx = img.nx;
+        const int ny = img.ny;
+
+        dst.nx = target_width;
+        dst.ny = target_height;
+        dst.buf.resize(3 * target_width * target_height);
+
+        float Cc;
+        float C[5] = {};
+        float d0, d2, d3, a0, a1, a2, a3;
+        int i, j, k, jj;
+        int x, y;
+        float dx, dy;
+        float tx, ty;
+
+        tx = (float)nx / (float)target_width;
+        ty = (float)ny / (float)target_height;
+
+        // Bicubic interpolation; adapted from ViT.cpp, inspired from :
+        //    -> https://github.com/yglukhov/bicubic-interpolation-image-processing/blob/master/libimage.c#L36
+        //    -> https://en.wikipedia.org/wiki/Bicubic_interpolation
+
+        for (i = 0; i < target_height; i++) {
+            for (j = 0; j < target_width; j++) {
+                x = (int)(tx * j);
+                y = (int)(ty * i);
+
+                dx = tx * j - x;
+                dy = ty * i - y;
+
+                for (k = 0; k < 3; k++) {
+                    for (jj = 0; jj <= 3; jj++) {
+                        d0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x - 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+                        d2 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 1, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+                        d3 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x + 2, 0, nx - 1)) * 3 + k] - img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+                        a0 = img.buf[(clip(y - 1 + jj, 0, ny - 1) * nx + clip(x, 0, nx - 1)) * 3 + k];
+
+                        a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
+                        a2 =  1.0 / 2 * d0 +      1.0 / 2 * d2;
+                        a3 = -1.0 / 6 * d0 -      1.0 / 2 * d2 + 1.0 / 6 * d3;
+
+                        C[jj] = a0 + a1 * dx + a2 * dx * dx + a3 * dx * dx * dx;
+
+                        d0 = C[0] - C[1];
+                        d2 = C[2] - C[1];
+                        d3 = C[3] - C[1];
+                        a0 = C[1];
+                        a1 = -1.0 / 3 * d0 + d2 - 1.0 / 6 * d3;
+                        a2 =  1.0 / 2 * d0 +      1.0 / 2 * d2;
+                        a3 = -1.0 / 6 * d0 -      1.0 / 2 * d2 + 1.0 / 6 * d3;
+                        Cc = a0 + a1 * dy + a2 * dy * dy + a3 * dy * dy * dy;
+
+                        const uint8_t Cc2 = std::min(std::max(std::round(Cc), 0.0f), 255.0f);
+                        dst.buf[(i * target_width + j) * 3 + k] = float(Cc2);
+                    }
+                }
+            }
+        }
+
+        return true;
+    }
+
+    // Bicubic resize function using Pillow's ImagingResample algorithm
+    // Adapted from https://github.com/python-pillow/Pillow/blob/main/src/libImaging/Resample.c
+    //
+    // Key Difference with resize_bicubic:
+    // 1. Uses separable filtering: horizontal pass followed by vertical pass
+    // 2. Pre-computes normalized filter coefficients for each output pixel
+    // 3. Applies convolution using fixed-point integer arithmetic for performance
+    //
+    // Parameters:
+    //   img           - source image; its buffer is indexed as 3 interleaved bytes per pixel, row-major
+    //   dst           - [OUTPUT] receives nx/ny = target size and the resampled buffer
+    //   target_width  - output width in pixels
+    //   target_height - output height in pixels
+    //
+    // Returns: always true
+    //
+    // A dimension that already matches its target is not resampled; if both match,
+    // the source buffer is copied to dst unchanged.
+    static bool resize_bicubic_pillow(const clip_image_u8 & img, clip_image_u8 & dst, int target_width, int target_height) {
+        // Fixed-point precision: 22 bits = 32 (int32_t) - 8 (uint8_t pixels) - 2 (headroom for accumulation)
+        // This allows encoding fractional weights as integers: weight * 2^22
+        const int PRECISION_BITS = 32 - 8 - 2;
+
+        // Bicubic filter function with a = -0.5 (Note that GGML/PyTorch takes a = -0.75)
+        // Returns filter weight for distance x from pixel center
+        // Support: [-2, 2], meaning the filter influences pixels within 2 units of distance
+        auto bicubic_filter = [](double x) -> double {
+            constexpr double a = -0.5;
+            // The kernel is symmetric, so only |x| matters
+            if (x < 0.0) {
+                x = -x;
+            }
+            if (x < 1.0) {
+                return ((a + 2.0) * x - (a + 3.0)) * x * x + 1;
+            }
+            if (x < 2.0) {
+                return (((x - 5) * x + 8) * x - 4) * a;
+            }
+            return 0.0;  // Zero outside [-2, 2]
+        };
+
+        // Filter support radius: bicubic extends 2 pixels in each direction
+        constexpr double filter_support = 2.0;
+
+        // Clipping function for 8-bit values
+        auto clip8 = [](int val) -> uint8_t {
+            if (val < 0) return 0;
+            if (val > 255) return 255;
+            return static_cast<uint8_t>(val);
+        };
+
+        // Precompute filter coefficients for ONE dimension (horizontal or vertical)
+        //
+        // Parameters:
+        //   inSize  - Number of pixels in input dimension (e.g., src_width or src_height)
+        //   outSize - Number of pixels in output dimension (e.g., target_width or target_height)
+        //   bounds  - [OUTPUT] Array of size outSize*2 storing input pixel ranges:
+        //             bounds[xx*2+0] = first input pixel index for output pixel xx (xmin)
+        //             bounds[xx*2+1] = number of input pixels for output pixel xx (xcnt)
+        //   weights - [OUTPUT] Array of size outSize*ksize storing fixed-point filter weights:
+        //             weights[xx*ksize + x] = weight for input pixel (xmin + x) contributing to output pixel xx
+        //
+        // Returns: kernel size (ksize) - number of input pixels that contribute to each output pixel
+        auto precompute_weights = [&](int inSize, int outSize,
+                                     std::vector<int> & bounds, std::vector<int32_t> & weights) -> int {
+            double support, scale, filterscale;
+            double center, ww, ss;
+            int xx, x, ksize, xmin, xmax, xcnt;
+
+            // Calculate scaling factor: ratio of input range to output size
+            filterscale = scale = (double)inSize / outSize;
+            // For upsampling (scale < 1), keep filterscale = 1 to maintain filter sharpness
+            // For downsampling (scale > 1), widen filter to prevent aliasing
+            if (filterscale < 1.0) {
+                filterscale = 1.0;
+            }
+
+            // Determine filter support radius and kernel size
+            support = filter_support * filterscale;  // Widen filter when downsampling
+            ksize = static_cast<int>(std::ceil(support)) * 2 + 1;  // Total pixels in kernel
+
+            std::vector<double> pre_weights(outSize * ksize);  // Temporary weights
+            bounds.resize(outSize * 2);
+
+            // For each output pixel, compute its filter coefficients
+            for (xx = 0; xx < outSize; xx++) {
+                // Calculate the center position in input space (pixel-center convention: +0.5)
+                center = (xx + 0.5) * scale;
+                ww = 0.0;  // Sum of weights for normalization
+                ss = 1.0 / filterscale;  // Scale factor for filter function
+
+                // Determine the range of input pixels that contribute to this output pixel
+                // (+0.5 followed by truncation, then clamped to the valid index range [0, inSize])
+                xmin = static_cast<int>(center - support + 0.5);
+                if (xmin < 0) {
+                    xmin = 0;
+                }
+
+                xmax = static_cast<int>(center + support + 0.5);
+                if (xmax > inSize) {
+                    xmax = inSize;
+                }
+
+                // xmax is exclusive, so this is the number of contributing input pixels
+                xcnt = xmax - xmin;
+
+                // Compute filter weights for each contributing input pixel
+                for (x = 0; x < xcnt; x++) {
+                    // Distance from input pixel center to output pixel center in input space
+                    double w = bicubic_filter((x + xmin - center + 0.5) * ss);
+                    pre_weights[xx * ksize + x] = w;
+                    ww += w;  // Accumulate for normalization
+                }
+
+                // Normalize weights to sum to 1.0 (preserves brightness)
+                // (skipped when the sum is exactly zero to avoid dividing by zero)
+                for (x = 0; x < xcnt; x++) {
+                    if (ww != 0.0) {
+                        pre_weights[xx * ksize + x] /= ww;
+                    }
+                }
+
+                // Zero-pad remaining kernel positions
+                // (x continues from xcnt, where the loop above left it)
+                for (; x < ksize; x++) {
+                    pre_weights[xx * ksize + x] = 0;
+                }
+
+                // Store input pixel range for this output pixel
+                bounds[xx * 2 + 0] = xmin;
+                bounds[xx * 2 + 1] = xcnt;
+            }
+
+            // Convert floating-point coefficients to fixed-point integers
+            // Formula: int32 = round(float * 2^PRECISION_BITS)
+            // (the +/-0.5 bias makes the truncating cast round half away from zero)
+            weights.resize(outSize * ksize);
+            for (int i = 0; i < outSize * ksize; i++) {
+                if (pre_weights[i] < 0) {
+                    weights[i] = static_cast<int32_t>(-0.5 + pre_weights[i] * (1 << PRECISION_BITS));
+                } else {
+                    weights[i] = static_cast<int32_t>(0.5 + pre_weights[i] * (1 << PRECISION_BITS));
+                }
+            }
+
+            return ksize;
+        };
+
+        // Horizontal resampling pass
+        // Resizes width from imIn.nx to imOut.nx, preserving height
+        // Precondition: imOut.nx must already be set by the caller; imOut.ny and the buffer are set here
+        auto resample_horizontal = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut,
+                                       int ksize, const std::vector<int> & bounds, const std::vector<int32_t> & weights) {
+            imOut.ny = imIn.ny;
+            imOut.buf.resize(3 * imOut.nx * imOut.ny);
+
+            // Process each row independently
+            for (int yy = 0; yy < imOut.ny; yy++) {
+                // For each output pixel in this row
+                for (int xx = 0; xx < imOut.nx; xx++) {
+                    // Get the range of input pixels and filter coefficients
+                    int xmin = bounds[xx * 2 + 0];  // First input pixel index
+                    int xcnt = bounds[xx * 2 + 1];  // Number of input pixels
+
+                    // Initialize accumulators for RGB channels with rounding bias (0.5 in fixed-point)
+                    int32_t ss0 = 1 << (PRECISION_BITS - 1);
+                    int32_t ss1 = 1 << (PRECISION_BITS - 1);
+                    int32_t ss2 = 1 << (PRECISION_BITS - 1);
+
+                    // Convolve: sum weighted input pixels
+                    for (int x = 0; x < xcnt; x++) {
+                        int src_idx = ((yy * imIn.nx) + (x + xmin)) * 3;
+                        ss0 += static_cast<uint8_t>(imIn.buf[src_idx + 0]) * weights[xx * ksize + x];  // R channel
+                        ss1 += static_cast<uint8_t>(imIn.buf[src_idx + 1]) * weights[xx * ksize + x];  // G channel
+                        ss2 += static_cast<uint8_t>(imIn.buf[src_idx + 2]) * weights[xx * ksize + x];  // B channel
+                    }
+
+                    // Convert back from fixed-point (divide by 2^PRECISION_BITS) and clamp to [0,255]
+                    int dst_idx = (yy * imOut.nx + xx) * 3;
+                    imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS);
+                    imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS);
+                    imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS);
+                }
+            }
+        };
+
+        // Vertical resampling pass
+        // Resizes height from imIn.ny to imOut.ny, preserving width
+        // Precondition: imOut.ny must already be set by the caller; imOut.nx and the buffer are set here
+        auto resample_vertical = [&](const clip_image_u8 & imIn, clip_image_u8 & imOut,
+                                     int ksize, const std::vector<int> & bounds, const std::vector<int32_t> & weight) {
+            imOut.nx = imIn.nx;
+            imOut.buf.resize(3 * imOut.nx * imOut.ny);
+
+            // For each output row
+            for (int yy = 0; yy < imOut.ny; yy++) {
+                // Get the range of input rows and filter coefficients
+                int ymin = bounds[yy * 2 + 0];  // First input row index
+                int ycnt = bounds[yy * 2 + 1];  // Number of input rows
+
+                // Process each column in this output row
+                for (int xx = 0; xx < imOut.nx; xx++) {
+                    // Initialize accumulators for RGB channels with rounding bias
+                    int32_t ss0 = 1 << (PRECISION_BITS - 1);
+                    int32_t ss1 = 1 << (PRECISION_BITS - 1);
+                    int32_t ss2 = 1 << (PRECISION_BITS - 1);
+
+                    // Convolve: sum weighted input pixels vertically
+                    for (int y = 0; y < ycnt; y++) {
+                        int src_idx = ((y + ymin) * imIn.nx + xx) * 3;
+                        ss0 += static_cast<uint8_t>(imIn.buf[src_idx + 0]) * weight[yy * ksize + y];  // R channel
+                        ss1 += static_cast<uint8_t>(imIn.buf[src_idx + 1]) * weight[yy * ksize + y];  // G channel
+                        ss2 += static_cast<uint8_t>(imIn.buf[src_idx + 2]) * weight[yy * ksize + y];  // B channel
+                    }
+
+                    // Convert back from fixed-point and clamp to [0,255]
+                    int dst_idx = (yy * imOut.nx + xx) * 3;
+                    imOut.buf[dst_idx + 0] = clip8(ss0 >> PRECISION_BITS);
+                    imOut.buf[dst_idx + 1] = clip8(ss1 >> PRECISION_BITS);
+                    imOut.buf[dst_idx + 2] = clip8(ss2 >> PRECISION_BITS);
+                }
+            }
+        };
+
+        // Main resampling logic using separable two-pass approach
+        const int src_width = img.nx;
+        const int src_height = img.ny;
+
+        dst.nx = target_width;
+        dst.ny = target_height;
+
+        // A pass is only run for a dimension whose size actually changes
+        bool need_horizontal = (target_width != src_width);
+        bool need_vertical = (target_height != src_height);
+
+        // Precompute filter coefficients for both dimensions
+        std::vector<int> bounds_horiz, bounds_vert;
+        std::vector<int32_t> weights_horiz, weights_vert;
+        int ksize_horiz = 0, ksize_vert = 0;
+
+        if (need_horizontal) {
+            ksize_horiz = precompute_weights(src_width, target_width, bounds_horiz, weights_horiz);
+        }
+
+        if (need_vertical) {
+            ksize_vert = precompute_weights(src_height, target_height, bounds_vert, weights_vert);
+        }
+
+        // Perform two-pass resampling
+        if (need_horizontal && need_vertical) {
+            // Both horizontal and vertical
+            // temp holds the intermediate image: target width, source height
+            clip_image_u8 temp;
+            temp.nx = target_width;
+            resample_horizontal(img, temp, ksize_horiz, bounds_horiz, weights_horiz);
+            resample_vertical(temp, dst, ksize_vert, bounds_vert, weights_vert);
+        } else if (need_horizontal) {
+            // Only horizontal
+            resample_horizontal(img, dst, ksize_horiz, bounds_horiz, weights_horiz);
+        } else if (need_vertical) {
+            // Only vertical
+            resample_vertical(img, dst, ksize_vert, bounds_vert, weights_vert);
+        } else {
+            // No resizing needed - direct copy
+            dst.buf = img.buf;
+        }
+
+        return true;
+    }
+
+    // Clamp x into [lower, upper]: the upper bound is applied first, then the lower bound
+    static inline int clip(int x, int lower, int upper) {
+        const int capped = std::min(x, upper);
+        return std::max(lower, capped);
+    }
+
+    // Linear blend from s (at t = 0) toward e (at t = 1)
+    static inline float lerp(float s, float e, float t) {
+        const float span = e - s;
+        return s + span * t;
+    }
+};
+
+
+//
+// mtmd_image_preprocessor_llava_uhd
+//
+
+bool mtmd_image_preprocessor_llava_uhd::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+    const clip_image_size original_size{img.nx, img.ny};
+    auto const inst = get_slice_instructions(original_size);
+    std::vector<clip_image_u8_ptr> imgs = slice_image(img, inst);
+
+    for (size_t i = 0; i < imgs.size(); ++i) {
+        // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
+        clip_image_f32_ptr res(clip_image_f32_init());
+        img_u8_to_f32(*imgs[i], *res, hparams.image_mean, hparams.image_std);
+        output.entries.push_back(std::move(res));
+    }
+
+    output.grid_x = inst.grid_size.width;
+    output.grid_y = inst.grid_size.height;
+    return true;
+}
+
+mtmd_image_preprocessor_llava_uhd::slice_instructions mtmd_image_preprocessor_llava_uhd::get_slice_instructions(const clip_image_size & original_size) {
+    mtmd_image_preprocessor_llava_uhd::slice_instructions res;
+    const int patch_size      = hparams.patch_size;
+    const int slice_size      = hparams.image_size;
+    const int original_width  = original_size.width;
+    const int original_height = original_size.height;
+
+    const bool has_slices    = original_size.width > slice_size || original_size.height > slice_size;
+    const bool has_pinpoints = !hparams.image_res_candidates.empty();
+
+    if (!has_slices) {
+        // skip slicing logic
+        res.overview_size = clip_image_size{slice_size, slice_size};
+        res.refined_size  = clip_image_size{0, 0};
+        res.grid_size     = clip_image_size{0, 0};
+
+        return res;
+    }
+
+    if (has_pinpoints) {
+        // has pinpoints, use them to calculate the grid size (e.g. llava-1.6)
+        auto refine_size = select_best_resolution(
+            original_size,
+            hparams.image_res_candidates);
+        res.overview_size         = clip_image_size{slice_size, slice_size};
+        res.refined_size          = refine_size;
+        res.grid_size             = clip_image_size{0, 0};
+
+        LOG_DBG("%s: using pinpoints for slicing\n", __func__);
+        LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d\n",
+                __func__, original_width, original_height,
+                res.overview_size.width, res.overview_size.height,
+                res.refined_size.width,  res.refined_size.height);
+
+        for (int y = 0; y < refine_size.height; y += slice_size) {
+            for (int x = 0; x < refine_size.width; x += slice_size) {
+                slice_coordinates slice;
+                slice.x = x;
+                slice.y = y;
+                slice.size.width  = std::min(slice_size, refine_size.width  - x);
+                slice.size.height = std::min(slice_size, refine_size.height - y);
+                res.slices.push_back(slice);
+                LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
+                        __func__, (int)res.slices.size() - 1,
+                        slice.x, slice.y, slice.size.width, slice.size.height);
+            }
+        }
+
+        res.grid_size.height = refine_size.height / slice_size;
+        res.grid_size.width  = refine_size.width  / slice_size;
+        LOG_DBG("%s: grid size: %d x %d\n", __func__, res.grid_size.width, res.grid_size.height);
+
+        return res;
+    }
+
+    // no pinpoints, dynamically calculate the grid size (e.g. minicpmv)
+
+    auto best_size    = get_best_resize(original_size, slice_size, patch_size, !has_slices);
+    res.overview_size = best_size;
+
+    {
+        const int max_slice_nums = 9; // TODO: this is only used by minicpmv, maybe remove it
+        const float log_ratio = log((float)original_width / original_height);
+        const float ratio = (float)original_width * original_height / (slice_size * slice_size);
+        const int multiple = fmin(ceil(ratio), max_slice_nums);
+
+        auto best_grid   = get_best_grid(max_slice_nums, multiple, log_ratio);
+        auto refine_size = get_refine_size(original_size, best_grid, slice_size, patch_size, true);
+        res.grid_size    = best_grid;
+        res.refined_size = refine_size;
+
+        LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
+                __func__, original_width, original_height,
+                res.overview_size.width, res.overview_size.height,
+                res.refined_size.width, res.refined_size.height,
+                res.grid_size.width, res.grid_size.height);
+
+        int width  = refine_size.width;
+        int height = refine_size.height;
+        int grid_x = int(width  / best_grid.width);
+        int grid_y = int(height / best_grid.height);
+        for (int patches_y = 0,                    ic = 0;
+                patches_y < refine_size.height && ic < best_grid.height;
+                patches_y += grid_y,              ic += 1) {
+            for (int patches_x = 0,                   jc = 0;
+                    patches_x < refine_size.width && jc < best_grid.width;
+                    patches_x += grid_x,             jc += 1) {
+                slice_coordinates slice;
+                slice.x = patches_x;
+                slice.y = patches_y;
+                slice.size.width  = grid_x;
+                slice.size.height = grid_y;
+                res.slices.push_back(slice);
+                LOG_DBG("%s: slice %d: x=%d, y=%d, size=%dx%d\n",
+                        __func__, (int)res.slices.size() - 1,
+                        slice.x, slice.y, slice.size.width, slice.size.height);
+            }
+        }
+    }
+
+    return res;
+}
+
+std::vector<clip_image_u8_ptr> mtmd_image_preprocessor_llava_uhd::slice_image(const clip_image_u8 & img, const mtmd_image_preprocessor_llava_uhd::slice_instructions & inst, bool overview_first) {
+    std::vector<clip_image_u8_ptr> output;
+
+    // resize to overview size
+    clip_image_u8_ptr resized_img(clip_image_u8_init());
+    img_tool::resize(img, *resized_img, inst.overview_size, hparams.image_resize_algo_ov,
+                        hparams.image_pad_ov, hparams.image_pad_color_ov);
+    if (overview_first) {
+        output.push_back(std::move(resized_img));
+    }
+
+    if (inst.slices.empty()) {
+        // no slices, just return the resized image
+        if (!overview_first) {
+            output.push_back(std::move(resized_img));
+        }
+        return output;
+    }
+
+    // resize to refined size
+    clip_image_u8_ptr refined_img(clip_image_u8_init());
+    img_tool::resize(img, *refined_img, inst.refined_size, hparams.image_resize_algo_rf,
+                        hparams.image_pad_rf, hparams.image_pad_color_rf);
+
+    // create slices
+    for (const auto & slice : inst.slices) {
+        int x = slice.x;
+        int y = slice.y;
+        int w = slice.size.width;
+        int h = slice.size.height;
+
+        clip_image_u8_ptr img_slice(clip_image_u8_init());
+        img_tool::crop(*refined_img, *img_slice, x, y, w, h);
+        output.push_back(std::move(img_slice));
+    }
+
+    if (!overview_first) {
+        output.push_back(std::move(resized_img));
+    }
+
+    return output;
+}
+
+clip_image_size mtmd_image_preprocessor_llava_uhd::get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale) {
+    int width  = original_size.width;
+    int height = original_size.height;
+    if ((width * height > scale_resolution * scale_resolution) || allow_upscale) {
+        float r = static_cast<float>(width) / height;
+        height  = static_cast<int>(scale_resolution / std::sqrt(r));
+        width   = static_cast<int>(height * r);
+    }
+    clip_image_size res;
+    res.width  = ensure_divide(width,  patch_size);
+    res.height = ensure_divide(height, patch_size);
+    return res;
+}
+
+clip_image_size mtmd_image_preprocessor_llava_uhd::resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max) {
+    float scale_width  = static_cast<float>(target_max.width)  / orig.width;
+    float scale_height = static_cast<float>(target_max.height) / orig.height;
+    float scale = std::min(scale_width, scale_height);
+    return clip_image_size{
+        static_cast<int>(orig.width  * scale),
+        static_cast<int>(orig.height * scale),
+    };
+}
+
+clip_image_size mtmd_image_preprocessor_llava_uhd::select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions) {
+    clip_image_size best_fit;
+    int min_wasted_area = std::numeric_limits<int>::max();
+    int max_effective_resolution = 0;
+
+    for (const clip_image_size & candidate : possible_resolutions) {
+        auto target_size = resize_maintain_aspect_ratio(original_size, candidate);
+        int effective_resolution = std::min(
+            target_size.width * target_size.height,
+            original_size.width * original_size.height);
+        int wasted_area = (candidate.width * candidate.height) - effective_resolution;
+
+        if (effective_resolution > max_effective_resolution || (effective_resolution == max_effective_resolution && wasted_area < min_wasted_area)) {
+            max_effective_resolution = effective_resolution;
+            min_wasted_area = wasted_area;
+            best_fit = candidate;
+        }
+
+        LOG_DBG("%s: candidate: %d x %d, target: %d x %d, wasted: %d, effective: %d\n", __func__, candidate.width, candidate.height, target_size.width, target_size.height, wasted_area, effective_resolution);
+    }
+
+    return best_fit;
+}
+
+// Rounds length to the nearest multiple of patch_size, never returning less than patch_size.
+int mtmd_image_preprocessor_llava_uhd::ensure_divide(int length, int patch_size) {
+    const float n_patches = std::round(static_cast<float>(length) / patch_size);
+    const int   snapped   = static_cast<int>(n_patches * patch_size);
+    return std::max(snapped, patch_size);
+}
+
+clip_image_size mtmd_image_preprocessor_llava_uhd::get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale) {
+    int width  = original_size.width;
+    int height = original_size.height;
+    int grid_x = grid.width;
+    int grid_y = grid.height;
+
+    int refine_width  = ensure_divide(width, grid_x);
+    int refine_height = ensure_divide(height, grid_y);
+
+    clip_image_size grid_size;
+    grid_size.width  = refine_width  / grid_x;
+    grid_size.height = refine_height / grid_y;
+
+    auto best_grid_size  = get_best_resize(grid_size, scale_resolution, patch_size, allow_upscale);
+    int best_grid_width  = best_grid_size.width;
+    int best_grid_height = best_grid_size.height;
+
+    clip_image_size refine_size;
+    refine_size.width  = best_grid_width  * grid_x;
+    refine_size.height = best_grid_height * grid_y;
+    return refine_size;
+}
+
+clip_image_size mtmd_image_preprocessor_llava_uhd::get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio) {
+    std::vector<int> candidate_split_grids_nums;
+    for (int i : {multiple - 1, multiple, multiple + 1}) {
+        if (i == 1 || i > max_slice_nums) {
+            continue;
+        }
+        candidate_split_grids_nums.push_back(i);
+    }
+
+    std::vector<clip_image_size> candidate_grids;
+    for (int split_grids_nums : candidate_split_grids_nums) {
+        int m = 1;
+        while (m <= split_grids_nums) {
+            if (split_grids_nums % m == 0) {
+                candidate_grids.push_back(clip_image_size{m, split_grids_nums / m});
+            }
+            ++m;
+        }
+    }
+
+    clip_image_size best_grid{1, 1};
+    float min_error = std::numeric_limits<float>::infinity();
+    for (const auto& grid : candidate_grids) {
+        float error = std::abs(log_ratio - std::log(1.0 * grid.width / grid.height));
+        if (error < min_error) {
+            best_grid = grid;
+            min_error = error;
+        }
+    }
+    return best_grid;
+}
+
+//
+// mtmd_image_preprocessor_fixed_size
+//
+
+bool mtmd_image_preprocessor_fixed_size::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+    clip_image_u8 resized_image;
+    int sz = hparams.image_size;
+    img_tool::resize(img, resized_image, {sz, sz},
+                        hparams.image_resize_algo,
+                        hparams.image_resize_pad,
+                        hparams.image_pad_color);
+    clip_image_f32_ptr img_f32(clip_image_f32_init());
+    img_u8_to_f32(resized_image, *img_f32, hparams.image_mean, hparams.image_std);
+    output.entries.push_back(std::move(img_f32));
+    return true;
+}
+
+//
+// mtmd_image_preprocessor_dyn_size
+//
+
+bool mtmd_image_preprocessor_dyn_size::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+    GGML_ASSERT(hparams.image_min_pixels > 0 && hparams.image_max_pixels > 0);
+    clip_image_u8 resized_image;
+    const clip_image_size original_size{img.nx, img.ny};
+    // the original pixtral model doesn't have n_merge
+    const int cur_merge = hparams.n_merge == 0 ? 1 : hparams.n_merge;
+    const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
+        original_size,
+        hparams.patch_size * cur_merge,
+        hparams.image_min_pixels,
+        hparams.image_max_pixels);
+    img_tool::resize(img, resized_image, target_size,
+                        hparams.image_resize_algo,
+                        hparams.image_resize_pad,
+                        hparams.image_pad_color);
+    clip_image_f32_ptr img_f32(clip_image_f32_init());
+    img_u8_to_f32(resized_image, *img_f32, hparams.image_mean, hparams.image_std);
+    output.entries.push_back(std::move(img_f32));
+    return true;
+}
+
+//
+// mtmd_image_preprocessor_longest_edge
+//
+
+bool mtmd_image_preprocessor_longest_edge::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+    GGML_ASSERT(hparams.image_longest_edge > 0);
+    clip_image_u8 resized_image;
+    const clip_image_size original_size{img.nx, img.ny};
+    // the original pixtral model doesn't have n_merge
+    const int cur_merge = hparams.n_merge == 0 ? 1 : hparams.n_merge;
+    const clip_image_size target_size = img_tool::calc_size_preserved_ratio(
+        original_size,
+        hparams.patch_size * cur_merge,
+        hparams.image_longest_edge);
+    img_tool::resize(img, resized_image, target_size,
+                        hparams.image_resize_algo,
+                        hparams.image_resize_pad,
+                        hparams.image_pad_color);
+    clip_image_f32_ptr img_f32(clip_image_f32_init());
+    img_u8_to_f32(resized_image, *img_f32, hparams.image_mean, hparams.image_std);
+    output.entries.push_back(std::move(img_f32));
+    return true;
+}
+
+//
+// mtmd_image_preprocessor_lfm2
+//
+
+mtmd_image_preprocessor_llava_uhd::slice_instructions mtmd_image_preprocessor_lfm2::get_slice_instructions(const clip_image_size & original_size) {
+    mtmd_image_preprocessor_llava_uhd::slice_instructions inst;
+    const int align_size = hparams.patch_size * hparams.n_merge;
+    inst.overview_size = img_tool::calc_size_preserved_ratio(
+                            original_size, align_size,
+                            hparams.image_min_pixels, hparams.image_max_pixels);
+    // tile if either dimension exceeds tile_size with tolerance
+    const bool needs_tiling = original_size.width > tile_size * max_pixels_tolerance || original_size.height > tile_size * max_pixels_tolerance;
+
+    if (!needs_tiling) {
+        inst.refined_size = clip_image_size{0, 0};
+        inst.grid_size    = clip_image_size{0, 0};
+        return inst;
+    }
+
+    const clip_image_size grid = get_grid_layout(original_size.height, original_size.width);
+
+    inst.grid_size    = grid;
+    inst.refined_size = clip_image_size{tile_size * grid.width, tile_size * grid.height};
+
+    LOG_DBG("%s: original size: %d x %d, overview size: %d x %d, refined size: %d x %d, grid size: %d x %d\n",
+            __func__,
+            original_size.width, original_size.height,
+            inst.overview_size.width, inst.overview_size.height,
+            inst.refined_size.width, inst.refined_size.height,
+            grid.width, grid.height);
+
+    for (int row = 0; row < grid.height; row++) {
+        for (int col = 0; col < grid.width; col++) {
+            mtmd_image_preprocessor_llava_uhd::slice_coordinates slice;
+            slice.x    = col * tile_size;
+            slice.y    = row * tile_size;
+            slice.size = clip_image_size{tile_size, tile_size};
+            inst.slices.push_back(slice);
+            LOG_DBG("%s: slice %d: x=%d, y=%d, size=%d x %d\n",
+                    __func__, (int)inst.slices.size() - 1,
+                    slice.x, slice.y, slice.size.width, slice.size.height);
+        }
+    }
+
+    return inst;
+}
+
+clip_image_size mtmd_image_preprocessor_lfm2::find_closest_aspect_ratio(
+        float aspect_ratio,
+        const std::vector<clip_image_size> & target_ratios,
+        int width, int height) {
+    float best_ratio_diff = std::numeric_limits<float>::max();
+    clip_image_size best_ratio = {1, 1};
+    const float area = static_cast<float>(width * height);
+
+    for (const auto & ratio : target_ratios) {
+        const float target_aspect_ratio = static_cast<float>(ratio.width) / ratio.height;
+        const float ratio_diff = std::abs(aspect_ratio - target_aspect_ratio);
+        if (ratio_diff < best_ratio_diff) {
+            best_ratio_diff = ratio_diff;
+            best_ratio = ratio;
+        } else if (ratio_diff == best_ratio_diff) {
+            const float target_area = static_cast<float>(tile_size * tile_size * ratio.width * ratio.height);
+            if (area > 0.5f * target_area) {
+                best_ratio = ratio;
+            }
+        }
+    }
+    return best_ratio;
+}
+
+// Enumerate every grid {w, h} with min_tiles <= w * h <= max_tiles, without
+// duplicates, ordered by ascending tile count (w * h).
+std::vector<clip_image_size> mtmd_image_preprocessor_lfm2::get_target_ratios() {
+    std::vector<clip_image_size> ratios;
+    for (int n = min_tiles; n <= max_tiles; n++) {
+        for (int w = 1; w <= n; w++) {
+            for (int h = 1; h <= n; h++) {
+                const int n_tiles = w * h;
+                if (n_tiles < min_tiles || n_tiles > max_tiles) {
+                    continue;
+                }
+                // the same grid is reachable for several values of n; keep the first one only
+                const bool seen = std::any_of(ratios.begin(), ratios.end(), [w, h](const clip_image_size & r) {
+                    return r.width == w && r.height == h;
+                });
+                if (!seen) {
+                    ratios.push_back({w, h});
+                }
+            }
+        }
+    }
+    // stable sort: grids with the same tile count keep their insertion order, so the
+    // candidate order does not depend on the standard library's std::sort implementation
+    std::stable_sort(ratios.begin(), ratios.end(), [](const clip_image_size & a, const clip_image_size & b) {
+        return a.width * a.height < b.width * b.height;
+    });
+    return ratios;
+}
+
+// Choose the tile grid for an image of the given size.
+// Note the parameter order: height first, then width.
+clip_image_size mtmd_image_preprocessor_lfm2::get_grid_layout(int height, int width) {
+    const std::vector<clip_image_size> candidates = get_target_ratios();
+    const float img_aspect = static_cast<float>(width) / height;
+    return find_closest_aspect_ratio(img_aspect, candidates, width, height);
+}
+
+//
+// mtmd_image_preprocessor_idefics3
+//
+
+bool mtmd_image_preprocessor_idefics3::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+    // The refined size is obtained in two steps:
+    //   1. resize while preserving the aspect ratio, so that the longer side
+    //      matches the preprocessor's longest edge
+    //   2. resize without preserving the aspect ratio, so that both sides are
+    //      multiples of image_size (always rounding up)
+    //
+    // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
+    const int tile = hparams.image_size;
+    const clip_image_size original_size{img.nx, img.ny};
+    const clip_image_size refined_size = img_tool::calc_size_preserved_ratio(
+        original_size, tile, hparams.image_longest_edge);
+
+    // number of tiles needed to cover `len` pixels
+    const auto n_tiles = [tile](int len) {
+        return static_cast<int>(std::ceil(static_cast<float>(len) / tile));
+    };
+
+    slice_instructions inst;
+    inst.overview_size = clip_image_size{tile, tile};
+    inst.refined_size  = refined_size;
+    inst.grid_size     = clip_image_size{n_tiles(refined_size.width), n_tiles(refined_size.height)};
+
+    // walk the refined image row by row; tiles on the right/bottom border are clipped
+    for (int y = 0; y < refined_size.height; y += tile) {
+        const int slice_h = std::min(tile, refined_size.height - y);
+        for (int x = 0; x < refined_size.width; x += tile) {
+            slice_coordinates slice;
+            slice.x    = x;
+            slice.y    = y;
+            slice.size = clip_image_size{std::min(tile, refined_size.width - x), slice_h};
+            inst.slices.push_back(slice);
+        }
+    }
+
+    // cast and normalize every produced image to f32
+    const auto imgs_u8 = slice_image(img, inst);
+    for (const auto & img_u8 : imgs_u8) {
+        clip_image_f32_ptr img_f32(clip_image_f32_init());
+        img_u8_to_f32(*img_u8, *img_f32, hparams.image_mean, hparams.image_std);
+        output.entries.push_back(std::move(img_f32));
+    }
+
+    output.grid_x = inst.grid_size.width;
+    output.grid_y = inst.grid_size.height;
+    return true;
+}
+
+//
+// mtmd_image_preprocessor_internvl
+//
+
+bool mtmd_image_preprocessor_internvl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+    // slicing relies on the list of candidate resolutions
+    GGML_ASSERT(!hparams.image_res_candidates.empty());
+
+    const slice_instructions plan = get_slice_instructions(clip_image_size{img.nx, img.ny});
+    // last argument: overview_first = false
+    const std::vector<clip_image_u8_ptr> pieces = slice_image(img, plan, false);
+
+    // cast and normalize every produced image to f32
+    for (const auto & piece : pieces) {
+        clip_image_f32_ptr piece_f32(clip_image_f32_init());
+        img_u8_to_f32(*piece, *piece_f32, hparams.image_mean, hparams.image_std);
+        output.entries.push_back(std::move(piece_f32));
+    }
+    return true;
+}
+
+//
+// mtmd_image_preprocessor_deepseekocr
+//
+
+bool mtmd_image_preprocessor_deepseekocr::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+    // side length of the supported native resolutions;
+    // 512 (tiny) and 640 (small) are not enabled
+    const std::vector<int> native_resolutions = { 1024 /* base */, 1280 /* large */ };
+
+    const int orig_area = img.ny * img.nx;
+
+    // pick the resolution whose area is closest to the input area; a candidate
+    // only replaces index 0 when its distance is smaller than orig_area itself
+    size_t best_idx  = 0;
+    int    best_diff = orig_area;
+    for (size_t i = 0; i < native_resolutions.size(); ++i) {
+        const int side = native_resolutions[i];
+        const int diff = std::abs(orig_area - side * side);
+        if (diff < best_diff) {
+            best_idx  = i;
+            best_diff = diff;
+        }
+    }
+
+    /* Native Resolution (Base/Large) */
+    const int image_size = native_resolutions[best_idx];
+
+    // resize the input to image_size x image_size
+    clip_image_u8_ptr scaled_img(clip_image_u8_init());
+    img_tool::resize(img, *scaled_img, clip_image_size{image_size, image_size}, hparams.image_resize_algo);
+
+    // cast and normalize to f32
+    clip_image_f32_ptr scaled_f32(clip_image_f32_init());
+    img_u8_to_f32(*scaled_img, *scaled_f32, hparams.image_mean, hparams.image_std);
+    output.entries.push_back(std::move(scaled_f32));
+
+    // always a single image
+    output.grid_x = 1;
+    output.grid_y = 1;
+    return true;
+}
+
+//
+// mtmd_image_preprocessor_youtuvl
+//
+
+bool mtmd_image_preprocessor_youtuvl::preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) {
+    // output dimensions are aligned to patch_size * n_merge
+    const int patch_size = hparams.patch_size;  // typically 16
+    const int merge_size = hparams.n_merge;     // typically 2
+    const int align_size = patch_size * merge_size;
+
+    // patch budget: derived from image_max_pixels when it is set, 256 otherwise
+    int max_num_patches = 256;
+    if (hparams.image_max_pixels > 0) {
+        max_num_patches = hparams.image_max_pixels / (patch_size * patch_size);
+    }
+
+    // scale one dimension, round up to a multiple of align_size, never below align_size
+    const auto aligned_size = [align_size](float factor, int size) -> int {
+        const float scaled     = size * factor;
+        const int   rounded_up = static_cast<int>(std::ceil(scaled / align_size)) * align_size;
+        return std::max(align_size, rounded_up);
+    };
+
+    // linear search: shrink the scale in 0.02 steps until the patch count fits the budget
+    int target_height = img.ny;
+    int target_width  = img.nx;
+    for (float scale = 1.0f; scale > 0.0f; scale -= 0.02f) {
+        target_height = aligned_size(scale, img.ny);
+        target_width  = aligned_size(scale, img.nx);
+
+        const int num_patches = (target_height / patch_size) * (target_width / patch_size);
+        if (num_patches <= max_num_patches) {
+            break;
+        }
+    }
+
+    // resize to the selected size
+    const clip_image_size new_size = {target_width, target_height};
+    clip_image_u8 resized;
+    img_tool::resize(img, resized, new_size, hparams.image_resize_algo, hparams.image_resize_pad);
+
+    // cast and normalize to f32, then append to the results
+    clip_image_f32_ptr img_f32(clip_image_f32_init());
+    img_u8_to_f32(resized, *img_f32, hparams.image_mean, hparams.image_std);
+    output.entries.push_back(std::move(img_f32));
+    return true;
+}
diff --git a/tools/mtmd/mtmd-image.h b/tools/mtmd/mtmd-image.h

new file mode 100644 (file)

index 0000000..065b937
--- /dev/null
+++ b/tools/mtmd/mtmd-image.h
@@ -0,0 +1,150 @@
+#pragma once
+
+#include "ggml.h"
+#include "clip-model.h"
+
+#include <vector>
+#include <string>
+
+#define MTMD_INTERNAL_HEADER
+
+// base class, models must inherit from this class
+// each derived class implements one image preprocessing strategy in preprocess()
+struct mtmd_image_preprocessor {
+    // vision hparams read from the clip context passed to the constructor
+    // NOTE(review): stored as a reference, so presumably the clip context must outlive this object - confirm
+    const clip_hparams & hparams;
+
+    mtmd_image_preprocessor(const clip_ctx * ctx): hparams(*clip_get_hparams(ctx)) {}
+
+    virtual ~mtmd_image_preprocessor() = default;
+    // convert the u8 input image into one or more f32 images appended to output.entries
+    // NOTE(review): the implementations in mtmd-image.cpp shown here always return true; the meaning of false is presumably "failure" - confirm
+    virtual bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) = 0;
+
+    // u8 -> f32 conversion using the given per-channel mean / std
+    void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst, const float mean[3], const float std[3]);
+    // u8 -> f32 conversion without explicit mean / std
+    void img_u8_to_f32(const clip_image_u8 & src, clip_image_f32 & dst);
+};
+
+/**
+ * implementation of LLaVA-UHD:
+ *  - https://arxiv.org/pdf/2403.11703
+ *  - https://github.com/thunlp/LLaVA-UHD
+ *  - https://github.com/thunlp/LLaVA-UHD/blob/302301bc2175f7e717fb8548516188e89f649753/llava_uhd/train/llava-uhd/slice_logic.py#L118
+ *
+ * overview:
+ *   - an image always has a single overview (downscaled image)
+ *   - an image can have 0 or multiple slices, depending on the image size
+ *   - each slice can then be considered as a separate image
+ *
+ * note: the terms "slice" and "tile" are used interchangeably
+ *
+ * for example:
+ *
+ * [overview] --> [slice 1] --> [slice 2]
+ *           |                |
+ *           +--> [slice 3] --> [slice 4]
+ */
+struct mtmd_image_preprocessor_llava_uhd : mtmd_image_preprocessor {
+    mtmd_image_preprocessor_llava_uhd(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
+    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+
+    // position and size of one slice
+    struct slice_coordinates {
+        int x; // horizontal offset of the slice
+        int y; // vertical offset of the slice
+        clip_image_size size;
+    };
+
+    // everything slice_image() needs to cut an image into overview + slices
+    struct slice_instructions {
+        clip_image_size overview_size; // size of downscaled image
+        clip_image_size refined_size;  // size of image right before slicing (must be multiple of slice size)
+        clip_image_size grid_size;     // grid_size.width * grid_size.height = number of slices
+        std::vector<slice_coordinates> slices;
+    };
+
+    // LFM2 overrides this function to implement its custom slicing logic
+    virtual slice_instructions get_slice_instructions(const clip_image_size & original_size);
+
+    // produce the images described by inst
+    // NOTE(review): overview_first presumably controls whether the overview image is placed before the slices - confirm in mtmd-image.cpp
+    std::vector<clip_image_u8_ptr> slice_image(const clip_image_u8 & img, const slice_instructions & inst, bool overview_first = true);
+
+private:
+    clip_image_size get_best_resize(const clip_image_size & original_size, int scale_resolution, int patch_size, bool allow_upscale = false);
+
+    clip_image_size resize_maintain_aspect_ratio(const clip_image_size & orig, const clip_image_size & target_max);
+
+    /**
+     * Selects the best resolution from a list of possible resolutions based on the original size.
+     *
+     * For example, when given a list of resolutions:
+     *  - 100x100
+     *  - 200x100
+     *  - 100x200
+     *  - 200x200
+     *
+     * And an input image of size 111x200, then 100x200 is the best fit (least wasted resolution).
+     *
+     * @param original_size The original size of the image
+     * @param possible_resolutions A list of possible resolutions
+     * @return The best fit resolution
+     */
+    clip_image_size select_best_resolution(const clip_image_size & original_size, const std::vector<clip_image_size> & possible_resolutions);
+    int ensure_divide(int length, int patch_size);
+    clip_image_size get_refine_size(const clip_image_size & original_size, const clip_image_size & grid, int scale_resolution, int patch_size, bool allow_upscale = false);
+    clip_image_size get_best_grid(const int max_slice_nums, const int multiple, const float log_ratio);
+};
+
+// downscale or upscale the input image to a fixed size
+// selected in mtmd.cpp for Gemma 3, Nemotron V2 VL and llava-style projectors without resolution candidates
+struct mtmd_image_preprocessor_fixed_size : mtmd_image_preprocessor {
+    mtmd_image_preprocessor_fixed_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
+    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+};
+
+// resize image to multiple of patch_size*n_merge, while preserving aspect ratio
+// if image_resize_pad is true, the resized image will be padded, otherwise it will be either stretched or center-cropped depending on image_resize_algo
+// this is used by models with native support for dynamic image size, for example: Qwen-VL, Pixtral, Kimi-VL, etc
+struct mtmd_image_preprocessor_dyn_size : mtmd_image_preprocessor {
+    mtmd_image_preprocessor_dyn_size(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
+    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+};
+
+// similar to mtmd_image_preprocessor_dyn_size, but resizes the image so that its longest edge equals hparams.image_longest_edge, while preserving aspect ratio
+// selected in mtmd.cpp for LightOnOCR
+struct mtmd_image_preprocessor_longest_edge : mtmd_image_preprocessor {
+    mtmd_image_preprocessor_longest_edge(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
+    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+};
+
+// custom llava-uhd slicing logic for LFM2
+// only get_slice_instructions() is overridden; preprocess() and slice_image() come from the llava-uhd base
+// ref: https://github.com/huggingface/transformers/blob/v5.1.0/src/transformers/models/lfm2_vl/image_processing_lfm2_vl_fast.py
+struct mtmd_image_preprocessor_lfm2 : mtmd_image_preprocessor_llava_uhd {
+    // ref: https://huggingface.co/LiquidAI/LFM2.5-VL-1.6B/blob/main/processor_config.json
+    static constexpr int   min_tiles            = 2;    // lower bound on width * height of a candidate grid
+    static constexpr int   max_tiles            = 10;   // upper bound on width * height of a candidate grid
+    static constexpr float max_pixels_tolerance = 2.0f; // NOTE(review): used by get_slice_instructions(), not shown here - confirm exact meaning
+    static constexpr int   tile_size            = 512;  // tile edge in pixels, used for the area tie-break in find_closest_aspect_ratio()
+
+    // inherit the base class constructors
+    using mtmd_image_preprocessor_llava_uhd::mtmd_image_preprocessor_llava_uhd;
+    slice_instructions get_slice_instructions(const clip_image_size & original_size) override;
+
+private:
+    // pick the candidate in target_ratios whose width / height is closest to aspect_ratio
+    clip_image_size find_closest_aspect_ratio(
+            float aspect_ratio,
+            const std::vector<clip_image_size> & target_ratios,
+            int width, int height);
+    // all grids {w, h} with min_tiles <= w * h <= max_tiles, sorted by w * h
+    std::vector<clip_image_size> get_target_ratios();
+    // grid to use for an image of the given size; note the (height, width) parameter order
+    clip_image_size get_grid_layout(int height, int width);
+};
+
+// Idefics3: builds its own slice instructions (tiles of image_size x image_size over the refined image)
+// and reuses slice_image() from the llava-uhd base
+struct mtmd_image_preprocessor_idefics3 : mtmd_image_preprocessor_llava_uhd {
+    mtmd_image_preprocessor_idefics3(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
+    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+};
+
+// InternVL: llava-uhd slicing via get_slice_instructions(), with slice_image() called with overview_first = false
+// requires hparams.image_res_candidates to be non-empty
+struct mtmd_image_preprocessor_internvl : mtmd_image_preprocessor_llava_uhd {
+    mtmd_image_preprocessor_internvl(const clip_ctx * ctx) : mtmd_image_preprocessor_llava_uhd(ctx) {}
+    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+};
+
+// DeepSeek-OCR: resizes the input to a single square image of 1024 or 1280 pixels per side,
+// choosing the resolution whose area is closest to the input area
+struct mtmd_image_preprocessor_deepseekocr : mtmd_image_preprocessor {
+    mtmd_image_preprocessor_deepseekocr(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
+    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+};
+
+// Youtu-VL: output size is aligned to patch_size * n_merge; the image is scaled down in 0.02 steps
+// until the number of patches fits the budget derived from hparams.image_max_pixels
+struct mtmd_image_preprocessor_youtuvl : mtmd_image_preprocessor {
+    mtmd_image_preprocessor_youtuvl(const clip_ctx * ctx) : mtmd_image_preprocessor(ctx) {}
+    bool preprocess(const clip_image_u8 & img, clip_image_f32_batch & output) override;
+};
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp

index 456ce7b73c82556ed45164b802010e9247a3ef41..d078120f761882a2a7aa1a0cbdb947abb5b1b14f 100644 (file)
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -2,6 +2,7 @@
  #include "clip-impl.h"
  #include "mtmd.h"
  #include "mtmd-audio.h"
+#include "mtmd-image.h"
  #include "debug/mtmd-debug.h"
  
  #include "llama.h"
@@ -138,7 +139,7 @@ struct mtmd_context {
  
      // for llava-uhd style models, we need special tokens in-between slices
      // minicpmv calls them "slices", llama 4 calls them "tiles"
-    mtmd_slice_tmpl slice_tmpl    = MTMD_SLICE_TMPL_NONE;
+    mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
      std::vector<llama_token> tok_ov_img_start;  // overview image
      std::vector<llama_token> tok_ov_img_end;    // overview image
      std::vector<llama_token> tok_slices_start;  // start of all slices
@@ -147,13 +148,14 @@ struct mtmd_context {
      std::vector<llama_token> tok_sli_img_end;   // single slice end
      std::vector<llama_token> tok_sli_img_mid;   // between 2 slices
      std::vector<llama_token> tok_row_end;       // end of row
-    bool        tok_row_end_trail = false;
-    bool        ov_img_first      = false;
+    bool tok_row_end_trail = false;
+    bool ov_img_first      = false;
  
      // string template for slice image delimiters with row/col (idefics3)
      std::string sli_img_start_tmpl;
  
      std::unique_ptr<mtmd_audio_preprocessor> audio_preproc;
+    std::unique_ptr<mtmd_image_preprocessor> image_preproc;
  
      // TODO @ngxson : add timings
  
@@ -221,123 +223,193 @@ struct mtmd_context {
  
      void init_vision() {
          GGML_ASSERT(ctx_v != nullptr);
+        image_preproc.reset();
  
          projector_type proj = clip_get_projector_type(ctx_v);
-        int minicpmv_version = clip_is_minicpmv(ctx_v);
-        if (minicpmv_version == 2) {
-            // minicpmv 2.5 format:
-            // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
-            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_5;
-            tok_ov_img_start  = {lookup_token("<image>")};
-            tok_ov_img_end    = {lookup_token("</image>")};
-            tok_slices_start  = {lookup_token("<slice>")};
-            tok_slices_end    = {lookup_token("</slice>")};
-            tok_sli_img_start = tok_ov_img_start;
-            tok_sli_img_end   = tok_ov_img_end;
-            tok_row_end       = {lookup_token("\n")};
-            tok_row_end_trail = false; // no trailing end-of-row token
-            ov_img_first      = true;
-
-        } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
-            // minicpmv 2.6 format:
-            // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
-            slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6;
-            tok_ov_img_start  = {lookup_token("<image>")};
-            tok_ov_img_end    = {lookup_token("</image>")};
-            tok_sli_img_start = {lookup_token("<slice>")};
-            tok_sli_img_end   = {lookup_token("</slice>")};
-            tok_row_end       = {lookup_token("\n")};
-            tok_row_end_trail = false; // no trailing end-of-row token
-            ov_img_first      = true;
-
-        } else if (minicpmv_version != 0) {
-            GGML_ASSERT(false && "unsupported minicpmv version");
-        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
-            // llama 4 format:
-            // <|image_start|>
-            //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
-            //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
-            //     ... <|tile_y_separator|>   <-- trailing end-of-row token
-            // <|image|> (overview)           <-- overview image is last
-            // <|image_end|>
-            slice_tmpl        = MTMD_SLICE_TMPL_LLAMA4;
-            tok_ov_img_start  = {lookup_token("<|image|>")};
-            tok_sli_img_mid   = {lookup_token("<|tile_x_separator|>")};
-            tok_row_end       = {lookup_token("<|tile_y_separator|>")};
-            tok_row_end_trail = true; // add trailing end-of-row token
-            ov_img_first      = false; // overview image is last
-        }
  
-        // set boi/eoi
-        if (proj == PROJECTOR_TYPE_GEMMA3 || proj == PROJECTOR_TYPE_GEMMA3NV) {
-            // <start_of_image> ... (image embeddings) ... <end_of_image>
-            img_beg = "<start_of_image>";
-            img_end = "<end_of_image>";
-
-        } else if (proj == PROJECTOR_TYPE_IDEFICS3) {
-            // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
-            slice_tmpl         = MTMD_SLICE_TMPL_IDEFICS3;
-            tok_ov_img_start   = {lookup_token("\n\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
-            tok_ov_img_end     = {lookup_token("<fake_token_around_image>")};
-            tok_row_end        = {lookup_token("\n")};
-            sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";
-
-        } else if (proj == PROJECTOR_TYPE_PIXTRAL) {
-            // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
-            img_end = "[IMG_END]";
-
-        } else if (proj == PROJECTOR_TYPE_QWEN2VL || proj == PROJECTOR_TYPE_QWEN25VL || proj == PROJECTOR_TYPE_QWEN3VL || proj == PROJECTOR_TYPE_YOUTUVL) {
-            // <|vision_start|> ... (image embeddings) ... <|vision_end|>
-            img_beg = "<|vision_start|>";
-            img_end = "<|vision_end|>";
-
-        } else if (proj == PROJECTOR_TYPE_PHI4) {
-            // Phi-4 uses media marker insertion only. Keep image boundary text empty.
-
-        } else if (proj == PROJECTOR_TYPE_LLAMA4) {
-            // (more details in mtmd_context constructor)
-            img_beg = "<|image_start|>";
-            img_end = "<|image_end|>";
-            LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
-                    "    https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
-
-        } else if (proj == PROJECTOR_TYPE_INTERNVL) {
-            // <img> ... (image embeddings) ... </img>
-            img_beg = "<img>";
-            img_end = "</img>";
-
-        } else if (proj == PROJECTOR_TYPE_LIGHTONOCR) {
-            // <|im_start|> ... (image embeddings) ... <|im_end|>
-            img_beg = "<|im_start|>";
-            img_end = "<|im_end|>";
-
-        } else if (proj == PROJECTOR_TYPE_LFM2) {
-            // multi-tile:
-            //   <|image_start|>
-            //     <|img_row_1_col_1|> (tile) <|img_row_1_col_2|> (tile) ...
-            //     <|img_thumbnail|> (thumbnail)
-            //   <|image_end|>
-            // single-tile:
-            //   <|image_start|> (image) <|image_end|>
-            img_beg            = "<|image_start|>";
-            img_end            = "<|image_end|>";
-            slice_tmpl         = MTMD_SLICE_TMPL_LFM2;
-            sli_img_start_tmpl = "<|img_row_%d_col_%d|>";
-            tok_ov_img_start   = {lookup_token("<|img_thumbnail|>")};
-            ov_img_first       = false;
-        } else if (proj == PROJECTOR_TYPE_GLM4V) {
-            img_beg = "<|begin_of_image|>";
-            img_end = "<|end_of_image|>";
-
-        } else if (proj == PROJECTOR_TYPE_PADDLEOCR) {
-            // <|IMAGE_START|> ... (image embeddings) ... <|IMAGE_END|>
-            img_beg = "<|IMAGE_START|>";
-            img_end = "<|IMAGE_END|>";
+        switch (proj) {
+            case PROJECTOR_TYPE_MLP:
+            case PROJECTOR_TYPE_MLP_NORM:
+            case PROJECTOR_TYPE_LDP:
+            case PROJECTOR_TYPE_LDPV2:
+            case PROJECTOR_TYPE_COGVLM:
+            case PROJECTOR_TYPE_JANUS_PRO:
+            case PROJECTOR_TYPE_GLM_EDGE:
+                {
+                    bool has_pinpoints = !clip_get_hparams(ctx_v)->image_res_candidates.empty();
+                    if (has_pinpoints) {
+                        image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
+                    } else {
+                        image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
+                    }
+                } break;
+            case PROJECTOR_TYPE_MINICPMV:
+                {
+                    int minicpmv_version = clip_is_minicpmv(ctx_v);
+                    if (minicpmv_version == 2) {
+                        // minicpmv 2.5 format:
+                        // <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
+                        slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_5;
+                        tok_ov_img_start  = {lookup_token("<image>")};
+                        tok_ov_img_end    = {lookup_token("</image>")};
+                        tok_slices_start  = {lookup_token("<slice>")};
+                        tok_slices_end    = {lookup_token("</slice>")};
+                        tok_sli_img_start = tok_ov_img_start;
+                        tok_sli_img_end   = tok_ov_img_end;
+                        tok_row_end       = {lookup_token("\n")};
+                        tok_row_end_trail = false; // no trailing end-of-row token
+                        ov_img_first      = true;
+
+                    } else if (minicpmv_version == 3 || minicpmv_version == 4 || minicpmv_version == 5 || minicpmv_version == 6 || minicpmv_version == 100045) {
+                        // minicpmv 2.6 format:
+                        // <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
+                        slice_tmpl        = MTMD_SLICE_TMPL_MINICPMV_2_6;
+                        tok_ov_img_start  = {lookup_token("<image>")};
+                        tok_ov_img_end    = {lookup_token("</image>")};
+                        tok_sli_img_start = {lookup_token("<slice>")};
+                        tok_sli_img_end   = {lookup_token("</slice>")};
+                        tok_row_end       = {lookup_token("\n")};
+                        tok_row_end_trail = false; // no trailing end-of-row token
+                        ov_img_first      = true;
+
+                    } else if (minicpmv_version != 0) {
+                        throw std::runtime_error(string_format("unsupported minicpmv version: %d\n", minicpmv_version));
+                    }
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
+                } break;
+            case PROJECTOR_TYPE_QWEN2VL:
+            case PROJECTOR_TYPE_QWEN25VL:
+            case PROJECTOR_TYPE_QWEN3VL:
+                {
+                    // <|vision_start|> ... (image embeddings) ... <|vision_end|>
+                    img_beg = "<|vision_start|>";
+                    img_end = "<|vision_end|>";
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
+                } break;
+            case PROJECTOR_TYPE_YOUTUVL:
+                {
+                    // <|vision_start|> ... (image embeddings) ... <|vision_end|>
+                    img_beg = "<|vision_start|>";
+                    img_end = "<|vision_end|>";
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_youtuvl>(ctx_v);
+                } break;
+            case PROJECTOR_TYPE_GEMMA3:
+            case PROJECTOR_TYPE_GEMMA3NV:
+                {
+                    // <start_of_image> ... (image embeddings) ... <end_of_image>
+                    img_beg = "<start_of_image>";
+                    img_end = "<end_of_image>";
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
+                } break;
+            case PROJECTOR_TYPE_IDEFICS3:
+                {
+                    // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
+                    slice_tmpl         = MTMD_SLICE_TMPL_IDEFICS3;
+                    tok_ov_img_start   = {lookup_token("\n\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
+                    tok_ov_img_end     = {lookup_token("<fake_token_around_image>")};
+                    tok_row_end        = {lookup_token("\n")};
+                    sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_idefics3>(ctx_v);
+                } break;
+            case PROJECTOR_TYPE_PIXTRAL:
+                {
+                    // https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
+                    img_end = "[IMG_END]";
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
+                } break;
+            case PROJECTOR_TYPE_PHI4:
+                {
+                    // Phi-4 uses media marker insertion only. Keep image boundary text empty.
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
+                } break;
+            case PROJECTOR_TYPE_LLAMA4:
+                {
+                    // llama 4 format:
+                    // <|image_start|>
+                    //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
+                    //     (slice) <|tile_x_separator|> (slice) <|tile_x_separator|> ... <|tile_y_separator|>
+                    //     ... <|tile_y_separator|>   <-- trailing end-of-row token
+                    // <|image|> (overview)           <-- overview image is last
+                    // <|image_end|>
+                    // NOTE(review): the if/else chain removed by this change configured the slice
+                    // template below for llama 4; it is restored here because the llava-uhd
+                    // preprocessor is still selected for this projector
+                    slice_tmpl        = MTMD_SLICE_TMPL_LLAMA4;
+                    tok_ov_img_start  = {lookup_token("<|image|>")};
+                    tok_sli_img_mid   = {lookup_token("<|tile_x_separator|>")};
+                    tok_row_end       = {lookup_token("<|tile_y_separator|>")};
+                    tok_row_end_trail = true; // add trailing end-of-row token
+                    ov_img_first      = false; // overview image is last
+                    img_beg = "<|image_start|>";
+                    img_end = "<|image_end|>";
+                    LOG_WRN("%s: llama 4 vision is known to have degraded quality:\n"
+                            "    https://github.com/ggml-org/llama.cpp/pull/13282\n", __func__);
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_llava_uhd>(ctx_v);
+                } break;
+            case PROJECTOR_TYPE_INTERNVL:
+                {
+                    // <img> ... (image embeddings) ... </img>
+                    img_beg = "<img>";
+                    img_end = "</img>";
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_internvl>(ctx_v);
+                } break;
+            case PROJECTOR_TYPE_KIMIVL:
+                {
+                    // <|media_start|> ... (image embeddings) ... <|media_end|>
+                    img_beg = "<|media_start|>";
+                    img_end = "<|media_end|>";
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
+                } break;
+            case PROJECTOR_TYPE_KIMIK25:
+                {
+                    // <|media_begin|> ... (image embeddings) ... <|media_end|>
+                    img_beg = "<|media_begin|>";
+                    img_end = "<|media_end|>";
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
+                } break;
+            case PROJECTOR_TYPE_LIGHTONOCR:
+                {
+                    // <|im_start|> ... (image embeddings) ... <|im_end|>
+                    img_beg = "<|im_start|>";
+                    img_end = "<|im_end|>";
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_longest_edge>(ctx_v);
+                } break;
+            case PROJECTOR_TYPE_NEMOTRON_V2_VL:
+                {
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_fixed_size>(ctx_v);
+                } break;
+            case PROJECTOR_TYPE_LFM2:
+                {
+                    // multi-tile:
+                    //   <|image_start|>
+                    //     <|img_row_1_col_1|> (tile) <|img_row_1_col_2|> (tile) ...
+                    //     <|img_thumbnail|> (thumbnail)
+                    //   <|image_end|>
+                    // single-tile:
+                    //   <|image_start|> (image) <|image_end|>
+                    img_beg            = "<|image_start|>";
+                    img_end            = "<|image_end|>";
+                    slice_tmpl         = MTMD_SLICE_TMPL_LFM2;
+                    sli_img_start_tmpl = "<|img_row_%d_col_%d|>";
+                    tok_ov_img_start   = {lookup_token("<|img_thumbnail|>")};
+                    ov_img_first       = false;
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_lfm2>(ctx_v);
+                } break;
+            case PROJECTOR_TYPE_GLM4V:
+                {
+                    // <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
+                    img_beg = "<|begin_of_image|>";
+                    img_end = "<|end_of_image|>";
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
+                } break;
+            case PROJECTOR_TYPE_PADDLEOCR:
+                {
+                    // <|IMAGE_START|> ... (image embeddings) ... <|IMAGE_END|>
+                    img_beg = "<|IMAGE_START|>";
+                    img_end = "<|IMAGE_END|>";
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_dyn_size>(ctx_v);
+                } break;
+            case PROJECTOR_TYPE_DEEPSEEKOCR:
+                {
+                    img_end = "\n"; // prevent empty batch on llama-server
+                    image_preproc = std::make_unique<mtmd_image_preprocessor_deepseekocr>(ctx_v);
+                } break;
+            default:
+                throw std::runtime_error(string_format("%s: unexpected vision projector type %d\n", __func__, proj));
          }
+
+        GGML_ASSERT(image_preproc != nullptr);
      }
  
      void init_audio() {
          GGML_ASSERT(ctx_a != nullptr);
+        audio_preproc.reset();
+
          projector_type proj = clip_get_projector_type(ctx_a);
  
          LOG_WRN("%s: audio input is in experimental stage and may have reduced quality:\n"
@@ -347,36 +419,40 @@ struct mtmd_context {
          switch (proj) {
              case PROJECTOR_TYPE_QWEN2A:
              case PROJECTOR_TYPE_QWEN25O:
-            case PROJECTOR_TYPE_ULTRAVOX:
+                {
+                    // <|audio_bos|> ... (embeddings) ... <|audio_eos|>
+                    aud_beg = "<|audio_bos|>";
+                    aud_end = "<|audio_eos|>";
+                    audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
+                } break;
              case PROJECTOR_TYPE_VOXTRAL:
-            case PROJECTOR_TYPE_GLMA:
+                {
+                    // [BEGIN_AUDIO] ... (embeddings) ...
+                    aud_beg = "[BEGIN_AUDIO]";
+                    audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
+                } break;
              case PROJECTOR_TYPE_MUSIC_FLAMINGO:
-                audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
-                break;
+                {
+                    // <sound> ... (embeddings) ...
+                    aud_beg = "<sound>";
+                    audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
+                } break;
+            case PROJECTOR_TYPE_ULTRAVOX:
+            case PROJECTOR_TYPE_GLMA:
+                {
+                    audio_preproc = std::make_unique<mtmd_audio_preprocessor_whisper>(ctx_a);
+                } break;
              case PROJECTOR_TYPE_LFM2A:
-                audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
-                break;
+                {
+                    audio_preproc = std::make_unique<mtmd_audio_preprocessor_conformer>(ctx_a);
+                } break;
              default:
-                GGML_ABORT("unsupported audio projector type");
+                throw std::runtime_error(string_format("%s: unexpected audio projector type %d\n", __func__, proj));
          }
  
          // initialize audio preprocessor
+        GGML_ASSERT(audio_preproc != nullptr);
          audio_preproc->initialize();
-
-        // set special tokens
-        if (proj == PROJECTOR_TYPE_QWEN2A) {
-            // <|audio_bos|> ... (embeddings) ... <|audio_eos|>
-            aud_beg = "<|audio_bos|>";
-            aud_end = "<|audio_eos|>";
-
-        } else if (proj == PROJECTOR_TYPE_ULTRAVOX) {
-            // [BEGIN_AUDIO] ... (embeddings) ...
-            aud_beg = "[BEGIN_AUDIO]";
-
-        } else if (proj == PROJECTOR_TYPE_MUSIC_FLAMINGO) {
-            // <sound> ... (embeddings) ...
-            aud_beg = "<sound>";
-        }
      }
  
      // get clip ctx based on chunk type
@@ -573,8 +649,9 @@ struct mtmd_tokenizer {
              std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
  
              // preprocess image
+            GGML_ASSERT(ctx->image_preproc != nullptr);
              clip_image_f32_batch batch_f32;
-            bool ok = clip_image_preprocess(ctx->ctx_v, img_u8.get(), &batch_f32);
+            bool ok = ctx->image_preproc->preprocess(*img_u8, batch_f32);
              if (!ok) {
                  LOG_ERR("Unable to preprocess image\n");
                  return 2;
@@ -1225,7 +1302,8 @@ void mtmd_debug_preprocess_image(mtmd_context * ctx, const std::vector<uint8_t>
      img_u8.ny = ny;
      img_u8.buf = rgb_values;
      clip_image_f32_batch batch_f32;
-    bool ok = clip_image_preprocess(ctx->ctx_v, &img_u8, &batch_f32);
+    GGML_ASSERT(ctx->image_preproc != nullptr);
+    bool ok = ctx->image_preproc->preprocess(img_u8, batch_f32);
      if (!ok) {
          LOG_ERR("%s: failed to preprocess image\n", __func__);
          return;
author	Xuan-Son Nguyen <redacted>
	Thu, 26 Mar 2026 18:49:20 +0000 (19:49 +0100)
committer	GitHub <redacted>
	Thu, 26 Mar 2026 18:49:20 +0000 (19:49 +0100)
tools/mtmd/CMakeLists.txt		patch \| blob \| history
tools/mtmd/clip-impl.h		patch \| blob \| history
tools/mtmd/clip-model.h		patch \| blob \| history
tools/mtmd/clip.cpp		patch \| blob \| history
tools/mtmd/clip.h		patch \| blob \| history
tools/mtmd/mtmd-image.cpp	[new file with mode: 0644]	patch \| blob
tools/mtmd/mtmd-image.h	[new file with mode: 0644]	patch \| blob
tools/mtmd/mtmd.cpp		patch \| blob \| history