mtmd: allow QwenVL to process larger image by default (#17020)
author    Xuan-Son Nguyen <redacted>
Wed, 5 Nov 2025 13:26:49 +0000 (14:26 +0100)
committer GitHub <redacted>
Wed, 5 Nov 2025 13:26:49 +0000 (14:26 +0100)
tools/mtmd/clip.cpp

index 67a8a470c8302931fb1510776cca93473413a8de..16781fb19523f1d3b3e9ef70be8bdd28d4191272 100644
@@ -2791,14 +2791,8 @@ struct clip_model_loader {
                         get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
                         get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern, model.proj_type == PROJECTOR_TYPE_QWEN25VL); // only 2.5 requires it
                         // ref: https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
-                        // the actual max limit is 12845056/14/14/2/2/4 = 4096 tokens
-                        // but we set a lower value to avoid OOM
-                        // TODO: make it configurable by user
-                        // TODO (2): bbox coordinates become inaccurate with small number of tokens,
-                        //           therefore we need to increase the min_tokens
-                        //           see: https://github.com/ggml-org/llama.cpp/issues/16842#issuecomment-3475144858
-                        hparams.set_limit_image_tokens(8, 2048);
-                        hparams.set_warmup_n_tokens(256); // avoid OOM on warmup
+                        hparams.set_limit_image_tokens(8, 4096);
+                        hparams.set_warmup_n_tokens(46*46); // avoid OOM on warmup
                         const int warn_min_pixels = 1024 * hparams.n_merge * hparams.n_merge * hparams.patch_size * hparams.patch_size;
                         if (hparams.image_min_pixels < warn_min_pixels) {
                             LOG_WRN("%s: Qwen-VL models require at minimum 1024 image tokens to function correctly on grounding tasks\n", __func__);
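
The removed comment derived the theoretical ceiling from Qwen2.5-VL's preprocessor limit of 12845056 pixels: 12845056/14/14/2/2/4 = 4096 tokens. The commit raises the default cap from 2048 to that maximum and raises the warmup token count from 256 to 46*46 = 2116 (the in-tree comment still notes this is to avoid OOM on warmup). Below is a minimal sketch reproducing that arithmetic; the constant names are illustrative, not from clip.cpp, and the trailing /4 is taken verbatim from the removed comment:

    #include <cstdio>

    int main() {
        const int max_pixels = 12845056; // from Qwen2.5-VL preprocessor_config.json
        const int patch_size = 14;       // pixels per patch side
        const int n_merge    = 2;        // 2x2 spatial merge
        // 12845056 / 14 / 14 / 2 / 2 / 4 = 4096, matching set_limit_image_tokens(8, 4096)
        const int max_tokens    = max_pixels / patch_size / patch_size / n_merge / n_merge / 4;
        const int warmup_tokens = 46 * 46; // 2116 tokens reserved during warmup
        printf("max image tokens: %d, warmup tokens: %d\n", max_tokens, warmup_tokens);
        return 0;
    }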
@@ -4814,7 +4808,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
         case PROJECTOR_TYPE_QWEN2VL:
         case PROJECTOR_TYPE_QWEN3VL:
             {
-                const int merge_ratio = 2;
+                const int merge_ratio = hparams.n_merge;
                 const int pw = image_size_width  / patch_size;
                 const int ph = image_size_height / patch_size;
                 std::vector<int> positions(n_pos * 4);
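
The second hunk replaces the hardcoded merge_ratio of 2 with hparams.n_merge, so the position setup for Qwen2VL/Qwen3VL follows the model's actual spatial merge size instead of assuming a 2x2 merge. Below is a minimal, self-contained sketch of the idea, not the exact clip.cpp loop; the grid and merge values are made up:

    #include <cstdio>
    #include <utility>
    #include <vector>

    int main() {
        const int pw = 6, ph = 6;   // patch grid width/height (example values)
        const int merge_ratio = 3;  // would mis-order positions if the stride stayed at 2
        std::vector<std::pair<int, int>> pos;
        // visit patches block by block, with the block stride equal to the merge size
        for (int y = 0; y < ph; y += merge_ratio) {
            for (int x = 0; x < pw; x += merge_ratio) {
                for (int dy = 0; dy < merge_ratio; dy++) {
                    for (int dx = 0; dx < merge_ratio; dx++) {
                        pos.push_back({y + dy, x + dx}); // (row, col) of each patch
                    }
                }
            }
        }
        printf("visited %zu patches in %dx%d blocks\n", pos.size(), merge_ratio, merge_ratio);
        return 0;
    }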