clip : cap max image size 1024 for qwen vl model (#13478)

author Xuan-Son Nguyen <redacted>

Mon, 12 May 2025 13:06:51 +0000 (15:06 +0200)

committer GitHub <redacted>

Mon, 12 May 2025 13:06:51 +0000 (15:06 +0200)
author Xuan-Son Nguyen <redacted>
Mon, 12 May 2025 13:06:51 +0000 (15:06 +0200)
committer GitHub <redacted>
Mon, 12 May 2025 13:06:51 +0000 (15:06 +0200)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp

index 0adf03163fcc45b05e418982ce37ba09442af99d..41ba45a79b5aba26b8edf45821c318a91576ab2c 100644 (file)
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1909,16 +1909,20 @@ struct clip_model_loader {
                      } break;
                  case PROJECTOR_TYPE_QWEN2VL:
                      {
-                        // max image size = sqrt(max_pixels)
-                        // https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json
-                        hparams.image_size = 3584;
+                        // max image size = sqrt(max_pixels) = 3584
+                        // ref: https://huggingface.co/Qwen/Qwen2-VL-7B-Instruct/blob/main/preprocessor_config.json
+                        // however, the model uses an unreasonable amount of memory for images larger than 1024, so we cap the size at 1024; otherwise it is unusable
+                        // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10
+                        hparams.image_size = 1024;
                          hparams.warmup_image_size = hparams.patch_size * 8;
                      } break;
                  case PROJECTOR_TYPE_QWEN25VL:
                      {
                          // max image size = sqrt(max_pixels)
                          // https://huggingface.co/Qwen/Qwen2.5-VL-7B-Instruct/blob/main/preprocessor_config.json
-                        hparams.image_size = 3584;
+                        // however, the model uses an unreasonable amount of memory for images larger than 1024, so we cap the size at 1024; otherwise it is unusable
+                        // ref: https://huggingface.co/Qwen/Qwen2-VL-2B-Instruct/discussions/10
+                        hparams.image_size = 1024;
                          hparams.warmup_image_size = hparams.patch_size * 8;
                          get_u32(KEY_WIN_ATTN_PATTERN, hparams.n_wa_pattern);
                      } break;
author	Xuan-Son Nguyen <redacted>
	Mon, 12 May 2025 13:06:51 +0000 (15:06 +0200)
committer	GitHub <redacted>
	Mon, 12 May 2025 13:06:51 +0000 (15:06 +0200)