mtmd: pad mask for qwen2.5vl (#16954)

author Xuan-Son Nguyen <redacted>

Mon, 3 Nov 2025 09:25:55 +0000 (10:25 +0100)

committer GitHub <redacted>

Mon, 3 Nov 2025 09:25:55 +0000 (10:25 +0100)
author Xuan-Son Nguyen <redacted>
Mon, 3 Nov 2025 09:25:55 +0000 (10:25 +0100)
committer GitHub <redacted>
Mon, 3 Nov 2025 09:25:55 +0000 (10:25 +0100)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp

index 0784e69fcdf931002253ca94f712f3ba10bd605d..60516d582a5f36fcda47b1ca42adb484db8da76e 100644 (file)
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -761,6 +761,15 @@ struct clip_graph {
              ggml_set_name(window_mask, "window_mask");
              ggml_set_input(window_mask);
  
+            // if flash attn is used, we need to pad the mask and cast to f16
+            if (ctx->flash_attn_type == CLIP_FLASH_ATTN_TYPE_ENABLED) {
+                int n_pad = GGML_PAD(window_mask->ne[1], GGML_KQ_MASK_PAD) - window_mask->ne[1];
+                if (n_pad > 0) {
+                    window_mask = ggml_pad(ctx0, window_mask, 0, n_pad, 0, 0);
+                }
+                window_mask = ggml_cast(ctx0, window_mask, GGML_TYPE_F16);
+            }
+
              // inpL shape: [n_embd, n_patches_x * n_patches_y, batch_size]
              GGML_ASSERT(batch_size == 1);
              inpL = ggml_reshape_2d(ctx0, inpL, n_embd * 4, n_patches_x * n_patches_y * batch_size / 4);
author	Xuan-Son Nguyen <redacted>
	Mon, 3 Nov 2025 09:25:55 +0000 (10:25 +0100)
committer	GitHub <redacted>
	Mon, 3 Nov 2025 09:25:55 +0000 (10:25 +0100)