ggml_tensor * v = ggml_permute(ctx0, v_cur, 1, 2, 0, 3);
v = ggml_cont(ctx0, v);
- const auto n_tokens = q->ne[1];
- const auto n_head = q->ne[2];
-
ggml_tensor * kq = ggml_mul_mat(ctx0, k, q);
    // F32 may not be needed for vision encoders?
// ggml_mul_mat_set_prec(kq, GGML_PREC_F32);
ggml_tensor * kqv = ggml_mul_mat(ctx0, v, kq);
cur = ggml_permute(ctx0, kqv, 0, 2, 1, 3);
- cur = ggml_cont_2d(ctx0, cur, cur->ne[0]*n_head, n_tokens);
+ cur = ggml_cont_2d(ctx0, cur, cur->ne[0] * cur->ne[1], cur->ne[2] * cur->ne[3]);
}
cb(cur, "kqv_out", il);
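
With the n_tokens / n_head locals removed, ggml_cont_2d now takes its target shape directly from the permuted tensor, which also folds a possible batch dimension (ne[3]) into the second dimension instead of assuming it is 1. For reference, a sketch of the tensor shapes along this path, assuming the usual [head_dim, n_head, n_tokens, batch] layout of the incoming q/k/v in clip.cpp (that layout is not shown in this hunk):

// Assumed shapes (ggml order, ne[0] fastest; not asserted by this diff):
// q, k                               : [head_dim, n_tokens, n_head, batch]
// v (after the permute + cont above) : [n_tokens, head_dim, n_head, batch]
// kq  = mul_mat(k, q)                : [n_tokens, n_tokens, n_head, batch]
// kqv = mul_mat(v, kq)               : [head_dim, n_tokens, n_head, batch]
// cur = permute(kqv, 0, 2, 1, 3)     : [head_dim, n_head, n_tokens, batch]
// ggml_cont_2d(cur, ne[0]*ne[1], ne[2]*ne[3]) -> [head_dim*n_head, n_tokens*batch]
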
clip_context_params ctx_clip_params {
/* use_gpu */ ctx_params.use_gpu,
- /* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO,
+ /* flash_attn_type */ mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type),
/* image_min_tokens */ ctx_params.image_min_tokens,
/* image_max_tokens */ ctx_params.image_max_tokens,
/* warmup */ ctx_params.warmup,
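
The hard-coded CLIP_FLASH_ATTN_TYPE_AUTO is replaced by a value derived from ctx_params.flash_attn_type via mtmd_get_clip_flash_attn_type, so the user-facing flash-attention setting propagates down to the vision encoder. A minimal sketch of such a mapping helper, assuming a one-to-one correspondence between the llama-level and clip-level enum values (the actual definition is not part of this excerpt):

static clip_flash_attn_type mtmd_get_clip_flash_attn_type(enum llama_flash_attn_type flash_attn_type) {
    // assumed 1:1 mapping between the llama-level and clip-level enums
    switch (flash_attn_type) {
        case LLAMA_FLASH_ATTN_TYPE_AUTO:     return CLIP_FLASH_ATTN_TYPE_AUTO;
        case LLAMA_FLASH_ATTN_TYPE_DISABLED: return CLIP_FLASH_ATTN_TYPE_DISABLED;
        case LLAMA_FLASH_ATTN_TYPE_ENABLED:  return CLIP_FLASH_ATTN_TYPE_ENABLED;
    }
    GGML_ABORT("invalid flash_attn_type");
}
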
echo "Include BIG and HUGE models..."
fi
+# Check if the first or second argument is "flash_off", then disable flash attention
+# This is useful to test that everything still works correctly with flash attention off
+FLASH_ATTN="on"
+if [ "${2:-}" = "flash_off" ] || [ "${1:-}" = "flash_off" ]; then
+ FLASH_ATTN="off"
+ echo "Flash attention disabled..."
+fi
+
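With this, disabling flash attention becomes a command-line switch of the test script; assuming it is invoked directly as tests.sh (the script name is not shown in this excerpt), usage would be along the lines of:

./tests.sh                # default: flash attention on
./tests.sh flash_off      # flash attention off
./tests.sh big flash_off  # big models, flash attention off
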
###############
arr_prefix=()
-hf $(printf %q "$hf") \
--image $(printf %q "$SCRIPT_DIR/$inp_file") \
--temp 0 -n 128 \
+ --flash-attn $(printf %q "$FLASH_ATTN") \
${extra_args}"
# if extra_args does not contain -p, we add a default prompt