res->add_input(std::move(inp));
} else {
// Vision embedding path: use padding token (ID=0) embedding
+ // TODO: verify if this is the correct behavior in transformers implementation
const int64_t embd_size = model.tok_embd_per_layer->ne[0]; // n_embd_altup * n_layer
- // Extract and dequantize padding token embedding (column 0)
- ggml_tensor * padding_q = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
- ggml_tensor * padding_f32 = ggml_new_tensor_1d(ctx0, GGML_TYPE_F32, embd_size);
- inp_per_layer = ggml_cpy(ctx0, padding_q, padding_f32);
+ // Extract and dequantize padding token embedding (row 0)
+ ggml_tensor * padding = ggml_view_1d(ctx0, model.tok_embd_per_layer, embd_size, 0);
+ inp_per_layer = ggml_cast(ctx0, padding, GGML_TYPE_F32);
// Reshape to [n_embd_altup, n_layer, 1]
inp_per_layer = ggml_reshape_3d(ctx0, inp_per_layer, n_embd_altup, n_layer, 1);
return ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE;
}
-bool clip_is_mrope(const struct clip_ctx * ctx) {
- switch (ctx->proj_type()) {
- case PROJECTOR_TYPE_QWEN2VL:
- case PROJECTOR_TYPE_QWEN25VL:
- case PROJECTOR_TYPE_QWEN3VL:
- case PROJECTOR_TYPE_GLM4V:
- return true;
- default:
- return false;
- }
-}
-
// Returns whether the loaded model uses a LLaVA-style projector,
// as recorded in the model hyperparameters at load time.
// note for contributor: this clip_is_(model) pattern is deprecated
bool clip_is_llava(const struct clip_ctx * ctx) {
    const auto & hparams = ctx->model.hparams;
    return hparams.has_llava_projector;
}
int clip_is_minicpmv(const struct clip_ctx * ctx);
bool clip_is_glm(const struct clip_ctx * ctx);
-bool clip_is_mrope(const struct clip_ctx * ctx);
bool clip_is_llava(const struct clip_ctx * ctx);
// note for contributor: this clip_is_(model) pattern is deprecated
// do NOT add new functions like this
bool tok_row_end_trail = false;
bool ov_img_first = false;
- bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
-
// string template for slice image delimiters with row/col (idefics3)
std::string sli_img_start_tmpl;
void init_vision() {
GGML_ASSERT(ctx_v != nullptr);
- use_mrope = clip_is_mrope(ctx_v);
projector_type proj = clip_get_projector_type(ctx_v);
int minicpmv_version = clip_is_minicpmv(ctx_v);
}
mtmd_image_tokens_ptr image_tokens(new mtmd_image_tokens);
- if (ctx->use_mrope) {
+ if (mtmd_decode_use_mrope(ctx)) {
// for Qwen2VL, we need this information for M-RoPE decoding positions
image_tokens->nx = clip_n_output_tokens_x(ctx->ctx_v, batch_f32.entries[0].get());
image_tokens->ny = clip_n_output_tokens_y(ctx->ctx_v, batch_f32.entries[0].get());
bool mtmd_decode_use_non_causal(mtmd_context * ctx) {
switch (ctx->proj_type_v()) {
- case PROJECTOR_TYPE_QWEN2VL:
- case PROJECTOR_TYPE_QWEN25VL:
- case PROJECTOR_TYPE_QWEN3VL:
- case PROJECTOR_TYPE_YOUTUVL:
+ case PROJECTOR_TYPE_GEMMA3:
return true;
default:
return false;
}
// Whether decoding must use M-RoPE (multi-dimensional rotary position
// embedding) positions for this model's vision projector.
// Derived directly from the projector type so no cached flag is needed.
bool mtmd_decode_use_mrope(mtmd_context * ctx) {
    const auto proj = ctx->proj_type_v();
    // Qwen2-VL family and GLM4V position image tokens with M-RoPE.
    if (proj == PROJECTOR_TYPE_QWEN2VL
        || proj == PROJECTOR_TYPE_QWEN25VL
        || proj == PROJECTOR_TYPE_QWEN3VL
        || proj == PROJECTOR_TYPE_GLM4V) {
        return true;
    }
    return false;
}
bool mtmd_support_vision(mtmd_context * ctx) {