Fixes Qwen2.5VL segfault during inference with https://github.com/ggml-org/llama...

author LostRuins Concedo <redacted>

Sun, 27 Apr 2025 10:43:37 +0000 (18:43 +0800)

committer GitHub <redacted>

Sun, 27 Apr 2025 10:43:37 +0000 (12:43 +0200)
author LostRuins Concedo <redacted>
Sun, 27 Apr 2025 10:43:37 +0000 (18:43 +0800)
committer GitHub <redacted>
Sun, 27 Apr 2025 10:43:37 +0000 (12:43 +0200)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp

index b6a1f40e8a580fba18b4e3c5bcbbac00f8b9b3d0..3cd27d5b17a083f436af4a93d6fc20be1d69f556 100644 (file)
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -1718,7 +1718,8 @@ struct clip_model_loader {
  
                  if (ctx_clip.proj_type == PROJECTOR_TYPE_MINICPMV
                          || ctx_clip.proj_type == PROJECTOR_TYPE_GLM_EDGE
-                        || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL) {
+                        || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN2VL
+                        || ctx_clip.proj_type == PROJECTOR_TYPE_QWEN25VL) {
                      n_layer += 1;
                  }
  
@@ -2744,7 +2745,7 @@ bool clip_image_preprocess(struct clip_ctx * ctx, const clip_image_u8 * img, str
          }
          return true;
      }
-    else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
+    else if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
          clip_image_u8 resized;
          auto patch_size = clip_get_patch_size(ctx) * 2;
          int nx = ceil((float)img->nx / patch_size) * patch_size;
@@ -3139,7 +3140,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
      else {
          // non-minicpmv models
  
-        if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL) {
+        if (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
              // pw * ph = number of tokens output by ViT after apply patch merger
              // ipw * ipw = number of vision token been processed inside ViT
              const int merge_ratio = 2;
@@ -3279,7 +3280,7 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
          }
      }
  
-    if (use_window_attn && ctx->proj_type == PROJECTOR_TYPE_QWEN25VL) {
+    if (use_window_attn && (ctx->proj_type == PROJECTOR_TYPE_QWEN2VL || ctx->proj_type == PROJECTOR_TYPE_QWEN25VL)) {
          struct ggml_tensor * window_idx = ggml_graph_get_tensor(gf, "window_idx");
          struct ggml_tensor * inv_window_idx = ggml_graph_get_tensor(gf, "inv_window_idx");
          struct ggml_tensor * window_mask = ggml_graph_get_tensor(gf, "window_mask");
author	LostRuins Concedo <redacted>
	Sun, 27 Apr 2025 10:43:37 +0000 (18:43 +0800)
committer	GitHub <redacted>
	Sun, 27 Apr 2025 10:43:37 +0000 (12:43 +0200)