clip : remove boi/eoi embeddings for GLM-edge model (#13081)

author Xuan-Son Nguyen <redacted>

Thu, 24 Apr 2025 20:17:04 +0000 (22:17 +0200)

committer GitHub <redacted>

Thu, 24 Apr 2025 20:17:04 +0000 (22:17 +0200)
author Xuan-Son Nguyen <redacted>
Thu, 24 Apr 2025 20:17:04 +0000 (22:17 +0200)
committer GitHub <redacted>
Thu, 24 Apr 2025 20:17:04 +0000 (22:17 +0200)
diff --git a/examples/llava/clip-impl.h b/examples/llava/clip-impl.h

index 8d310fb0271c5d15a8acc1fedcaa926915e26f37..53ac38130476557a34a63d3f8e611b4a316d9cd3 100644 (file)
--- a/examples/llava/clip-impl.h
+++ b/examples/llava/clip-impl.h
@@ -90,8 +90,6 @@
  #define TN_GLM_ADAPTER_D_H_2_4H "adapter.linear.dense_h_to_4h.%s"
  #define TN_GLM_ADAPTER_GATE     "adapter.linear.gate.%s"
  #define TN_GLM_ADAPTER_D_4H_2_H "adapter.linear.dense_4h_to_h.%s"
-#define TN_GLM_BOI_W            "adapter.boi"
-#define TN_GLM_EOI_W            "adapter.eoi"
  
  enum projector_type {
      PROJECTOR_TYPE_MLP,
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp

index 4eec4a264679888a754dd60e46ea7bdc295778ad..9a5ab7c819585c3b1109f4f0099115f35b0c05b9 100644 (file)
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -244,8 +244,6 @@ struct clip_vision_model {
      //GLMV-Edge projection
      struct ggml_tensor * mm_model_adapter_conv_w = nullptr;
      struct ggml_tensor * mm_model_adapter_conv_b = nullptr;
-    struct ggml_tensor * boi_w = nullptr;
-    struct ggml_tensor * eoi_w = nullptr;
  
      // MobileVLM projection
      struct ggml_tensor * mm_model_mlp_1_w = nullptr;
@@ -1697,8 +1695,6 @@ struct clip_model_loader {
                      vision_model.mm_model_mlp_1_w = get_tensor(string_format(TN_GLM_ADAPTER_D_H_2_4H,"weight"));
                      vision_model.mm_model_mlp_2_w = get_tensor(string_format(TN_GLM_ADAPTER_GATE,"weight"));
                      vision_model.mm_model_mlp_3_w = get_tensor(string_format(TN_GLM_ADAPTER_D_4H_2_H,"weight"));
-                    vision_model.boi_w = get_tensor(TN_GLM_BOI_W);
-                    vision_model.eoi_w = get_tensor(TN_GLM_EOI_W);
                  } break;
              case PROJECTOR_TYPE_MERGER:
                  {
@@ -2593,8 +2589,7 @@ void clip_free(clip_ctx * ctx) {
  }
  
  size_t clip_embd_nbytes(const struct clip_ctx * ctx) {
-    int extra_tokens = ctx->has_glm_projector ? 2 : 0;
-    return (clip_n_patches(ctx) + extra_tokens) * clip_n_mmproj_embd(ctx) * sizeof(float);
+    return clip_n_patches(ctx) * clip_n_mmproj_embd(ctx) * sizeof(float);
  }
  
  size_t clip_embd_nbytes_by_img(const struct clip_ctx * ctx, int img_h, int img_w) {
@@ -2790,9 +2785,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
      }
      if (ctx->has_glm_projector) {
          GGML_ASSERT(batch_size == 1);
-        ggml_tensor * boi = ctx->vision_model.boi_w;
-        ggml_backend_tensor_get(boi,vec,0,ggml_nbytes(boi));
-        vec = (float*)(vec+ggml_nelements(boi)); //offset for boi
      }
  
      // build the inference graph
@@ -3001,13 +2993,6 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
      // copy the embeddings to the location passed by the user
      ggml_backend_tensor_get(embeddings, vec, 0, ggml_nbytes(embeddings));
  
-    if (ctx->has_glm_projector) {
-        //eoi
-        ggml_tensor * eoi = ctx->vision_model.eoi_w;
-        int offset = ggml_nelements(embeddings);
-        ggml_backend_tensor_get(eoi, vec+offset, 0, ggml_nbytes(eoi));
-    }
-
      return true;
  }
  
diff --git a/examples/llava/mtmd.cpp b/examples/llava/mtmd.cpp

index 11ca7b30f1ac608aa09912ef6cb6238bb4b5028c..a994ef0166e6a607ac1092dbb56fdeac0f39ed71 100644 (file)
--- a/examples/llava/mtmd.cpp
+++ b/examples/llava/mtmd.cpp
@@ -186,6 +186,11 @@ int32_t mtmd_tokenize(mtmd_context * ctx,
          marker_modified = "<start_of_image>" + ctx->image_marker + "<end_of_image>";
          string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
  
+    } else if (proj_type == PROJECTOR_TYPE_GLM_EDGE) {
+        // <|begin_of_image|> ... (image embeddings) ... <|end_of_image|>
+        marker_modified = "<|begin_of_image|>" + ctx->image_marker + "<|end_of_image|>";
+        string_replace_all(prompt_modified, ctx->image_marker, marker_modified);
+
      } else if (proj_type == PROJECTOR_TYPE_IDEFICS3) {
          // https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
          marker_modified = "<fake_token_around_image><global-img>" + ctx->image_marker + "<fake_token_around_image>";
author	Xuan-Son Nguyen <redacted>
	Thu, 24 Apr 2025 20:17:04 +0000 (22:17 +0200)
committer	GitHub <redacted>
	Thu, 24 Apr 2025 20:17:04 +0000 (22:17 +0200)
examples/llava/clip-impl.h		patch | blob | history
examples/llava/clip.cpp		patch | blob | history
examples/llava/mtmd.cpp		patch | blob | history