clip : fix visual encoders with no CLS (#11982)

author Alex Brooks <redacted>

Fri, 21 Feb 2025 06:11:03 +0000 (23:11 -0700)

committer GitHub <redacted>

Fri, 21 Feb 2025 06:11:03 +0000 (08:11 +0200)
author Alex Brooks <redacted>
Fri, 21 Feb 2025 06:11:03 +0000 (23:11 -0700)
committer GitHub <redacted>
Fri, 21 Feb 2025 06:11:03 +0000 (08:11 +0200)
diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp

index 271cf2a2ac1ad2e1950980deb477eca2ab398c81..bf70e351c9030753598dccf48f7b7f502de305e3 100644 (file)
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2712,9 +2712,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
  
              if (!ctx->has_glm_projector) {
                  struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+                // The patches vector is used to get rows to index into the embeds with;
+                // we should skip dim 0 only if we have CLS to avoid going out of bounds
+                // when retrieving the rows.
+                int patch_offset = ctx->has_class_embedding ? 1 : 0;
                  int* patches_data = (int*)malloc(ggml_nbytes(patches));
                  for (int i = 0; i < num_patches; i++) {
-                    patches_data[i] = i + 1;
+                    patches_data[i] = i + patch_offset;
                  }
                  ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
                  free(patches_data);
author	Alex Brooks <redacted>
	Fri, 21 Feb 2025 06:11:03 +0000 (23:11 -0700)
committer	GitHub <redacted>
	Fri, 21 Feb 2025 06:11:03 +0000 (08:11 +0200)