From: Alex Brooks
Date: Fri, 21 Feb 2025 06:11:03 +0000 (-0700)
Subject: clip : fix visual encoders with no CLS (#11982)
X-Git-Tag: upstream/0.0.4853~104
X-Git-Url: https://git.djapps.eu/?a=commitdiff_plain;h=ee02ad02c56ff36a5edd22d8617ab3f9546ce7fe;p=pkg%2Fggml%2Fsources%2Fllama.cpp

clip : fix visual encoders with no CLS (#11982)

Signed-off-by: Alex-Brooks
---

diff --git a/examples/llava/clip.cpp b/examples/llava/clip.cpp
index 271cf2a2..bf70e351 100644
--- a/examples/llava/clip.cpp
+++ b/examples/llava/clip.cpp
@@ -2712,9 +2712,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
 
             if (!ctx->has_glm_projector) {
                 struct ggml_tensor * patches = ggml_graph_get_tensor(gf, "patches");
+                // The patches vector is used to get rows to index into the embeds with;
+                // we should skip dim 0 only if we have CLS to avoid going out of bounds
+                // when retrieving the rows.
+                int patch_offset = ctx->has_class_embedding ? 1 : 0;
                 int* patches_data = (int*)malloc(ggml_nbytes(patches));
                 for (int i = 0; i < num_patches; i++) {
-                    patches_data[i] = i + 1;
+                    patches_data[i] = i + patch_offset;
                 }
                 ggml_backend_tensor_set(patches, patches_data, 0, ggml_nbytes(patches));
                 free(patches_data);