if chkhsh == "9b1be57e70d20d9501b2b3186e792d81181ae36ada3903c26f9fea418cf87206":
# ref: https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base
res = "llada-moe"
+ if chkhsh == "53e325976a6e142379c19b09afcae354f2f496f147afa8f9e189a33fe4e3024e":
+ # ref: https://huggingface.co/ibm-granite/granite-docling-258M
+ res = "granite-docling"
if res is None:
logger.warning("\n")
self.tensor_map = gguf.get_tensor_name_map(gguf.MODEL_ARCH.MMPROJ, self.block_count)
# load preprocessor config
+ self.preprocessor_config = {}
if not self.is_mistral_format:
with open(self.dir_model / "preprocessor_config.json", "r", encoding="utf-8") as f:
self.preprocessor_config = json.load(f)
self.gguf_writer.add_vision_projection_dim(self.n_embd_text)
# vision config
- self.gguf_writer.add_vision_image_size(self.find_vparam(["image_size"]))
+ self.image_size = self.find_vparam(["image_size"])
+ self.gguf_writer.add_vision_image_size(self.image_size)
self.gguf_writer.add_vision_patch_size(self.find_vparam(["patch_size"]))
self.gguf_writer.add_vision_embedding_length(self.find_vparam(["hidden_size"]))
self.gguf_writer.add_vision_feed_forward_length(self.find_vparam(["intermediate_size"]))
self.gguf_writer.add_vision_projector_scale_factor(self.global_config.get("scale_factor", 2))
self.gguf_writer.add_vision_use_gelu(True)
+ # Add the preprocessor longest edge size
+ preproc_image_size = self.preprocessor_config.get("size", {}).get("longest_edge", self.image_size)
+ self.gguf_writer.add_vision_preproc_image_size(preproc_image_size)
+
def tensor_force_quant(self, name, new_name, bid, n_dims):
if ".embeddings." in name:
return gguf.GGMLQuantizationType.F32
{"name": "exaone4", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/LGAI-EXAONE/EXAONE-4.0-32B", },
{"name": "mellum", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/JetBrains/Mellum-4b-base", },
{"name": "llada-moe", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/inclusionAI/LLaDA-MoE-7B-A1B-Base", },
+ {"name": "granite-docling", "tokt": TOKENIZER_TYPE.BPE, "repo": "https://huggingface.co/ibm-granite/granite-docling-258M", },
]
# some models are known to be broken upstream, so we will skip them as exceptions
class ClipVision:
IMAGE_SIZE = "clip.vision.image_size"
+ PREPROC_IMAGE_SIZE = "clip.vision.preproc_image_size"
PATCH_SIZE = "clip.vision.patch_size"
EMBEDDING_LENGTH = "clip.vision.embedding_length"
FEED_FORWARD_LENGTH = "clip.vision.feed_forward_length"
def add_vision_image_size(self, value: int) -> None:
self.add_uint32(Keys.ClipVision.IMAGE_SIZE, value)
+ def add_vision_preproc_image_size(self, value: int) -> None:
+ # Written as a uint32 under the clip.vision.preproc_image_size key.
+ self.add_uint32(Keys.ClipVision.PREPROC_IMAGE_SIZE, value)
+
def add_vision_image_mean(self, values: Sequence[float]) -> None:
self.add_array(Keys.ClipVision.IMAGE_MEAN, values)
case LLAMA_VOCAB_PRE_TYPE_OLMO:
case LLAMA_VOCAB_PRE_TYPE_JAIS:
case LLAMA_VOCAB_PRE_TYPE_TRILLION:
+ case LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING:
regex_exprs = {
"'s|'t|'re|'ve|'m|'ll|'d| ?\\p{L}+| ?\\p{N}+| ?[^\\s\\p{L}\\p{N}]+|\\s+(?!\\S)",
};
tokenizer_pre == "trillion") {
pre_type = LLAMA_VOCAB_PRE_TYPE_TRILLION;
clean_spaces = false;
+ } else if (
+ tokenizer_pre == "granite-docling") {
+ pre_type = LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING;
+ clean_spaces = false;
} else if (
tokenizer_pre == "bailingmoe" ||
tokenizer_pre == "llada-moe") {
// pre-tokenization types
enum llama_vocab_pre_type {
- LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
- LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
- LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
- LLAMA_VOCAB_PRE_TYPE_MPT = 5,
- LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
- LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
- LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
- LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
- LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
- LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
- LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
- LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
- LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
- LLAMA_VOCAB_PRE_TYPE_PORO = 15,
- LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
- LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
- LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
- LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
- LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
- LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
- LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
- LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
- LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
- LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
- LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
- LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
- LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
- LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
- LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
- LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
- LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
- LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
- LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
- LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
- LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
- LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
- LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
- LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
+ LLAMA_VOCAB_PRE_TYPE_DEFAULT = 0,
+ LLAMA_VOCAB_PRE_TYPE_LLAMA3 = 1,
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_LLM = 2,
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK_CODER = 3,
+ LLAMA_VOCAB_PRE_TYPE_FALCON = 4,
+ LLAMA_VOCAB_PRE_TYPE_MPT = 5,
+ LLAMA_VOCAB_PRE_TYPE_STARCODER = 6,
+ LLAMA_VOCAB_PRE_TYPE_GPT2 = 7,
+ LLAMA_VOCAB_PRE_TYPE_REFACT = 8,
+ LLAMA_VOCAB_PRE_TYPE_COMMAND_R = 9,
+ LLAMA_VOCAB_PRE_TYPE_STABLELM2 = 10,
+ LLAMA_VOCAB_PRE_TYPE_QWEN2 = 11,
+ LLAMA_VOCAB_PRE_TYPE_OLMO = 12,
+ LLAMA_VOCAB_PRE_TYPE_DBRX = 13,
+ LLAMA_VOCAB_PRE_TYPE_SMAUG = 14,
+ LLAMA_VOCAB_PRE_TYPE_PORO = 15,
+ LLAMA_VOCAB_PRE_TYPE_CHATGLM3 = 16,
+ LLAMA_VOCAB_PRE_TYPE_CHATGLM4 = 17,
+ LLAMA_VOCAB_PRE_TYPE_VIKING = 18,
+ LLAMA_VOCAB_PRE_TYPE_JAIS = 19,
+ LLAMA_VOCAB_PRE_TYPE_TEKKEN = 20,
+ LLAMA_VOCAB_PRE_TYPE_SMOLLM = 21,
+ LLAMA_VOCAB_PRE_TYPE_CODESHELL = 22,
+ LLAMA_VOCAB_PRE_TYPE_BLOOM = 23,
+ LLAMA_VOCAB_PRE_TYPE_GPT3_FINNISH = 24,
+ LLAMA_VOCAB_PRE_TYPE_EXAONE = 25,
+ LLAMA_VOCAB_PRE_TYPE_CHAMELEON = 26,
+ LLAMA_VOCAB_PRE_TYPE_MINERVA = 27,
+ LLAMA_VOCAB_PRE_TYPE_DEEPSEEK3_LLM = 28,
+ LLAMA_VOCAB_PRE_TYPE_GPT4O = 29,
+ LLAMA_VOCAB_PRE_TYPE_SUPERBPE = 30,
+ LLAMA_VOCAB_PRE_TYPE_TRILLION = 31,
+ LLAMA_VOCAB_PRE_TYPE_BAILINGMOE = 32,
+ LLAMA_VOCAB_PRE_TYPE_LLAMA4 = 33,
+ LLAMA_VOCAB_PRE_TYPE_PIXTRAL = 34,
+ LLAMA_VOCAB_PRE_TYPE_SEED_CODER = 35,
+ LLAMA_VOCAB_PRE_TYPE_HUNYUAN = 36,
+ LLAMA_VOCAB_PRE_TYPE_KIMI_K2 = 37,
+ LLAMA_VOCAB_PRE_TYPE_HUNYUAN_DENSE = 38,
+ LLAMA_VOCAB_PRE_TYPE_GROK_2 = 39,
+ // selected for tokenizer_pre == "granite-docling"; shares its pre-tokenizer regex case with OLMO, JAIS and TRILLION
+ LLAMA_VOCAB_PRE_TYPE_GRANITE_DOCLING = 40,
};
struct LLM_KV;
// vision-specific
#define KEY_IMAGE_SIZE "clip.vision.image_size"
+#define KEY_PREPROC_IMAGE_SIZE "clip.vision.preproc_image_size"
#define KEY_PATCH_SIZE "clip.vision.patch_size"
#define KEY_IMAGE_MEAN "clip.vision.image_mean"
#define KEY_IMAGE_STD "clip.vision.image_std"
int32_t projection_dim;
int32_t n_head;
int32_t n_layer;
- int32_t proj_scale_factor = 0; // idefics3
+ // idefics3
+ int32_t preproc_image_size = 0; // preprocessor longest-edge size (clip.vision.preproc_image_size); stays 0 unless the loader sets it
+ int32_t proj_scale_factor = 0;
float image_mean[3];
float image_std[3];
if (is_vision) {
get_u32(KEY_IMAGE_SIZE, hparams.image_size);
+ get_u32(KEY_PREPROC_IMAGE_SIZE, hparams.preproc_image_size, false);
get_u32(KEY_PATCH_SIZE, hparams.patch_size);
get_u32(KEY_IMAGE_CROP_RESOLUTION, hparams.image_crop_resolution, false);
get_i32(KEY_MINICPMV_VERSION, hparams.minicpmv_version, false); // legacy
// res_imgs->data[0] = *res;
res_imgs->entries.push_back(std::move(img_f32));
return true;
- }
- else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
+ } else if (ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3) {
+ // The refined size has two steps:
+ // 1. Resize w/ aspect-ratio preserving such that the longer side is
+ // the preprocessor longest size
+ // 2. Resize w/out preserving aspect ratio such that both sides are
+ // multiples of image_size (always rounding up)
+ //
+ // CITE: https://github.com/huggingface/transformers/blob/main/src/transformers/models/idefics3/image_processing_idefics3.py#L737
+ //
+ // NOTE(review): preproc_image_size defaults to 0 and its GGUF key is read
+ // with a trailing `false` (presumably "not required"). Confirm that
+ // calc_size_preserved_ratio behaves sensibly when it is 0, e.g. for files
+ // converted before the key existed.
+ const clip_image_size refined_size = image_manipulation::calc_size_preserved_ratio(
+ original_size, params.image_size, params.preproc_image_size);
+
+ // Describe the layout handed to llava_uhd::slice_image: a square
+ // image_size x image_size overview plus a grid of tiles over the
+ // refined image, with the grid dimensions rounded up.
+ llava_uhd::slice_instructions instructions;
+ instructions.overview_size = clip_image_size{params.image_size, params.image_size};
+ instructions.refined_size = refined_size;
+ instructions.grid_size = clip_image_size{
+ static_cast<int>(std::ceil(static_cast<float>(refined_size.width) / params.image_size)),
+ static_cast<int>(std::ceil(static_cast<float>(refined_size.height) / params.image_size)),
+ };
+ // Enumerate tiles row by row (y outer, x inner). std::min clamps a tile
+ // that would otherwise extend past the right/bottom edge of the refined image.
+ for (int y = 0; y < refined_size.height; y += params.image_size) {
+ for (int x = 0; x < refined_size.width; x += params.image_size) {
+ instructions.slices.push_back(llava_uhd::slice_coordinates{
+ /* x */x,
+ /* y */y,
+ /* size */clip_image_size{
+ std::min(params.image_size, refined_size.width - x),
+ std::min(params.image_size, refined_size.height - y)
+ }
+ });
+ }
+ }
+ auto imgs = llava_uhd::slice_image(img, instructions);
+
+ // cast and normalize to f32
+ for (size_t i = 0; i < imgs.size(); ++i) {
+ // clip_image_save_to_bmp(*imgs[i], "slice_" + std::to_string(i) + ".bmp");
+ clip_image_f32_ptr res(clip_image_f32_init());
+ normalize_image_u8_to_f32(*imgs[i], *res, params.image_mean, params.image_std);
+ res_imgs->entries.push_back(std::move(res));
+ }
+
+ // Report the tile grid: grid_x = number of columns, grid_y = number of rows.
+ res_imgs->grid_x = instructions.grid_size.width;
+ res_imgs->grid_y = instructions.grid_size.height;
+ return true;
+ } else if (ctx->proj_type() == PROJECTOR_TYPE_GLM_EDGE
|| ctx->proj_type() == PROJECTOR_TYPE_GEMMA3
- || ctx->proj_type() == PROJECTOR_TYPE_IDEFICS3
|| ctx->proj_type() == PROJECTOR_TYPE_INTERNVL // TODO @ngxson : support dynamic resolution
) {
clip_image_u8 resized_image;
MTMD_SLICE_TMPL_MINICPMV_2_5,
MTMD_SLICE_TMPL_MINICPMV_2_6,
MTMD_SLICE_TMPL_LLAMA4,
- // TODO @ngxson : add support for idefics (SmolVLM)
+ MTMD_SLICE_TMPL_IDEFICS3,
};
const char * mtmd_default_marker() {
// for llava-uhd style models, we need special tokens in-between slices
// minicpmv calls them "slices", llama 4 calls them "tiles"
mtmd_slice_tmpl slice_tmpl = MTMD_SLICE_TMPL_NONE;
- llama_token tok_ov_img_start = LLAMA_TOKEN_NULL; // overview image
- llama_token tok_ov_img_end = LLAMA_TOKEN_NULL; // overview image
- llama_token tok_slices_start = LLAMA_TOKEN_NULL; // start of all slices
- llama_token tok_slices_end = LLAMA_TOKEN_NULL; // end of all slices
- llama_token tok_sli_img_start = LLAMA_TOKEN_NULL; // single slice start
- llama_token tok_sli_img_end = LLAMA_TOKEN_NULL; // single slice end
- llama_token tok_sli_img_mid = LLAMA_TOKEN_NULL; // between 2 slices
- llama_token tok_row_end = LLAMA_TOKEN_NULL; // end of row
+ // Delimiter token sequences. A default-constructed (empty) vector takes over
+ // the role of the former LLAMA_TOKEN_NULL default (see the .empty() check on
+ // tok_sli_img_start where slices are emitted).
+ // NOTE(review): entries are filled from lookup_token(); if that can return
+ // LLAMA_TOKEN_NULL the vector is non-empty but holds a null token. Confirm
+ // add_text() tolerates this, since the explicit != LLAMA_TOKEN_NULL guards at
+ // the call sites were removed.
+ std::vector<llama_token> tok_ov_img_start; // overview image
+ std::vector<llama_token> tok_ov_img_end; // overview image
+ std::vector<llama_token> tok_slices_start; // start of all slices
+ std::vector<llama_token> tok_slices_end; // end of all slices
+ std::vector<llama_token> tok_sli_img_start; // single slice start
+ std::vector<llama_token> tok_sli_img_end; // single slice end
+ std::vector<llama_token> tok_sli_img_mid; // between 2 slices
+ std::vector<llama_token> tok_row_end; // end of row
bool tok_row_end_trail = false;
bool ov_img_first = false;
bool use_mrope = false; // for Qwen2VL, we need to use M-RoPE
+ // string template for slice image delimiters with row/col (idefics3)
+ std::string sli_img_start_tmpl;
+
// for whisper, we pre-calculate the mel filter bank
whisper_preprocessor::whisper_filters w_filters;
// minicpmv 2.5 format:
// <image> (overview) </image><slice><image> (slice) </image><image> (slice) </image>\n ... </slice>
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_5;
- tok_ov_img_start = lookup_token("<image>");
- tok_ov_img_end = lookup_token("</image>");
- tok_slices_start = lookup_token("<slice>");
- tok_slices_end = lookup_token("</slice>");
+ tok_ov_img_start = {lookup_token("<image>")};
+ tok_ov_img_end = {lookup_token("</image>")};
+ tok_slices_start = {lookup_token("<slice>")};
+ tok_slices_end = {lookup_token("</slice>")};
tok_sli_img_start = tok_ov_img_start;
tok_sli_img_end = tok_ov_img_end;
- tok_row_end = lookup_token("\n");
+ tok_row_end = {lookup_token("\n")};
tok_row_end_trail = false; // no trailing end-of-row token
ov_img_first = true;
// minicpmv 2.6 format:
// <image> (overview) </image><slice> (slice) </slice><slice> (slice) </slice>\n ...
slice_tmpl = MTMD_SLICE_TMPL_MINICPMV_2_6;
- tok_ov_img_start = lookup_token("<image>");
- tok_ov_img_end = lookup_token("</image>");
- tok_sli_img_start = lookup_token("<slice>");
- tok_sli_img_end = lookup_token("</slice>");
- tok_row_end = lookup_token("\n");
+ tok_ov_img_start = {lookup_token("<image>")};
+ tok_ov_img_end = {lookup_token("</image>")};
+ tok_sli_img_start = {lookup_token("<slice>")};
+ tok_sli_img_end = {lookup_token("</slice>")};
+ tok_row_end = {lookup_token("\n")};
tok_row_end_trail = false; // no trailing end-of-row token
ov_img_first = true;
// <|image|> (overview) <-- overview image is last
// <|image_end|>
slice_tmpl = MTMD_SLICE_TMPL_LLAMA4;
- tok_ov_img_start = lookup_token("<|image|>");
- tok_sli_img_mid = lookup_token("<|tile_x_separator|>");
- tok_row_end = lookup_token("<|tile_y_separator|>");
+ tok_ov_img_start = {lookup_token("<|image|>")};
+ tok_sli_img_mid = {lookup_token("<|tile_x_separator|>")};
+ tok_row_end = {lookup_token("<|tile_y_separator|>")};
tok_row_end_trail = true; // add trailing end-of-row token
ov_img_first = false; // overview image is last
}
} else if (proj == PROJECTOR_TYPE_IDEFICS3) {
// https://github.com/huggingface/transformers/blob/a42ba80fa520c784c8f11a973ca9034e5f859b79/src/transformers/models/idefics3/processing_idefics3.py#L192-L215
- img_beg = "<fake_token_around_image><global-img>";
- img_end = "<fake_token_around_image>";
+ // idefics3 format resulting from the fields below (overview image is last,
+ // because ov_img_first keeps its default of false; R and C are 1-based):
+ // <fake_token_around_image><row_R_col_C> (slice) ... \n ... (next row) ...
+ // \n<fake_token_around_image><global-img> (overview) <fake_token_around_image>
+ // NOTE(review): tok_row_end_trail also keeps its default (false), so no "\n"
+ // follows the last row and a single newline precedes the overview. The linked
+ // HF processor appears to emit "\n" after every row, including the last,
+ // before "\n<fake_token_around_image><global-img>". TODO confirm the newline
+ // count matches the reference.
+ slice_tmpl = MTMD_SLICE_TMPL_IDEFICS3;
+ tok_ov_img_start = {lookup_token("\n"), lookup_token("<fake_token_around_image>"), lookup_token("<global-img>")};
+ tok_ov_img_end = {lookup_token("<fake_token_around_image>")};
+ tok_row_end = {lookup_token("\n")};
+ // NOTE(review): how img_beg combines with the per-slice <fake_token_around_image>
+ // prefix is decided outside this hunk; verify it does not duplicate the token.
+ img_beg = "<fake_token_around_image>";
+ // printf-style template; filled with (row, col) for each slice
+ sli_img_start_tmpl = "<fake_token_around_image><row_%d_col_%d>";
} else if (proj == PROJECTOR_TYPE_PIXTRAL) {
// https://github.com/huggingface/transformers/blob/1cd110c6cb6a6237614130c470e9a902dbc1a4bd/docs/source/en/model_doc/pixtral.md
ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_5
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_MINICPMV_2_6
|| ctx->slice_tmpl == MTMD_SLICE_TMPL_LLAMA4
+ || ctx->slice_tmpl == MTMD_SLICE_TMPL_IDEFICS3
) {
const int n_col = batch_f32.grid_x;
const int n_row = batch_f32.grid_y;
// add overview image (first)
if (ctx->ov_img_first) {
- if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
- add_text({ctx->tok_ov_img_start});
- }
+ add_text(ctx->tok_ov_img_start);
cur.entries.emplace_back(std::move(ov_chunk));
- if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
- add_text({ctx->tok_ov_img_end});
- }
+ add_text(ctx->tok_ov_img_end);
}
// add slices (or tiles)
if (!chunks.empty()) {
GGML_ASSERT((int)chunks.size() == n_row * n_col);
- if (ctx->tok_slices_start != LLAMA_TOKEN_NULL) {
- add_text({ctx->tok_slices_start});
- }
+ add_text(ctx->tok_slices_start);
for (int y = 0; y < n_row; y++) {
for (int x = 0; x < n_col; x++) {
const bool is_last_in_row = (x == n_col - 1);
- if (ctx->tok_sli_img_start != LLAMA_TOKEN_NULL) {
- add_text({ctx->tok_sli_img_start});
+ if (!ctx->tok_sli_img_start.empty()) {
+ add_text(ctx->tok_sli_img_start);
+ } else if (!ctx->sli_img_start_tmpl.empty()) {
+ // No fixed start tokens: use the printf-style template to precede the
+ // slice image, passing the 1-based row (y+1) and column (x+1).
+ // The first snprintf call (null buffer, size 0) only measures the
+ // formatted length; +1 leaves room for the NUL terminator.
+ const size_t sz = std::snprintf(nullptr, 0, ctx->sli_img_start_tmpl.c_str(), y+1, x+1) + 1;
+ std::unique_ptr<char[]> buf(new char[sz]);
+ std::snprintf(buf.get(), sz, ctx->sli_img_start_tmpl.c_str(), y+1, x+1);
+ // Build the string without the trailing NUL (hence sz - 1).
+ // NOTE(review): the meaning of the second add_text argument (true) is
+ // defined outside this hunk; presumably it enables special-token parsing. Confirm.
+ add_text(std::string(buf.get(), buf.get() + sz - 1), true);
}
cur.entries.emplace_back(std::move(chunks[y * n_col + x]));
- if (ctx->tok_sli_img_end != LLAMA_TOKEN_NULL) {
- add_text({ctx->tok_sli_img_end});
- }
- if (!is_last_in_row && ctx->tok_sli_img_mid != LLAMA_TOKEN_NULL) {
- add_text({ctx->tok_sli_img_mid});
+ add_text(ctx->tok_sli_img_end);
+ if (!is_last_in_row) {
+ add_text(ctx->tok_sli_img_mid);
}
}
- if ((y != n_row - 1 || ctx->tok_row_end_trail) && ctx->tok_row_end != LLAMA_TOKEN_NULL) {
- add_text({ctx->tok_row_end});
+ if ((y != n_row - 1 || ctx->tok_row_end_trail)) {
+ add_text(ctx->tok_row_end);
}
}
- if (ctx->tok_slices_end != LLAMA_TOKEN_NULL) {
- add_text({ctx->tok_slices_end});
- }
+ add_text(ctx->tok_slices_end);
}
// add overview image (last)
if (!ctx->ov_img_first) {
- if (ctx->tok_ov_img_start != LLAMA_TOKEN_NULL) {
- add_text({ctx->tok_ov_img_start});
- }
+ add_text(ctx->tok_ov_img_start);
cur.entries.emplace_back(std::move(ov_chunk));
- if (ctx->tok_ov_img_end != LLAMA_TOKEN_NULL) {
- add_text({ctx->tok_ov_img_end});
- }
+ add_text(ctx->tok_ov_img_end);
}
} else {
ctx->image_embd_v.resize(image_tokens->n_tokens() * n_mmproj_embd);
bool ok = false;
- if (clip_is_llava(ctx_clip) || clip_is_minicpmv(ctx_clip) || clip_is_glm(ctx_clip)) {
+ if (clip_is_llava(ctx_clip)
+ || clip_is_minicpmv(ctx_clip)
+ || clip_is_glm(ctx_clip)) {
// TODO @ngxson : llava does not support batched encoding ; this should be fixed inside clip_image_batch_encode()
const auto & entries = image_tokens->batch_f32.entries;
for (size_t i = 0; i < entries.size(); i++) {
add_test_vision "ggml-org/InternVL3-1B-Instruct-GGUF:Q8_0"
add_test_vision "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"
add_test_vision "ggml-org/LFM2-VL-450M-GGUF:Q8_0"
+add_test_vision "ggml-org/granite-docling-258M-GGUF:Q8_0"
add_test_audio "ggml-org/ultravox-v0_5-llama-3_2-1b-GGUF:Q8_0"
add_test_audio "ggml-org/Qwen2.5-Omni-3B-GGUF:Q4_K_M"