{
// ref: https://huggingface.co/mistral-community/pixtral-12b/blob/main/preprocessor_config.json
// TODO: verify the image_min_tokens
+ hparams.n_merge = 1; // the original pixtral does not use patch merging
hparams.rope_theta = 10000.0f;
// Reads KEY_SPATIAL_MERGE_SIZE into hparams.n_merge, after the default of 1 was assigned above.
// NOTE(review): presumably the trailing `false` marks the key as optional, so the default
// survives when the key is absent — confirm against get_u32.
get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.n_merge, false);
// NOTE(review): presumably (min, max) image-token bounds; the 8 looks like the value the
// TODO above asks to verify — confirm against set_limit_image_tokens.
hparams.set_limit_image_tokens(8, 1024);
}
// Returns an mtmd_context_params value populated with the defaults listed in the body.
// NOTE(review): this excerpt appears to carry diff markers — the `-` lines assign each field
// by name, the `+` lines replace them with a single initializer; both set the same values.
mtmd_context_params mtmd_context_params_default() {
- mtmd_context_params params;
- params.use_gpu = true;
- params.print_timings = true;
- params.n_threads = 4;
- params.verbosity = GGML_LOG_LEVEL_INFO;
- params.image_marker = MTMD_DEFAULT_IMAGE_MARKER;
- params.media_marker = mtmd_default_marker();
- params.flash_attn_type = LLAMA_FLASH_ATTN_TYPE_AUTO;
- params.image_min_tokens = -1;
- params.image_max_tokens = -1;
// Positional brace initialization: each value binds to a field by position, not by the name
// in the adjacent comment, so the list must stay in the struct's declaration order
// (the struct declaration is not visible here — verify when adding or reordering fields).
+ mtmd_context_params params {
+ /* use_gpu */ true,
+ /* print_timings */ true,
+ /* n_threads */ 4,
+ /* verbosity */ GGML_LOG_LEVEL_INFO,
+ /* image_marker */ MTMD_DEFAULT_IMAGE_MARKER,
+ /* media_marker */ mtmd_default_marker(),
+ /* flash_attn_type */ LLAMA_FLASH_ATTN_TYPE_AUTO,
+ /* image_min_tokens */ -1, // presumably "no custom limit" — confirm against the consumer
+ /* image_max_tokens */ -1, // presumably "no custom limit" — confirm against the consumer
+ };
return params;
}
throw std::runtime_error("media_marker must not be empty");
}
// Builds the clip_context_params handed to clip_init() from the caller-supplied ctx_params.
- clip_context_params ctx_clip_params;
- ctx_clip_params.use_gpu = ctx_params.use_gpu;
- ctx_clip_params.verbosity = ctx_params.verbosity;
- ctx_clip_params.flash_attn_type = mtmd_get_clip_flash_attn_type(ctx_params.flash_attn_type);
- // custom image token limits
- ctx_clip_params.image_min_tokens = ctx_params.image_min_tokens;
- ctx_clip_params.image_max_tokens = ctx_params.image_max_tokens;
// Positional brace initialization: values bind by position, so the order must match the
// declaration order of clip_context_params (not visible here — verify).
+ clip_context_params ctx_clip_params {
+ /* use_gpu */ ctx_params.use_gpu,
+ /* verbosity */ ctx_params.verbosity,
// NOTE(review): the removed code forwarded ctx_params.flash_attn_type through
// mtmd_get_clip_flash_attn_type(); the line below hard-codes AUTO instead. Confirm the
// caller's flash-attention setting is applied somewhere else, otherwise it is ignored.
+ /* flash_attn_type */ CLIP_FLASH_ATTN_TYPE_AUTO,
+ /* image_min_tokens */ ctx_params.image_min_tokens,
+ /* image_max_tokens */ ctx_params.image_max_tokens,
+ };
auto res = clip_init(mmproj_fname, ctx_clip_params);
ctx_v = res.ctx_v;