mtmd : fix Pixtral OOM with large images by capping image_size to 1024 (#14326)
author yuiseki <redacted>
Sun, 22 Jun 2025 12:44:57 +0000 (21:44 +0900)
committer GitHub <redacted>
Sun, 22 Jun 2025 12:44:57 +0000 (14:44 +0200)
Mistral Small 2506 models using the Pixtral vision encoder were running out
of GPU memory when processing images larger than 1024x1024 pixels, because
attention memory grows quadratically with the patch count when the image
size is left unbounded.

This fix applies the same 1024x1024 limit used by Qwen2VL models to
prevent OOM issues while maintaining compatibility with existing models.
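
For scale (an illustrative sketch, not llama.cpp code; the 16-pixel patch
size and a single fp16 attention score matrix per layer are assumptions),
the score matrix is n_patches x n_patches, so its footprint grows with the
fourth power of the image edge:

    // Illustrative only: estimate one fp16 attention score matrix for a
    // ViT-style encoder, assuming a 16-pixel patch size.
    #include <cstdint>
    #include <cstdio>

    int main() {
        const int patch_size = 16;                      // assumed patch size
        const int edges[]    = {512, 1024, 2048, 4096}; // image edge in pixels
        for (int edge : edges) {
            const int64_t n_patches = (int64_t)(edge / patch_size) * (edge / patch_size);
            const double  attn_mib  = (double)n_patches * n_patches * 2.0 / (1024.0 * 1024.0);
            printf("%4dpx -> %6lld patches, ~%8.1f MiB per attention matrix\n",
                   edge, (long long)n_patches, attn_mib);
        }
        return 0;
    }

At 1024px this is roughly 32 MiB per matrix; at 4096px it is already around
8 GiB, which is consistent with the reported OOM.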

tools/mtmd/clip.cpp

index 30283d6f1f032e70a2fe3a4797fedc0566ae6c5b..a990520ed3fbbc3c006a8d9107d3655b8419f0ac 100644
@@ -2211,6 +2211,9 @@ struct clip_model_loader {
                     {
                         hparams.rope_theta = 10000.0f;
                         hparams.warmup_image_size = hparams.patch_size * 8;
+                        // Mistral Small 2506 needs 1024x1024 image size cap to prevent OOM
+                        // ref: https://github.com/ggml-org/llama.cpp/issues/14310
+                        hparams.image_size = 1024;
                         get_u32(KEY_SPATIAL_MERGE_SIZE, hparams.spatial_merge_size, false);
                     } break;
                 case PROJECTOR_TYPE_GEMMA3:
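
With image_size capped at 1024, a typical preprocessing path scales an
oversized input down before patching instead of rejecting it. A hedged
sketch of that idea (not clip.cpp code; the fit_to_cap helper and the
4032x3024 example are hypothetical): shrink the longest edge to the cap
while preserving aspect ratio, and pass smaller images through unchanged.

    #include <algorithm>
    #include <cstdio>

    struct extent { int width; int height; };

    // Hypothetical helper: clamp the longest edge to `cap`
    // (e.g. hparams.image_size), keeping the aspect ratio.
    static extent fit_to_cap(extent in, int cap) {
        const int longest = std::max(in.width, in.height);
        if (longest <= cap) {
            return in;                              // already within the cap
        }
        const double scale = (double)cap / (double)longest;
        return { std::max(1, (int)(in.width  * scale + 0.5)),
                 std::max(1, (int)(in.height * scale + 0.5)) };
    }

    int main() {
        const extent out = fit_to_cap({4032, 3024}, 1024);
        printf("4032x3024 -> %dx%d\n", out.width, out.height); // 1024x768
        return 0;
    }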