mtmd: add more sanity checks (#21047)

author Xuan-Son Nguyen <redacted>

Fri, 27 Mar 2026 10:00:52 +0000 (11:00 +0100)

committer GitHub <redacted>

Fri, 27 Mar 2026 10:00:52 +0000 (11:00 +0100)
author Xuan-Son Nguyen <redacted>
Fri, 27 Mar 2026 10:00:52 +0000 (11:00 +0100)
committer GitHub <redacted>
Fri, 27 Mar 2026 10:00:52 +0000 (11:00 +0100)
diff --git a/tools/mtmd/clip.cpp b/tools/mtmd/clip.cpp

index fd1cb0dfea4ec04249798327ccaf9af7072586d4..2947fcf9a3d96445e4fea4e253d5271e59260c18 100644 (file)
--- a/tools/mtmd/clip.cpp
+++ b/tools/mtmd/clip.cpp
@@ -1377,6 +1377,16 @@ struct clip_model_loader {
  
              // sanity check
              {
+                if (hparams.image_size < 0) {
+                    // note: some models have hparams.image_size == 0, which means the image size is dynamic
+                    throw std::runtime_error(string_format("%s: image_size (%d) cannot be negative\n", __func__, hparams.image_size));
+                }
+                if (hparams.patch_size <= 0) {
+                    throw std::runtime_error(string_format("%s: patch_size (%d) must be greater than 0\n", __func__, hparams.patch_size));
+                }
+                if (hparams.n_embd <= 0) {
+                    throw std::runtime_error(string_format("%s: n_embd (%d) must be greater than 0\n", __func__, hparams.n_embd));
+                }
                  if (hparams.image_max_pixels < hparams.image_min_pixels) {
                      throw std::runtime_error(string_format("%s: image_max_pixels (%d) is less than image_min_pixels (%d)\n", __func__, hparams.image_max_pixels, hparams.image_min_pixels));
                  }
diff --git a/tools/mtmd/mtmd-audio.cpp b/tools/mtmd/mtmd-audio.cpp

index 447f61aaa40b3d1126aeda4aac7d6a965f1661c3..e68387c2739457e6efbb6ef2ee0fb48268a16df7 100644 (file)
--- a/tools/mtmd/mtmd-audio.cpp
+++ b/tools/mtmd/mtmd-audio.cpp
@@ -13,23 +13,20 @@
  
  constexpr bool DEBUG = false;
  
-void mtmd_audio_cache::fill_sin_cos_table(int n) {
+void mtmd_audio_cache::fill_sin_cos_table(uint32_t n) {
      sin_vals.resize(n);
      cos_vals.resize(n);
-    for (int i = 0; i < n; i++) {
+    for (uint32_t i = 0; i < n; i++) {
          double theta = (2 * M_PI * i) / n;
          sin_vals[i]  = sinf(theta);
          cos_vals[i]  = cosf(theta);
      }
  }
  
-void mtmd_audio_cache::fill_hann_window(int length, bool periodic) {
+void mtmd_audio_cache::fill_hann_window(uint32_t length, bool periodic) {
      hann_window.resize(length);
-    int offset = -1;
-    if (periodic) {
-        offset = 0;
-    }
-    for (int i = 0; i < length; i++) {
+    int offset = periodic ? 0 : -1;
+    for (uint32_t i = 0; i < length; i++) {
          hann_window[i] = 0.5 * (1.0 - cosf((2.0 * M_PI * i) / (length + offset)));
      }
  }
@@ -165,6 +162,7 @@ static void dft_impl(const mtmd_audio_cache & cache, const float * in, int N, fl
  //              false = input is complex-valued (interleaved real/imag, stride 2)
  template <bool Inverse, bool RealInput>
  static void fft_impl(const mtmd_audio_cache & cache, float * in, int N, float * out) {
+    GGML_ASSERT(N > 0);
      const int n_sin_cos_vals = cache.sin_vals.size();
  
      if (N == 1) {
@@ -407,6 +405,8 @@ static bool log_mel_spectrogram(
      }
  
  
+    GGML_ASSERT(params.n_fft_bins > 0);
+    GGML_ASSERT(params.hop_length > 0);
      out.n_mel = params.n_mel;
      out.n_len = (n_samples - frame_size) / frame_step + 1;
      // TODO: handle these checks better
@@ -438,6 +438,7 @@ static bool log_mel_spectrogram(
  
      const int effective_n_len = n_samples_in / frame_step;
      if (params.norm_per_feature) {
+        GGML_ASSERT(effective_n_len > 1);
          for (int i = 0; i < out.n_mel; i++) {
              double mean = 0;
              for (int j = 0; j < effective_n_len; ++j) {
@@ -639,6 +640,7 @@ mtmd_audio_streaming_istft::mtmd_audio_streaming_istft(int n_fft, int hop_length
      padding_to_remove((n_fft - hop_length) / 2),
      ifft_in(n_fft * 2 * 4, 0.0f),  // extra space for recursive IFFT
      ifft_out(n_fft * 2 * 4, 0.0f) {
+    GGML_ASSERT(n_fft > 0 && hop_length > 0 && hop_length <= n_fft);
      cache.fill_sin_cos_table(n_fft);
      cache.fill_hann_window(n_fft, true);
  }
diff --git a/tools/mtmd/mtmd-audio.h b/tools/mtmd/mtmd-audio.h

index 016c7392e4fcbbff774578947c47d9f8454ddab0..53857a2eb5d55202ef19fe44f5e93baf9283a2a5 100644 (file)
--- a/tools/mtmd/mtmd-audio.h
+++ b/tools/mtmd/mtmd-audio.h
@@ -33,9 +33,9 @@ struct mtmd_audio_cache {
  
      mtmd_audio_mel_filters filters;
  
-    void fill_sin_cos_table(int n);
+    void fill_sin_cos_table(uint32_t n);
  
-    void fill_hann_window(int length, bool periodic);
+    void fill_hann_window(uint32_t length, bool periodic);
  
      // Build mel filterbank matrix [n_mel × n_fft_bins] at runtime.
      // n_fft_bins must be (N_fft / 2 + 1). Example: if N_fft=512 -> n_fft_bins=257.
diff --git a/tools/mtmd/mtmd-helper.cpp b/tools/mtmd/mtmd-helper.cpp

index 5bcb7ec1bc031d53710dbfb2edce53a681844ce2..778aacb61d21e731bbe9d5eaa90c2c8c9c04d5b0 100644 (file)
--- a/tools/mtmd/mtmd-helper.cpp
+++ b/tools/mtmd/mtmd-helper.cpp
@@ -127,6 +127,7 @@ struct decode_embd_batch {
      std::vector<int8_t>         logits;
      llama_batch batch;
      decode_embd_batch(float * embd, int32_t n_tokens, int n_pos_per_embd, int n_mmproj_embd) : n_pos_per_embd(n_pos_per_embd), n_mmproj_embd(n_mmproj_embd) {
+        GGML_ASSERT(n_tokens > 0 && n_pos_per_embd > 0 && n_mmproj_embd > 0);
          pos     .resize(n_tokens * n_pos_per_embd);
          n_seq_id.resize(n_tokens);
          seq_ids .resize(n_tokens + 1);
@@ -157,6 +158,7 @@ struct decode_embd_batch {
      // M-RoPE for image
      void set_position_mrope_2d(llama_pos pos_0, int nx, int ny, llama_seq_id seq_id) {
          GGML_ASSERT(n_pos_per_embd == 4);
+        GGML_ASSERT(nx > 0 && ny > 0 && nx * ny == batch.n_tokens);
          seq_id_0[0] = seq_id;
          for (int y = 0; y < ny; y++) {
              for (int x = 0; x < nx; x++) {
@@ -192,6 +194,7 @@ struct decode_embd_batch {
      }
  
      llama_batch get_view(int offset, int n_tokens) {
+        GGML_ASSERT(offset >= 0 && n_tokens > 0 && offset + n_tokens <= batch.n_tokens);
          llama_pos * pos_ptr;
          pos_view.clear();
          pos_view.reserve(n_tokens * n_pos_per_embd);
@@ -235,6 +238,7 @@ int32_t mtmd_helper_decode_image_chunk(
          llama_seq_id seq_id,
          int32_t n_batch,
          llama_pos * new_n_past) {
+    GGML_ASSERT(n_batch > 0);
      auto chunk_type = mtmd_input_chunk_get_type(chunk);
      const char * name = chunk_type == MTMD_INPUT_CHUNK_TYPE_IMAGE ? "image" : "audio";
      if (chunk_type == MTMD_INPUT_CHUNK_TYPE_TEXT) {
@@ -312,6 +316,7 @@ int32_t mtmd_helper_eval_chunk_single(mtmd_context * ctx,
          int32_t n_batch,
          bool logits_last,
          llama_pos * new_n_past) {
+    GGML_ASSERT(n_batch > 0);
      int32_t ret;
      llama_batch text_batch = llama_batch_init(n_batch, 0, 1);
      auto chunk_type = mtmd_input_chunk_get_type(chunk);
@@ -508,6 +513,11 @@ mtmd_bitmap * mtmd_helper_bitmap_init_from_file(mtmd_context * ctx, const char *
      fseek(f, 0, SEEK_END);
      long file_size = ftell(f);
      fseek(f, 0, SEEK_SET);
+    if (file_size < 0) {
+        LOG_ERR("Failed to get file size of %s\n", fname);
+        fclose(f);
+        return nullptr;
+    }
      buf.resize(file_size);
  
      size_t n_read = fread(buf.data(), 1, file_size, f);
diff --git a/tools/mtmd/mtmd-image.cpp b/tools/mtmd/mtmd-image.cpp

index b4464371920c174ccd75b6263a83c36b1b20238f..a2166622b7cac38f8c6239aff48f964e2060d158 100644 (file)
--- a/tools/mtmd/mtmd-image.cpp
+++ b/tools/mtmd/mtmd-image.cpp
@@ -99,6 +99,8 @@ struct img_tool {
      }
  
      static void crop(const clip_image_u8 & image, clip_image_u8 & dst, int x, int y, int w, int h) {
+        GGML_ASSERT(x >= 0 && y >= 0 && w > 0 && h > 0);
+        GGML_ASSERT(x + w <= image.nx && y + h <= image.ny);
          dst.nx = w;
          dst.ny = h;
          dst.buf.resize(3 * w * h);
@@ -196,6 +198,7 @@ struct img_tool {
  private:
      // Bilinear resize function
      static void resize_bilinear(const clip_image_u8 & src, clip_image_u8 & dst, int target_width, int target_height) {
+        GGML_ASSERT(src.nx >= 2 && src.ny >= 2);
          dst.nx = target_width;
          dst.ny = target_height;
          dst.buf.resize(3 * target_width * target_height);
@@ -207,8 +210,8 @@ private:
              for (int x = 0; x < target_width; x++) {
                  float px = x_ratio * x;
                  float py = y_ratio * y;
-                int x_floor = static_cast<int>(px);
-                int y_floor = static_cast<int>(py);
+                int x_floor = std::min(static_cast<int>(px), src.nx - 2);
+                int y_floor = std::min(static_cast<int>(py), src.ny - 2);
                  float x_lerp = px - x_floor;
                  float y_lerp = py - y_floor;
  
@@ -347,6 +350,7 @@ private:
          // Returns: kernel size (ksize) - number of input pixels that contribute to each output pixel
          auto precompute_weights = [&](int inSize, int outSize,
                                       std::vector<int> & bounds, std::vector<int32_t> & weights) -> int {
+            GGML_ASSERT(inSize > 0 && outSize > 0);
              double support, scale, filterscale;
              double center, ww, ss;
              int xx, x, ksize, xmin, xmax, xcnt;
diff --git a/tools/mtmd/mtmd.cpp b/tools/mtmd/mtmd.cpp

index d078120f761882a2a7aa1a0cbdb947abb5b1b14f..9c400ce1045c379e9c36631c961d969b4b33222a 100644 (file)
--- a/tools/mtmd/mtmd.cpp
+++ b/tools/mtmd/mtmd.cpp
@@ -641,6 +641,11 @@ struct mtmd_tokenizer {
                  add_text(ctx->img_beg, true); // add image begin token
              }
  
+            // sanity check
+            GGML_ASSERT(bitmap->nx > 0 && bitmap->ny > 0);
+            GGML_ASSERT(bitmap->data.size() == (size_t)bitmap->nx * bitmap->ny * 3);
+            GGML_ASSERT(ctx->image_preproc != nullptr);
+
              // convert mtmd_bitmap to clip_image_u8
              clip_image_u8_ptr img_u8(clip_image_u8_init());
              img_u8->nx = bitmap->nx;
@@ -649,7 +654,6 @@ struct mtmd_tokenizer {
              std::memcpy(img_u8->buf.data(), bitmap->data.data(), img_u8->nx * img_u8->ny * 3);
  
              // preprocess image
-            GGML_ASSERT(ctx->image_preproc != nullptr);
              clip_image_f32_batch batch_f32;
              bool ok = ctx->image_preproc->preprocess(*img_u8, batch_f32);
              if (!ok) {
@@ -773,6 +777,11 @@ struct mtmd_tokenizer {
                  add_text(ctx->aud_beg, true); // add audio begin token
              }
  
+            // sanity check
+            GGML_ASSERT(ctx->audio_preproc != nullptr);
+            GGML_ASSERT(bitmap->data.size() > sizeof(float));
+            GGML_ASSERT(bitmap->data.size() % sizeof(float) == 0);
+
              // preprocess audio
              std::vector<mtmd_audio_mel> mel_spec_chunks;
              const float * samples = (const float *)bitmap->data.data();
author	Xuan-Son Nguyen <redacted>
	Fri, 27 Mar 2026 10:00:52 +0000 (11:00 +0100)
committer	GitHub <redacted>
	Fri, 27 Mar 2026 10:00:52 +0000 (11:00 +0100)
tools/mtmd/clip.cpp		patch \| blob \| history
tools/mtmd/mtmd-audio.cpp		patch \| blob \| history
tools/mtmd/mtmd-audio.h		patch \| blob \| history
tools/mtmd/mtmd-helper.cpp		patch \| blob \| history
tools/mtmd/mtmd-image.cpp		patch \| blob \| history
tools/mtmd/mtmd.cpp		patch \| blob \| history