// add CLS token
inp = ggml_concat(ctx0, inp, model.class_embedding, 1);
+ // The larger models use a different ViT, which relies on RMS norm instead of layer norm
+ // ref: https://github.com/ggml-org/llama.cpp/pull/13443#issuecomment-2869786188
+ norm_type norm_t = (hparams.n_embd == 3200 && hparams.n_layer == 45)
+     ? NORM_TYPE_RMS     // 6B ViT (used by InternVL 2.5/3 - 26B, 38B, 78B)
+     : NORM_TYPE_NORMAL; // 300M ViT (used by all smaller InternVL models)
+
ggml_tensor * cur = build_vit(
        inp, n_pos,
-       NORM_TYPE_NORMAL,
+       norm_t,
        hparams.ffn_op,
        model.position_embeddings,
        nullptr);
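
// Illustrative sketch, not part of this diff: what the norm_t switch amounts to
// inside each ViT block. The helper name (apply_norm) and the eps argument are
// hypothetical; ggml_norm, ggml_rms_norm, ggml_mul and ggml_add are existing
// ggml primitives. build_vit's real internals may differ.
static ggml_tensor * apply_norm(
        ggml_context * ctx0,
        ggml_tensor  * cur,
        ggml_tensor  * w,      // norm weight (gamma)
        ggml_tensor  * b,      // norm bias (beta); unused for RMS norm
        norm_type      norm_t,
        float          eps) {
    if (norm_t == NORM_TYPE_RMS) {
        // RMS norm: no mean subtraction and no bias, scale only
        cur = ggml_rms_norm(ctx0, cur, eps);
        cur = ggml_mul(ctx0, cur, w);
    } else {
        // standard layer norm: normalize, then scale and shift
        cur = ggml_norm(ctx0, cur, eps);
        cur = ggml_mul(ctx0, cur, w);
        cur = ggml_add(ctx0, cur, b);
    }
    return cur;
}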