llama-quant: add support for mmproj (#16592)

author Xuan-Son Nguyen <redacted>

Wed, 15 Oct 2025 12:48:08 +0000 (14:48 +0200)

committer GitHub <redacted>

Wed, 15 Oct 2025 12:48:08 +0000 (14:48 +0200)
author Xuan-Son Nguyen <redacted>
Wed, 15 Oct 2025 12:48:08 +0000 (14:48 +0200)
committer GitHub <redacted>
Wed, 15 Oct 2025 12:48:08 +0000 (14:48 +0200)
diff --git a/src/llama-arch.cpp b/src/llama-arch.cpp

index 869e4dccf0dc9922bfb41f9476108b2d3cbbb6d5..b7e00b275b6f7a68bf33ab9a0d8e898c1ad11806 100644 (file)
--- a/src/llama-arch.cpp
+++ b/src/llama-arch.cpp
@@ -5,6 +5,7 @@
  #include <map>
  
  static const std::map<llm_arch, const char *> LLM_ARCH_NAMES = {
+    { LLM_ARCH_CLIP,             "clip"             }, // dummy, only used by llama-quantize
      { LLM_ARCH_LLAMA,            "llama"            },
      { LLM_ARCH_LLAMA4,           "llama4"           },
      { LLM_ARCH_DECI,             "deci"             },
@@ -275,6 +276,10 @@ static const std::map<llm_kv, const char *> LLM_KV_NAMES = {
  };
  
  static const std::map<llm_arch, std::map<llm_tensor, const char *>> LLM_TENSOR_NAMES = {
+    {
+        LLM_ARCH_CLIP,
+        {},
+    },
      {
          LLM_ARCH_LLAMA,
          {
diff --git a/src/llama-arch.h b/src/llama-arch.h

index c3ae71655b17b417acce9f052925296802ba21a2..c41de89859d5c44641d3d4f14efe680d9af652b9 100644 (file)
--- a/src/llama-arch.h
+++ b/src/llama-arch.h
@@ -9,6 +9,7 @@
  //
  
  enum llm_arch {
+    LLM_ARCH_CLIP,
      LLM_ARCH_LLAMA,
      LLM_ARCH_LLAMA4,
      LLM_ARCH_DECI,
diff --git a/src/llama-model.cpp b/src/llama-model.cpp

index 0cdad9babd9b27c083488ecefd25f0ee02d1cbfa..5002bd42ff04ecfb28fc779f016e98d4de4e11f6 100644 (file)
--- a/src/llama-model.cpp
+++ b/src/llama-model.cpp
@@ -478,7 +478,8 @@ void llama_model::load_hparams(llama_model_loader & ml) {
      ml.get_key(LLM_KV_GENERAL_NAME, name, false);
  
      // everything past this point is not vocab-related
-    if (hparams.vocab_only) {
+    // for CLIP models, we only need to load tensors, no hparams
+    if (hparams.vocab_only || ml.get_arch() == LLM_ARCH_CLIP) {
          return;
      }
  
@@ -20013,6 +20014,7 @@ int32_t llama_n_head(const llama_model * model) {
  llama_rope_type llama_model_rope_type(const llama_model * model) {
      switch (model->arch) {
          // these models do not use RoPE
+        case LLM_ARCH_CLIP:
          case LLM_ARCH_GPT2:
          case LLM_ARCH_GPTJ:
          case LLM_ARCH_MPT:
diff --git a/src/llama-quant.cpp b/src/llama-quant.cpp

index 97228b2a693241045d3888736ddc06776c8c2506..6dd40412b488ee947ac1f9bf68fad9f1180946de 100644 (file)
--- a/src/llama-quant.cpp
+++ b/src/llama-quant.cpp
@@ -701,6 +701,7 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
          });
      }
  
+    bool is_clip_model = false;
      for (const auto * it : tensors) {
          const struct ggml_tensor * tensor = it->tensor;
  
@@ -714,12 +715,14 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
          } else if (name == LLM_TN(model.arch)(LLM_TENSOR_OUTPUT, "weight")) {
              qs.has_output = true;
          }
+
+        is_clip_model |= name.rfind("mm.", 0) == 0; // check the "mm." prefix
      }
  
      qs.n_ffn_down = qs.n_ffn_gate = qs.n_ffn_up = (int)model.hparams.n_layer;
  
      // sanity checks for models that have attention layers
-    if (qs.n_attention_wv != 0)
+    if (qs.n_attention_wv != 0 && !is_clip_model)
      {
          const auto & n_head_kv_iter = model.hparams.n_head_kv_arr.begin();
          // attention layers have a non-zero number of kv heads
@@ -881,6 +884,9 @@ static void llama_model_quantize_impl(const std::string & fname_inp, const std::
          // do not quantize relative position bias (T5)
          quantize &= name.find("attn_rel_b.weight") == std::string::npos;
  
+        // do not quantize specific multimodal tensors
+        quantize &= name.find(".position_embd.") == std::string::npos;
+
          ggml_type new_type;
          void * new_data;
          size_t new_size;
diff --git a/src/llama.cpp b/src/llama.cpp

index 38700f97a068818f186fffe9b0480866e2ae75a0..ab2e9868af4688d740f5e7c0038e7fd7d9b9d2a7 100644 (file)
--- a/src/llama.cpp
+++ b/src/llama.cpp
@@ -124,6 +124,9 @@ static int llama_model_load(const std::string & fname, std::vector<std::string>
          } catch(const std::exception & e) {
              throw std::runtime_error("error loading model hyperparameters: " + std::string(e.what()));
          }
+        if (model.arch == LLM_ARCH_CLIP) {
+            throw std::runtime_error("CLIP cannot be used as main model, use it with --mmproj instead");
+        }
          try {
              model.load_vocab(ml);
          } catch(const std::exception & e) {
author	Xuan-Son Nguyen <redacted>
	Wed, 15 Oct 2025 12:48:08 +0000 (14:48 +0200)
committer	GitHub <redacted>
	Wed, 15 Oct 2025 12:48:08 +0000 (14:48 +0200)
src/llama-arch.cpp		patch | blob | history
src/llama-arch.h		patch | blob | history
src/llama-model.cpp		patch | blob | history
src/llama-quant.cpp		patch | blob | history
src/llama.cpp		patch | blob | history