git.djapps.eu Git - pkg/ggml/sources/llama.cpp/commitdiff
model : qwen3vl reranker text support (#20332)
author: Vinicios Lugli <redacted>
Tue, 10 Mar 2026 22:40:14 +0000 (19:40 -0300)
committer: GitHub <redacted>
Tue, 10 Mar 2026 22:40:14 +0000 (23:40 +0100)
* model : fix qwen3vl reranker support

* Remove CLS_OUT

Co-authored-by: Sigbjørn Skjæret <redacted>
---------

Co-authored-by: Sigbjørn Skjæret <redacted>
convert_hf_to_gguf.py
src/llama-arch.cpp
src/llama-graph.cpp

index 083b5bca9e94b31b8b0b3caffea84f360fc65e7a..30347f7389f871e77aacc92a3a4f055a9664749e 100755 (executable)
@@ -4390,15 +4390,31 @@ class Qwen3Model(Qwen2Model):
         hparams = ModelBase.load_hparams(self.dir_model, is_mistral_format=False)
         self.origin_hf_arch = hparams.get('architectures', [None])[0]
 
-        # a bit hacky, but currently the only way to detect if this is a rerank model
-        # ref: https://huggingface.co/Qwen/Qwen3-Reranker-0.6B
+        if self._is_qwen3_reranker():
+            self._find_rerank_config()
+
+    def _is_qwen3_reranker(self) -> bool:
         readme_path = self.dir_model / "README.md"
         readme_text = ""
         if readme_path.exists():
             with readme_path.open("r", encoding="utf-8") as f:
                 readme_text = f.read()
-        if "# Qwen3-Reranker" in readme_text:
-            self._find_rerank_config()
+
+        name_hints = [
+            str(self.dir_model.name),
+            str(self.hparams.get("_name_or_path", "")),
+            str(self.hparams.get("model_type", "")),
+            str(self.origin_hf_arch or ""),
+        ]
+        name_hints = [hint.lower() for hint in name_hints if hint]
+
+        if "# qwen3-reranker" in readme_text.lower() or "# qwen3-vl-reranker" in readme_text.lower():
+            return True
+
+        if any("qwen3-reranker" in hint or "qwen3-vl-reranker" in hint for hint in name_hints):
+            return True
+
+        return "sequenceclassification" in (self.origin_hf_arch or "").lower()
 
     def set_vocab(self):
         # deal with intern-s1-mini
index ce49bbd988e761a6d9c095d4c45fc4cb47d90bd7..204105b6dd05e0daa275ae91e7eb8ac85e42311e 100644 (file)
@@ -1087,6 +1087,7 @@ static std::set<llm_tensor> llm_get_tensor_names(llm_arch arch) {
                 LLM_TENSOR_TOKEN_EMBD,
                 LLM_TENSOR_OUTPUT_NORM,
                 LLM_TENSOR_OUTPUT,
+                LLM_TENSOR_CLS_OUT,
                 LLM_TENSOR_ATTN_NORM,
                 LLM_TENSOR_ATTN_Q,
                 LLM_TENSOR_ATTN_Q_NORM,
index 5f875136a1c5691a4497f61d54f008b6674a1ecc..528f8e54584a6c18c5a8c114fd90a8decc046cb2 100644 (file)
@@ -250,7 +250,7 @@ void llm_graph_input_cls::set_input(const llama_ubatch * ubatch) {
 
         const bool last = (
              cparams.pooling_type == LLAMA_POOLING_TYPE_LAST ||
-            (cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && arch == LLM_ARCH_QWEN3) // qwen3 reranking & embedding models use last token
+            (cparams.pooling_type == LLAMA_POOLING_TYPE_RANK && (arch == LLM_ARCH_QWEN3 || arch == LLM_ARCH_QWEN3VL)) // qwen3 reranking & embedding models use last token
         );
 
         for (int i = 0; i < n_tokens; ++i) {
@@ -2552,7 +2552,7 @@ void llm_graph_context::build_pooling(
                 }
 
                 // softmax for qwen3 reranker
-                if (arch == LLM_ARCH_QWEN3) {
+                if (arch == LLM_ARCH_QWEN3 || arch == LLM_ARCH_QWEN3VL) {
                     cur = ggml_soft_max(ctx0, cur);
                 }
             } break;