[Models] Allow converting Qwen3-VL into Reranker model (#31890)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -800,6 +800,7 @@ The following table lists those that are tested in vLLM.
|
||||
| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | |
|
||||
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ |
|
||||
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ |
|
||||
| `Qwen3VLForConditionalGeneration`<sup>C</sup> | Qwen3-VL | T + I + V | `Qwen/Qwen3-VL-Embedding-2B`, etc. | ✅︎ | ✅︎ |
|
||||
| `SiglipModel` | SigLIP, SigLIP2 | T / I | `google/siglip-base-patch16-224`, `google/siglip2-base-patch16-224` | | |
|
||||
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* |
|
||||
|
||||
@@ -816,10 +817,18 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|--------|-------------------|----------------------|---------------------------|
|
||||
| `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3VLForSequenceClassification` | Qwen3-VL-Reranker | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-Reranker-2B` (see note), etc. | ✅︎ | ✅︎ |
|
||||
|
||||
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))
|
||||
\* Feature support is the same as that of the original model.
|
||||
|
||||
!!! note
|
||||
    Similar to Qwen3-Reranker, you need to pass the following `--hf_overrides` to load the original official `Qwen3-VL-Reranker` checkpoints.
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen3-VL-Reranker-2B --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
|
||||
```
|
||||
|
||||
## Model Support Policy
|
||||
|
||||
At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support:
|
||||
|
||||
@@ -133,6 +133,36 @@ def run_jinavl_reranker(query: Query) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def run_qwen3_vl(query: Query) -> ModelRequestData:
    """Build the prompt and engine configuration for Qwen3-VL embedding.

    The prompt depends on the query modality: plain text, an image
    placeholder, or a placeholder followed by the text on a new line.
    """
    image_placeholder = "<vision_start><|image_pad|><vision_end>"
    modality = query["modality"]

    if modality == "text":
        prompt, image = query["text"], None
    elif modality == "image":
        prompt, image = image_placeholder, query["image"]
    elif modality == "text+image":
        prompt = f"{image_placeholder}\n{query['text']}"
        image = query["image"]
    else:
        raise ValueError(f"Unsupported query modality: '{modality}'")

    engine_args = EngineArgs(
        model="Qwen/Qwen3-VL-Embedding-2B",
        runner="pooling",
        max_model_len=8192,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image=image,
    )
|
||||
|
||||
|
||||
def run_siglip(query: Query) -> ModelRequestData:
|
||||
if query["modality"] == "text":
|
||||
prompt = query["text"]
|
||||
@@ -353,6 +383,7 @@ model_example_map = {
|
||||
"clip": run_clip,
|
||||
"e5_v": run_e5_v,
|
||||
"jinavl_reranker": run_jinavl_reranker,
|
||||
"qwen3_vl": run_qwen3_vl,
|
||||
"siglip": run_siglip,
|
||||
"vlm2vec_phi3v": run_vlm2vec_phi3v,
|
||||
"vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
|
||||
|
||||
23
examples/pooling/score/template/qwen3_vl_reranker.jinja
Normal file
23
examples/pooling/score/template/qwen3_vl_reranker.jinja
Normal file
@@ -0,0 +1,23 @@
|
||||
<|im_start|>system
|
||||
Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>
|
||||
<|im_start|>user
|
||||
<Instruct>: {{
|
||||
messages
|
||||
| selectattr("role", "eq", "system")
|
||||
| map(attribute="content")
|
||||
| first
|
||||
| default("Given a search query, retrieve relevant candidates that answer the query.")
|
||||
}}<Query>:{{
|
||||
messages
|
||||
| selectattr("role", "eq", "query")
|
||||
| map(attribute="content")
|
||||
| first
|
||||
}}
|
||||
<Document>:{{
|
||||
messages
|
||||
| selectattr("role", "eq", "document")
|
||||
| map(attribute="content")
|
||||
| first
|
||||
}}<|im_end|>
|
||||
<|im_start|>assistant
|
||||
|
||||
172
examples/pooling/score/vision_language_reranker.py
Normal file
172
examples/pooling/score/vision_language_reranker.py
Normal file
@@ -0,0 +1,172 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This example shows how to use vLLM for running offline inference with
|
||||
vision language reranker models for multimodal scoring tasks.
|
||||
|
||||
Vision language rerankers score the relevance between a text query and
|
||||
multimodal documents (text + images/videos).
|
||||
"""
|
||||
|
||||
from argparse import Namespace
|
||||
from collections.abc import Callable
|
||||
from dataclasses import asdict
|
||||
from pathlib import Path
|
||||
from typing import NamedTuple
|
||||
|
||||
from vllm import LLM, EngineArgs
|
||||
from vllm.entrypoints.score_utils import ScoreMultiModalParam
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
TEMPLATE_HOME = Path(__file__).parent / "template"
|
||||
|
||||
|
||||
class RerankModelData(NamedTuple):
    """Per-model demo setup: engine arguments plus an optional chat template."""

    # Arguments used to construct the vLLM engine for this model.
    engine_args: EngineArgs
    # Scoring chat template text loaded from the template directory; left as
    # None when the model does not need a custom template.
    chat_template: str | None = None
|
||||
|
||||
|
||||
def run_jinavl_reranker(modality: str) -> RerankModelData:
    """Engine configuration for the ``jinaai/jina-reranker-m0`` reranker."""
    # This example only supports image documents for the JinaVL reranker.
    assert modality == "image"

    args = EngineArgs(
        model="jinaai/jina-reranker-m0",
        runner="pooling",
        max_model_len=32768,
        trust_remote_code=True,
        # Bound the image resolution seen by the multimodal processor.
        mm_processor_kwargs={"min_pixels": 3136, "max_pixels": 602112},
        limit_mm_per_prompt={modality: 1},
    )
    # No custom chat template is needed; the model default is used.
    return RerankModelData(engine_args=args)
|
||||
|
||||
|
||||
def run_qwen3_vl_reranker(modality: str) -> RerankModelData:
    """Engine configuration and chat template for Qwen3-VL-Reranker."""
    # The official checkpoint ships with a generative architecture, so the
    # HF config must be overridden to route it through sequence classification.
    overrides = {
        # Use Qwen3VLForSequenceClassification instead of the default
        # Qwen3VLForConditionalGeneration architecture.
        "architectures": ["Qwen3VLForSequenceClassification"],
        # The original reranker scores from the "no"/"yes" token logits of
        # the language-model head.
        "classifier_from_token": ["no", "yes"],
        # Enables the conversion logic that folds the two token vectors into
        # a single classification vector (original Qwen3-Reranker handling).
        "is_original_qwen3_reranker": True,
    }

    engine_args = EngineArgs(
        model="Qwen/Qwen3-VL-Reranker-2B",
        runner="pooling",
        max_model_len=16384,
        limit_mm_per_prompt={modality: 1},
        hf_overrides=overrides,
    )

    template_text = (TEMPLATE_HOME / "qwen3_vl_reranker.jinja").read_text()
    return RerankModelData(
        engine_args=engine_args,
        chat_template=template_text,
    )
|
||||
|
||||
|
||||
# Registry mapping the --model-name CLI choice to the factory that builds
# that model's engine configuration and chat template.
model_example_map: dict[str, Callable[[str], RerankModelData]] = {
    "jinavl_reranker": run_jinavl_reranker,
    "qwen3_vl_reranker": run_qwen3_vl_reranker,
}
|
||||
|
||||
|
||||
def parse_args():
    """Parse CLI options selecting the reranker model and input modality."""
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "vision language reranker models for multimodal scoring tasks."
    )
    model_opts = dict(
        type=str,
        default="jinavl_reranker",
        choices=model_example_map.keys(),
        help="The name of the reranker model.",
    )
    parser.add_argument("--model-name", "-m", **model_opts)
    modality_opts = dict(
        type=str,
        default="image",
        choices=["image", "video"],
        help="Modality of the multimodal input (image or video).",
    )
    parser.add_argument("--modality", **modality_opts)
    return parser.parse_args()
|
||||
|
||||
|
||||
def get_multi_modal_input(modality: str) -> tuple[str, ScoreMultiModalParam]:
    """Return a sample (query, documents) pair for the given modality.

    The documents contain one text candidate and one media (image/video) URL
    candidate to be fetched and scored against the query.

    Raises:
        ValueError: if *modality* is neither "image" nor "video".
    """
    if modality == "image":
        query = "A woman playing with her dog on a beach at sunset."
        text_doc = {
            "type": "text",
            "text": (
                "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "  # noqa: E501
                "as the dog offers its paw in a heartwarming display of companionship and trust."  # noqa: E501
            ),
        }
        media_doc = {
            "type": "image_url",
            "image_url": {
                "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
            },
        }
    elif modality == "video":
        query = "A girl is drawing pictures on an ipad."
        text_doc = {
            "type": "text",
            "text": "A girl is drawing a guitar on her ipad with Apple Pencil.",
        }
        media_doc = {
            "type": "video_url",
            "video_url": {
                "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
            },
        }
    else:
        raise ValueError(f"Unsupported modality: {modality}")

    documents: ScoreMultiModalParam = {"content": [text_doc, media_doc]}
    return query, documents
|
||||
|
||||
|
||||
def main(args: Namespace):
    """Run the selected reranker model and print the relevance scores."""
    modality = args.modality
    request = model_example_map[args.model_name](modality)

    # Construct the engine from the example's EngineArgs.
    llm = LLM(**asdict(request.engine_args))

    query, documents = get_multi_modal_input(modality)
    outputs = llm.score(query, documents, chat_template=request.chat_template)

    separator = "-" * 50
    print(separator)
    print(f"Model: {request.engine_args.model}")
    print(f"Modality: {modality}")
    print(f"Query: {query}")
    scores = [output.outputs.score for output in outputs]
    print("Relevance scores:", scores)
    print(separator)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point for the offline multimodal reranker demo.
    args = parse_args()
    main(args)
|
||||
@@ -612,6 +612,15 @@ _AUTOMATIC_CONVERTED_MODELS = {
|
||||
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
|
||||
),
|
||||
"Qwen3ForTokenClassification": _HfExamplesInfo("bd2lcco/Qwen3-0.6B-finetuned"),
|
||||
"Qwen3VLForSequenceClassification": _HfExamplesInfo(
|
||||
"Qwen/Qwen3-VL-Reranker-2B",
|
||||
is_available_online=False,
|
||||
hf_overrides={
|
||||
"architectures": ["Qwen3VLForSequenceClassification"],
|
||||
"classifier_from_token": ["no", "yes"],
|
||||
"is_original_qwen3_reranker": True,
|
||||
},
|
||||
),
|
||||
}
|
||||
|
||||
_MULTIMODAL_EXAMPLE_MODELS = {
|
||||
|
||||
@@ -11,6 +11,7 @@ from vllm.entrypoints.chat_utils import (
|
||||
ChatCompletionContentPartImageEmbedsParam,
|
||||
ChatCompletionContentPartImageParam,
|
||||
ChatCompletionContentPartTextParam,
|
||||
ChatCompletionContentPartVideoParam,
|
||||
ChatTemplateResolutionError,
|
||||
MultiModalItemTracker,
|
||||
_ContentPart,
|
||||
@@ -27,6 +28,7 @@ ScoreContentPartParam: TypeAlias = (
|
||||
ChatCompletionContentPartImageParam
|
||||
| ChatCompletionContentPartImageEmbedsParam
|
||||
| ChatCompletionContentPartTextParam
|
||||
| ChatCompletionContentPartVideoParam
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -333,9 +333,14 @@ def as_seq_cls_model(cls: _T) -> _T:
|
||||
)
|
||||
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
text_config = self.config.get_text_config()
|
||||
tokens = getattr(text_config, "classifier_from_token", None)
|
||||
method = getattr(text_config, "method", None)
|
||||
hf_config = self.config
|
||||
text_config = hf_config.get_text_config()
|
||||
tokens = getattr(
|
||||
hf_config,
|
||||
"classifier_from_token",
|
||||
getattr(text_config, "classifier_from_token", None),
|
||||
)
|
||||
method = getattr(hf_config, "method", getattr(text_config, "method", None))
|
||||
|
||||
def auto_set_score_bias(weights):
|
||||
for name, weight in weights:
|
||||
@@ -366,9 +371,14 @@ def as_seq_cls_model(cls: _T) -> _T:
|
||||
class SequenceClassificationConfig(VerifyAndUpdateConfig):
|
||||
@staticmethod
|
||||
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
|
||||
text_config = vllm_config.model_config.hf_config.get_text_config()
|
||||
method = getattr(text_config, "method", None)
|
||||
tokens = getattr(text_config, "classifier_from_token", None)
|
||||
hf_config = vllm_config.model_config.hf_config
|
||||
text_config = hf_config.get_text_config()
|
||||
method = getattr(hf_config, "method", getattr(text_config, "method", None))
|
||||
tokens = getattr(
|
||||
hf_config,
|
||||
"classifier_from_token",
|
||||
getattr(text_config, "classifier_from_token", None),
|
||||
)
|
||||
|
||||
if method is None:
|
||||
return
|
||||
@@ -378,8 +388,10 @@ class SequenceClassificationConfig(VerifyAndUpdateConfig):
|
||||
|
||||
if method == "from_2_way_softmax":
|
||||
assert len(tokens) == 2
|
||||
hf_config.num_labels = 1
|
||||
text_config.num_labels = 1
|
||||
else:
|
||||
hf_config.num_labels = len(tokens)
|
||||
text_config.num_labels = len(tokens)
|
||||
|
||||
# `llm as reranker` defaults to not using separating token.
|
||||
@@ -396,9 +408,14 @@ def load_weights_using_from_2_way_softmax(
|
||||
|
||||
model_config = model.vllm_config.model_config
|
||||
quant_config = model.vllm_config.quant_config
|
||||
text_config = model.config.get_text_config()
|
||||
hf_config = model.config
|
||||
text_config = hf_config.get_text_config()
|
||||
|
||||
tokens = getattr(text_config, "classifier_from_token", [])
|
||||
tokens = getattr(
|
||||
hf_config,
|
||||
"classifier_from_token",
|
||||
getattr(text_config, "classifier_from_token", []),
|
||||
)
|
||||
tokens = cast(list[int], tokens)
|
||||
assert len(tokens) == 2
|
||||
|
||||
@@ -409,10 +426,15 @@ def load_weights_using_from_2_way_softmax(
|
||||
# embed_tokens is the assumed name for input embeddings. If the model does not
|
||||
# have this attribute, we fall back to get_input_embeddings(), which is used by
|
||||
# the Transformers modeling backend.
|
||||
text_backbone = (
|
||||
model.get_language_model().model
|
||||
if hasattr(model, "get_language_model")
|
||||
else model.model
|
||||
)
|
||||
embed_tokens = (
|
||||
model.model.embed_tokens
|
||||
if hasattr(model.model, "embed_tokens")
|
||||
else model.model.get_input_embeddings()
|
||||
text_backbone.embed_tokens
|
||||
if hasattr(text_backbone, "embed_tokens")
|
||||
else text_backbone.get_input_embeddings()
|
||||
)
|
||||
model.lm_head = model.lm_head.tie_weights(embed_tokens)
|
||||
|
||||
@@ -516,8 +538,9 @@ def seq_cls_model_loader(model, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
# - GemmaForCausalLM
|
||||
# - bge-reranker-v2-gemma
|
||||
|
||||
text_config = model.vllm_config.model_config.hf_config.get_text_config()
|
||||
method = getattr(text_config, "method", None)
|
||||
hf_config = model.vllm_config.model_config.hf_config
|
||||
text_config = hf_config.get_text_config()
|
||||
method = getattr(hf_config, "method", getattr(text_config, "method", None))
|
||||
assert method in SEQ_CLS_LOAD_METHODS, f"method {method} not supported"
|
||||
return SEQ_CLS_LOAD_METHODS[method](model, weights)
|
||||
|
||||
|
||||
@@ -256,6 +256,10 @@ class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig):
|
||||
model_config.hf_config.method = "from_2_way_softmax"
|
||||
|
||||
|
||||
class Qwen3VLForSequenceClassificationConfig(Qwen3ForSequenceClassificationConfig):
    # Qwen3-VL rerankers reuse the Qwen3 sequence-classification config
    # handling unchanged (including the "from_2_way_softmax" method the
    # parent sets on the HF config).
    pass
|
||||
|
||||
|
||||
class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig):
|
||||
@staticmethod
|
||||
def verify_and_update_model_config(model_config: "ModelConfig") -> None:
|
||||
@@ -551,6 +555,7 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
|
||||
"Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig,
|
||||
"Qwen2ForRewardModel": Qwen2ForRewardModelConfig,
|
||||
"Qwen3ForSequenceClassification": Qwen3ForSequenceClassificationConfig,
|
||||
"Qwen3VLForSequenceClassification": Qwen3VLForSequenceClassificationConfig,
|
||||
"XLMRobertaModel": JinaRobertaModelConfig,
|
||||
"JinaVLForRanking": JinaVLForSequenceClassificationConfig,
|
||||
"JambaForSequenceClassification": JambaForSequenceClassificationConfig,
|
||||
|
||||
Reference in New Issue
Block a user