diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md
index 07b1ced5c..5c3668392 100644
--- a/docs/models/supported_models.md
+++ b/docs/models/supported_models.md
@@ -800,6 +800,7 @@ The following table lists those that are tested in vLLM.
| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | |
| `LlavaNextForConditionalGeneration`C | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ |
| `Phi3VForCausalLM`C | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ |
+| `Qwen3VLForConditionalGeneration`C | Qwen3-VL | T + I + V | `Qwen/Qwen3-VL-Embedding-2B`, etc. | ✅︎ | ✅︎ |
| `SiglipModel` | SigLIP, SigLIP2 | T / I | `google/siglip-base-patch16-224`, `google/siglip2-base-patch16-224` | | |
| `*ForConditionalGeneration`C, `*ForCausalLM`C, etc. | Generative models | \* | N/A | \* | \* |
@@ -816,10 +817,18 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|--------------|--------|--------|-------------------|----------------------|---------------------------|
| `JinaVLForSequenceClassification` | JinaVL-based | T + IE+ | `jinaai/jina-reranker-m0`, etc. | ✅︎ | ✅︎ |
+| `Qwen3VLForSequenceClassification` | Qwen3-VL-Reranker | T + IE+ + VE+ | `Qwen/Qwen3-VL-Reranker-2B` (see note), etc. | ✅︎ | ✅︎ |
C Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))
\* Feature support is the same as that of the original model.
+!!! note
+    As with Qwen3-Reranker, the official `Qwen3-VL-Reranker` checkpoints must be loaded with the following `--hf_overrides`:
+
+ ```bash
+ vllm serve Qwen/Qwen3-VL-Reranker-2B --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
+ ```
+
## Model Support Policy
At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support:
diff --git a/examples/pooling/pooling/vision_language_pooling.py b/examples/pooling/pooling/vision_language_pooling.py
index dda56bc34..e2149a7a6 100644
--- a/examples/pooling/pooling/vision_language_pooling.py
+++ b/examples/pooling/pooling/vision_language_pooling.py
@@ -133,6 +133,36 @@ def run_jinavl_reranker(query: Query) -> ModelRequestData:
)
+def run_qwen3_vl(query: Query) -> ModelRequestData:
+ image_placeholder = "<|image_pad|>"
+ if query["modality"] == "text":
+ prompt = query["text"]
+ image = None
+ elif query["modality"] == "image":
+ prompt = image_placeholder
+ image = query["image"]
+ elif query["modality"] == "text+image":
+ text = query["text"]
+ prompt = f"{image_placeholder}\n{text}"
+ image = query["image"]
+ else:
+ modality = query["modality"]
+ raise ValueError(f"Unsupported query modality: '{modality}'")
+
+ engine_args = EngineArgs(
+ model="Qwen/Qwen3-VL-Embedding-2B",
+ runner="pooling",
+ max_model_len=8192,
+ limit_mm_per_prompt={"image": 1},
+ )
+
+ return ModelRequestData(
+ engine_args=engine_args,
+ prompt=prompt,
+ image=image,
+ )
+
+
def run_siglip(query: Query) -> ModelRequestData:
if query["modality"] == "text":
prompt = query["text"]
@@ -353,6 +383,7 @@ model_example_map = {
"clip": run_clip,
"e5_v": run_e5_v,
"jinavl_reranker": run_jinavl_reranker,
+ "qwen3_vl": run_qwen3_vl,
"siglip": run_siglip,
"vlm2vec_phi3v": run_vlm2vec_phi3v,
"vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
diff --git a/examples/pooling/score/template/qwen3_vl_reranker.jinja b/examples/pooling/score/template/qwen3_vl_reranker.jinja
new file mode 100644
index 000000000..ed89f2a54
--- /dev/null
+++ b/examples/pooling/score/template/qwen3_vl_reranker.jinja
@@ -0,0 +1,23 @@
+<|im_start|>system
+Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>
+<|im_start|>user
+<Instruct>: {{
+    messages
+    | selectattr("role", "eq", "system")
+    | map(attribute="content")
+    | first
+    | default("Given a search query, retrieve relevant candidates that answer the query.")
+}}<Query>:{{
+    messages
+    | selectattr("role", "eq", "query")
+    | map(attribute="content")
+    | first
+}}
+<Document>:{{
+ messages
+ | selectattr("role", "eq", "document")
+ | map(attribute="content")
+ | first
+}}<|im_end|>
+<|im_start|>assistant
+
diff --git a/examples/pooling/score/vision_language_reranker.py b/examples/pooling/score/vision_language_reranker.py
new file mode 100644
index 000000000..657aced98
--- /dev/null
+++ b/examples/pooling/score/vision_language_reranker.py
@@ -0,0 +1,172 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+"""
+This example shows how to use vLLM for running offline inference with
+vision language reranker models for multimodal scoring tasks.
+
+Vision language rerankers score the relevance between a text query and
+multimodal documents (text + images/videos).
+"""
+
+from argparse import Namespace
+from collections.abc import Callable
+from dataclasses import asdict
+from pathlib import Path
+from typing import NamedTuple
+
+from vllm import LLM, EngineArgs
+from vllm.entrypoints.score_utils import ScoreMultiModalParam
+from vllm.utils.argparse_utils import FlexibleArgumentParser
+
+TEMPLATE_HOME = Path(__file__).parent / "template"
+
+
+class RerankModelData(NamedTuple):
+    """Engine configuration plus an optional chat template for one reranker."""
+
+    # Arguments used to construct the vLLM engine for the selected model.
+    engine_args: EngineArgs
+    # Chat template text forwarded to ``LLM.score``; stays ``None`` when the
+    # model builder does not supply one.
+    chat_template: str | None = None
+
+
+def run_jinavl_reranker(modality: str) -> RerankModelData:
+ assert modality == "image"
+
+ engine_args = EngineArgs(
+ model="jinaai/jina-reranker-m0",
+ runner="pooling",
+ max_model_len=32768,
+ trust_remote_code=True,
+ mm_processor_kwargs={
+ "min_pixels": 3136,
+ "max_pixels": 602112,
+ },
+ limit_mm_per_prompt={modality: 1},
+ )
+ return RerankModelData(
+ engine_args=engine_args,
+ )
+
+
+def run_qwen3_vl_reranker(modality: str) -> RerankModelData:
+ engine_args = EngineArgs(
+ model="Qwen/Qwen3-VL-Reranker-2B",
+ runner="pooling",
+ max_model_len=16384,
+ limit_mm_per_prompt={modality: 1},
+ # HuggingFace model configuration overrides required for compatibility
+ hf_overrides={
+ # Manually route to sequence classification architecture
+ # This tells vLLM to use Qwen3VLForSequenceClassification instead of
+ # the default Qwen3VLForConditionalGeneration
+ "architectures": ["Qwen3VLForSequenceClassification"],
+ # Specify which token logits to extract from the language model head
+ # The original reranker uses "no" and "yes" token logits for scoring
+ "classifier_from_token": ["no", "yes"],
+ # Enable special handling for original Qwen3-Reranker models
+ # This flag triggers conversion logic that transforms the two token
+ # vectors into a single classification vector
+ "is_original_qwen3_reranker": True,
+ },
+ )
+ chat_template_path = "qwen3_vl_reranker.jinja"
+ chat_template = (TEMPLATE_HOME / chat_template_path).read_text()
+ return RerankModelData(
+ engine_args=engine_args,
+ chat_template=chat_template,
+ )
+
+
+# Maps each ``--model-name`` choice to the function that builds its engine
+# arguments (and optional chat template) for a given modality.
+model_example_map: dict[str, Callable[[str], RerankModelData]] = {
+    "jinavl_reranker": run_jinavl_reranker,
+    "qwen3_vl_reranker": run_qwen3_vl_reranker,
+}
+
+
+def parse_args():
+ parser = FlexibleArgumentParser(
+ description="Demo on using vLLM for offline inference with "
+ "vision language reranker models for multimodal scoring tasks."
+ )
+ parser.add_argument(
+ "--model-name",
+ "-m",
+ type=str,
+ default="jinavl_reranker",
+ choices=model_example_map.keys(),
+ help="The name of the reranker model.",
+ )
+ parser.add_argument(
+ "--modality",
+ type=str,
+ default="image",
+ choices=["image", "video"],
+ help="Modality of the multimodal input (image or video).",
+ )
+ return parser.parse_args()
+
+
+def get_multi_modal_input(modality: str) -> tuple[str, ScoreMultiModalParam]:
+ # Sample query for testing the reranker
+ if modality == "image":
+ query = "A woman playing with her dog on a beach at sunset."
+ # Sample multimodal documents to be scored against the query
+ # Each document contains an image URL that will be fetched and processed
+ documents: ScoreMultiModalParam = {
+ "content": [
+ {
+ "type": "text",
+ "text": (
+ "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, " # noqa: E501
+ "as the dog offers its paw in a heartwarming display of companionship and trust." # noqa: E501
+ ),
+ },
+ {
+ "type": "image_url",
+ "image_url": {
+ "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
+ },
+ },
+ ]
+ }
+ elif modality == "video":
+ query = "A girl is drawing pictures on an ipad."
+ # Sample video documents to be scored against the query
+ documents: ScoreMultiModalParam = {
+ "content": [
+ {
+ "type": "text",
+ "text": "A girl is drawing a guitar on her ipad with Apple Pencil.",
+ },
+ {
+ "type": "video_url",
+ "video_url": {
+ "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
+ },
+ },
+ ]
+ }
+ else:
+ raise ValueError(f"Unsupported modality: {modality}")
+ return query, documents
+
+
+def main(args: Namespace):
+ # Run the selected reranker model
+ modality = args.modality
+ model_request = model_example_map[args.model_name](modality)
+ engine_args = model_request.engine_args
+
+ llm = LLM(**asdict(engine_args))
+
+ query, documents = get_multi_modal_input(modality)
+ outputs = llm.score(query, documents, chat_template=model_request.chat_template)
+
+ print("-" * 50)
+ print(f"Model: {engine_args.model}")
+ print(f"Modality: {modality}")
+ print(f"Query: {query}")
+ print("Relevance scores:", [output.outputs.score for output in outputs])
+ print("-" * 50)
+
+
+# Script entry point: parse the CLI options and run the example.
+if __name__ == "__main__":
+    args = parse_args()
+    main(args)
diff --git a/tests/models/registry.py b/tests/models/registry.py
index 570bcc734..9778678b3 100644
--- a/tests/models/registry.py
+++ b/tests/models/registry.py
@@ -612,6 +612,15 @@ _AUTOMATIC_CONVERTED_MODELS = {
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
),
"Qwen3ForTokenClassification": _HfExamplesInfo("bd2lcco/Qwen3-0.6B-finetuned"),
+ "Qwen3VLForSequenceClassification": _HfExamplesInfo(
+ "Qwen/Qwen3-VL-Reranker-2B",
+ is_available_online=False,
+ hf_overrides={
+ "architectures": ["Qwen3VLForSequenceClassification"],
+ "classifier_from_token": ["no", "yes"],
+ "is_original_qwen3_reranker": True,
+ },
+ ),
}
_MULTIMODAL_EXAMPLE_MODELS = {
diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py
index a3837d9d3..09ef8781b 100644
--- a/vllm/entrypoints/score_utils.py
+++ b/vllm/entrypoints/score_utils.py
@@ -11,6 +11,7 @@ from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageEmbedsParam,
ChatCompletionContentPartImageParam,
ChatCompletionContentPartTextParam,
+ ChatCompletionContentPartVideoParam,
ChatTemplateResolutionError,
MultiModalItemTracker,
_ContentPart,
@@ -27,6 +28,7 @@ ScoreContentPartParam: TypeAlias = (
ChatCompletionContentPartImageParam
| ChatCompletionContentPartImageEmbedsParam
| ChatCompletionContentPartTextParam
+ | ChatCompletionContentPartVideoParam
)
diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py
index 43303aa76..07fa72561 100644
--- a/vllm/model_executor/models/adapters.py
+++ b/vllm/model_executor/models/adapters.py
@@ -333,9 +333,14 @@ def as_seq_cls_model(cls: _T) -> _T:
)
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
- text_config = self.config.get_text_config()
- tokens = getattr(text_config, "classifier_from_token", None)
- method = getattr(text_config, "method", None)
+ hf_config = self.config
+ text_config = hf_config.get_text_config()
+ tokens = getattr(
+ hf_config,
+ "classifier_from_token",
+ getattr(text_config, "classifier_from_token", None),
+ )
+ method = getattr(hf_config, "method", getattr(text_config, "method", None))
def auto_set_score_bias(weights):
for name, weight in weights:
@@ -366,9 +371,14 @@ def as_seq_cls_model(cls: _T) -> _T:
class SequenceClassificationConfig(VerifyAndUpdateConfig):
@staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
- text_config = vllm_config.model_config.hf_config.get_text_config()
- method = getattr(text_config, "method", None)
- tokens = getattr(text_config, "classifier_from_token", None)
+ hf_config = vllm_config.model_config.hf_config
+ text_config = hf_config.get_text_config()
+ method = getattr(hf_config, "method", getattr(text_config, "method", None))
+ tokens = getattr(
+ hf_config,
+ "classifier_from_token",
+ getattr(text_config, "classifier_from_token", None),
+ )
if method is None:
return
@@ -378,8 +388,10 @@ class SequenceClassificationConfig(VerifyAndUpdateConfig):
if method == "from_2_way_softmax":
assert len(tokens) == 2
+ hf_config.num_labels = 1
text_config.num_labels = 1
else:
+ hf_config.num_labels = len(tokens)
text_config.num_labels = len(tokens)
# `llm as reranker` defaults to not using separating token.
@@ -396,9 +408,14 @@ def load_weights_using_from_2_way_softmax(
model_config = model.vllm_config.model_config
quant_config = model.vllm_config.quant_config
- text_config = model.config.get_text_config()
+ hf_config = model.config
+ text_config = hf_config.get_text_config()
- tokens = getattr(text_config, "classifier_from_token", [])
+ tokens = getattr(
+ hf_config,
+ "classifier_from_token",
+ getattr(text_config, "classifier_from_token", []),
+ )
tokens = cast(list[int], tokens)
assert len(tokens) == 2
@@ -409,10 +426,15 @@ def load_weights_using_from_2_way_softmax(
# embed_tokens is the assumed name for input embeddings. If the model does not
# have this attribute, we fall back to get_input_embeddings(), which is used by
# the Transformers modeling backend.
+ text_backbone = (
+ model.get_language_model().model
+ if hasattr(model, "get_language_model")
+ else model.model
+ )
embed_tokens = (
- model.model.embed_tokens
- if hasattr(model.model, "embed_tokens")
- else model.model.get_input_embeddings()
+ text_backbone.embed_tokens
+ if hasattr(text_backbone, "embed_tokens")
+ else text_backbone.get_input_embeddings()
)
model.lm_head = model.lm_head.tie_weights(embed_tokens)
@@ -516,8 +538,9 @@ def seq_cls_model_loader(model, weights: Iterable[tuple[str, torch.Tensor]]):
# - GemmaForCausalLM
# - bge-reranker-v2-gemma
- text_config = model.vllm_config.model_config.hf_config.get_text_config()
- method = getattr(text_config, "method", None)
+ hf_config = model.vllm_config.model_config.hf_config
+ text_config = hf_config.get_text_config()
+ method = getattr(hf_config, "method", getattr(text_config, "method", None))
assert method in SEQ_CLS_LOAD_METHODS, f"method {method} not supported"
return SEQ_CLS_LOAD_METHODS[method](model, weights)
diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py
index 362c194d8..9ef038d84 100644
--- a/vllm/model_executor/models/config.py
+++ b/vllm/model_executor/models/config.py
@@ -256,6 +256,10 @@ class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig):
model_config.hf_config.method = "from_2_way_softmax"
+class Qwen3VLForSequenceClassificationConfig(Qwen3ForSequenceClassificationConfig):
+    """Config hook for Qwen3-VL sequence classification.
+
+    Inherits everything from ``Qwen3ForSequenceClassificationConfig`` without
+    overriding anything.
+    """
+
+    pass
+
+
class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig):
@staticmethod
def verify_and_update_model_config(model_config: "ModelConfig") -> None:
@@ -551,6 +555,7 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
"Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig,
"Qwen2ForRewardModel": Qwen2ForRewardModelConfig,
"Qwen3ForSequenceClassification": Qwen3ForSequenceClassificationConfig,
+ "Qwen3VLForSequenceClassification": Qwen3VLForSequenceClassificationConfig,
"XLMRobertaModel": JinaRobertaModelConfig,
"JinaVLForRanking": JinaVLForSequenceClassificationConfig,
"JambaForSequenceClassification": JambaForSequenceClassificationConfig,