[Models] Allow converting Qwen3-VL into Reranker model (#31890)

Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
Isotr0py
2026-01-08 16:10:15 +08:00
committed by GitHub
parent 573a1d1119
commit eac3b96ec0
8 changed files with 287 additions and 13 deletions

View File

@@ -800,6 +800,7 @@ The following table lists those that are tested in vLLM.
| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | | | `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | |
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ | | `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ |
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ | | `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ |
| `Qwen3VLForConditionalGeneration`<sup>C</sup> | Qwen3-VL | T + I + V | `Qwen/Qwen3-VL-Embedding-2B`, etc. | ✅︎ | ✅︎ |
| `SiglipModel` | SigLIP, SigLIP2 | T / I | `google/siglip-base-patch16-224`, `google/siglip2-base-patch16-224` | | | | `SiglipModel` | SigLIP, SigLIP2 | T / I | `google/siglip-base-patch16-224`, `google/siglip2-base-patch16-224` | | |
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* | | `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* |
@@ -816,10 +817,18 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | | Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|--------------|--------|--------|-------------------|----------------------|---------------------------| |--------------|--------|--------|-------------------|----------------------|---------------------------|
| `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | ✅︎ | ✅︎ | | `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | ✅︎ | ✅︎ |
| `Qwen3VLForSequenceClassification` | Qwen3-VL-Reranker | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-Reranker-2B` (see note), etc. | ✅︎ | ✅︎ |
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion)) <sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))
\* Feature support is the same as that of the original model. \* Feature support is the same as that of the original model.
!!! note
    Similar to Qwen3-Reranker, you need to pass the following `--hf_overrides` to load the official `Qwen3-VL-Reranker` checkpoints.
```bash
vllm serve Qwen/Qwen3-VL-Reranker-2B --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
```
## Model Support Policy ## Model Support Policy
At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Heres how we manage third-party model support: At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Heres how we manage third-party model support:

View File

@@ -133,6 +133,36 @@ def run_jinavl_reranker(query: Query) -> ModelRequestData:
) )
def run_qwen3_vl(query: Query) -> ModelRequestData:
    """Build the prompt and engine configuration for Qwen3-VL embedding.

    Supports text-only, image-only, and combined text+image queries; raises
    ValueError for any other modality.
    """
    vision_tokens = "<vision_start><|image_pad|><vision_end>"
    modality = query["modality"]
    image = None

    if modality == "text":
        prompt = query["text"]
    elif modality == "image":
        prompt = vision_tokens
        image = query["image"]
    elif modality == "text+image":
        prompt = f"{vision_tokens}\n{query['text']}"
        image = query["image"]
    else:
        raise ValueError(f"Unsupported query modality: '{modality}'")

    return ModelRequestData(
        engine_args=EngineArgs(
            model="Qwen/Qwen3-VL-Embedding-2B",
            runner="pooling",
            max_model_len=8192,
            limit_mm_per_prompt={"image": 1},
        ),
        prompt=prompt,
        image=image,
    )
def run_siglip(query: Query) -> ModelRequestData: def run_siglip(query: Query) -> ModelRequestData:
if query["modality"] == "text": if query["modality"] == "text":
prompt = query["text"] prompt = query["text"]
@@ -353,6 +383,7 @@ model_example_map = {
"clip": run_clip, "clip": run_clip,
"e5_v": run_e5_v, "e5_v": run_e5_v,
"jinavl_reranker": run_jinavl_reranker, "jinavl_reranker": run_jinavl_reranker,
"qwen3_vl": run_qwen3_vl,
"siglip": run_siglip, "siglip": run_siglip,
"vlm2vec_phi3v": run_vlm2vec_phi3v, "vlm2vec_phi3v": run_vlm2vec_phi3v,
"vlm2vec_qwen2vl": run_vlm2vec_qwen2vl, "vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,

View File

@@ -0,0 +1,23 @@
<|im_start|>system
Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>
<|im_start|>user
<Instruct>: {{
messages
| selectattr("role", "eq", "system")
| map(attribute="content")
| first
| default("Given a search query, retrieve relevant candidates that answer the query.")
}}<Query>:{{
messages
| selectattr("role", "eq", "query")
| map(attribute="content")
| first
}}
<Document>:{{
messages
| selectattr("role", "eq", "document")
| map(attribute="content")
| first
}}<|im_end|>
<|im_start|>assistant

View File

@@ -0,0 +1,172 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
vision language reranker models for multimodal scoring tasks.
Vision language rerankers score the relevance between a text query and
multimodal documents (text + images/videos).
"""
from argparse import Namespace
from collections.abc import Callable
from dataclasses import asdict
from pathlib import Path
from typing import NamedTuple
from vllm import LLM, EngineArgs
from vllm.entrypoints.score_utils import ScoreMultiModalParam
from vllm.utils.argparse_utils import FlexibleArgumentParser
TEMPLATE_HOME = Path(__file__).parent / "template"
class RerankModelData(NamedTuple):
    """Per-model settings: engine arguments plus an optional chat template."""

    # vLLM engine configuration used to instantiate the LLM.
    engine_args: EngineArgs
    # Raw Jinja chat template text, or None to use the model's built-in default.
    chat_template: str | None = None
def run_jinavl_reranker(modality: str) -> RerankModelData:
    """Configure the jinaai/jina-reranker-m0 model (image inputs only)."""
    # This reranker only supports image documents in this demo.
    assert modality == "image"

    return RerankModelData(
        engine_args=EngineArgs(
            model="jinaai/jina-reranker-m0",
            runner="pooling",
            max_model_len=32768,
            trust_remote_code=True,
            mm_processor_kwargs={
                "min_pixels": 3136,
                "max_pixels": 602112,
            },
            limit_mm_per_prompt={modality: 1},
        ),
    )
def run_qwen3_vl_reranker(modality: str) -> RerankModelData:
    """Configure the original Qwen3-VL-Reranker checkpoint for scoring."""
    # The official checkpoint ships as a generative model, so several
    # HuggingFace config overrides are required to run it as a reranker.
    overrides = {
        # Manually route to the sequence-classification architecture instead
        # of the default Qwen3VLForConditionalGeneration.
        "architectures": ["Qwen3VLForSequenceClassification"],
        # The original reranker scores from the "no"/"yes" token logits of
        # the language-model head.
        "classifier_from_token": ["no", "yes"],
        # Enables the conversion logic that folds the two token vectors into
        # a single classification vector.
        "is_original_qwen3_reranker": True,
    }
    template_text = (TEMPLATE_HOME / "qwen3_vl_reranker.jinja").read_text()

    return RerankModelData(
        engine_args=EngineArgs(
            model="Qwen/Qwen3-VL-Reranker-2B",
            runner="pooling",
            max_model_len=16384,
            limit_mm_per_prompt={modality: 1},
            hf_overrides=overrides,
        ),
        chat_template=template_text,
    )
# Maps the CLI --model-name choice to its configuration builder.
model_example_map: dict[str, Callable[[str], RerankModelData]] = {
    "jinavl_reranker": run_jinavl_reranker,
    "qwen3_vl_reranker": run_qwen3_vl_reranker,
}
def parse_args():
    """Parse command-line options: reranker model name and input modality."""
    arg_parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "vision language reranker models for multimodal scoring tasks."
    )

    arg_parser.add_argument(
        "--model-name",
        "-m",
        type=str,
        choices=model_example_map.keys(),
        default="jinavl_reranker",
        help="The name of the reranker model.",
    )
    arg_parser.add_argument(
        "--modality",
        type=str,
        choices=["image", "video"],
        default="image",
        help="Modality of the multimodal input (image or video).",
    )

    return arg_parser.parse_args()
def get_multi_modal_input(modality: str) -> tuple[str, ScoreMultiModalParam]:
    """Return a sample (query, documents) pair for the given modality.

    The documents hold one text entry and one image/video URL entry to be
    scored against the query; unknown modalities raise ValueError.
    """
    if modality == "image":
        # One text document and one image-URL document (fetched at runtime).
        image_documents: ScoreMultiModalParam = {
            "content": [
                {
                    "type": "text",
                    "text": (
                        "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "  # noqa: E501
                        "as the dog offers its paw in a heartwarming display of companionship and trust."  # noqa: E501
                    ),
                },
                {
                    "type": "image_url",
                    "image_url": {
                        "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
                    },
                },
            ]
        }
        return "A woman playing with her dog on a beach at sunset.", image_documents

    if modality == "video":
        # One text document and one video-URL document.
        video_documents: ScoreMultiModalParam = {
            "content": [
                {
                    "type": "text",
                    "text": "A girl is drawing a guitar on her ipad with Apple Pencil.",
                },
                {
                    "type": "video_url",
                    "video_url": {
                        "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
                    },
                },
            ]
        }
        return "A girl is drawing pictures on an ipad.", video_documents

    raise ValueError(f"Unsupported modality: {modality}")
def main(args: Namespace):
    """Run the selected reranker on sample inputs and print the scores."""
    modality = args.modality
    request = model_example_map[args.model_name](modality)
    llm = LLM(**asdict(request.engine_args))

    query, documents = get_multi_modal_input(modality)
    outputs = llm.score(query, documents, chat_template=request.chat_template)

    separator = "-" * 50
    print(separator)
    print(f"Model: {request.engine_args.model}")
    print(f"Modality: {modality}")
    print(f"Query: {query}")
    print("Relevance scores:", [out.outputs.score for out in outputs])
    print(separator)


if __name__ == "__main__":
    main(parse_args())

View File

@@ -612,6 +612,15 @@ _AUTOMATIC_CONVERTED_MODELS = {
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls" "tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
), ),
"Qwen3ForTokenClassification": _HfExamplesInfo("bd2lcco/Qwen3-0.6B-finetuned"), "Qwen3ForTokenClassification": _HfExamplesInfo("bd2lcco/Qwen3-0.6B-finetuned"),
"Qwen3VLForSequenceClassification": _HfExamplesInfo(
"Qwen/Qwen3-VL-Reranker-2B",
is_available_online=False,
hf_overrides={
"architectures": ["Qwen3VLForSequenceClassification"],
"classifier_from_token": ["no", "yes"],
"is_original_qwen3_reranker": True,
},
),
} }
_MULTIMODAL_EXAMPLE_MODELS = { _MULTIMODAL_EXAMPLE_MODELS = {

View File

@@ -11,6 +11,7 @@ from vllm.entrypoints.chat_utils import (
ChatCompletionContentPartImageEmbedsParam, ChatCompletionContentPartImageEmbedsParam,
ChatCompletionContentPartImageParam, ChatCompletionContentPartImageParam,
ChatCompletionContentPartTextParam, ChatCompletionContentPartTextParam,
ChatCompletionContentPartVideoParam,
ChatTemplateResolutionError, ChatTemplateResolutionError,
MultiModalItemTracker, MultiModalItemTracker,
_ContentPart, _ContentPart,
@@ -27,6 +28,7 @@ ScoreContentPartParam: TypeAlias = (
ChatCompletionContentPartImageParam ChatCompletionContentPartImageParam
| ChatCompletionContentPartImageEmbedsParam | ChatCompletionContentPartImageEmbedsParam
| ChatCompletionContentPartTextParam | ChatCompletionContentPartTextParam
| ChatCompletionContentPartVideoParam
) )

View File

@@ -333,9 +333,14 @@ def as_seq_cls_model(cls: _T) -> _T:
) )
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
text_config = self.config.get_text_config() hf_config = self.config
tokens = getattr(text_config, "classifier_from_token", None) text_config = hf_config.get_text_config()
method = getattr(text_config, "method", None) tokens = getattr(
hf_config,
"classifier_from_token",
getattr(text_config, "classifier_from_token", None),
)
method = getattr(hf_config, "method", getattr(text_config, "method", None))
def auto_set_score_bias(weights): def auto_set_score_bias(weights):
for name, weight in weights: for name, weight in weights:
@@ -366,9 +371,14 @@ def as_seq_cls_model(cls: _T) -> _T:
class SequenceClassificationConfig(VerifyAndUpdateConfig): class SequenceClassificationConfig(VerifyAndUpdateConfig):
@staticmethod @staticmethod
def verify_and_update_config(vllm_config: "VllmConfig") -> None: def verify_and_update_config(vllm_config: "VllmConfig") -> None:
text_config = vllm_config.model_config.hf_config.get_text_config() hf_config = vllm_config.model_config.hf_config
method = getattr(text_config, "method", None) text_config = hf_config.get_text_config()
tokens = getattr(text_config, "classifier_from_token", None) method = getattr(hf_config, "method", getattr(text_config, "method", None))
tokens = getattr(
hf_config,
"classifier_from_token",
getattr(text_config, "classifier_from_token", None),
)
if method is None: if method is None:
return return
@@ -378,8 +388,10 @@ class SequenceClassificationConfig(VerifyAndUpdateConfig):
if method == "from_2_way_softmax": if method == "from_2_way_softmax":
assert len(tokens) == 2 assert len(tokens) == 2
hf_config.num_labels = 1
text_config.num_labels = 1 text_config.num_labels = 1
else: else:
hf_config.num_labels = len(tokens)
text_config.num_labels = len(tokens) text_config.num_labels = len(tokens)
# `llm as reranker` defaults to not using separating token. # `llm as reranker` defaults to not using separating token.
@@ -396,9 +408,14 @@ def load_weights_using_from_2_way_softmax(
model_config = model.vllm_config.model_config model_config = model.vllm_config.model_config
quant_config = model.vllm_config.quant_config quant_config = model.vllm_config.quant_config
text_config = model.config.get_text_config() hf_config = model.config
text_config = hf_config.get_text_config()
tokens = getattr(text_config, "classifier_from_token", []) tokens = getattr(
hf_config,
"classifier_from_token",
getattr(text_config, "classifier_from_token", []),
)
tokens = cast(list[int], tokens) tokens = cast(list[int], tokens)
assert len(tokens) == 2 assert len(tokens) == 2
@@ -409,10 +426,15 @@ def load_weights_using_from_2_way_softmax(
# embed_tokens is the assumed name for input embeddings. If the model does not # embed_tokens is the assumed name for input embeddings. If the model does not
# have this attribute, we fall back to get_input_embeddings(), which is used by # have this attribute, we fall back to get_input_embeddings(), which is used by
# the Transformers modeling backend. # the Transformers modeling backend.
text_backbone = (
model.get_language_model().model
if hasattr(model, "get_language_model")
else model.model
)
embed_tokens = ( embed_tokens = (
model.model.embed_tokens text_backbone.embed_tokens
if hasattr(model.model, "embed_tokens") if hasattr(text_backbone, "embed_tokens")
else model.model.get_input_embeddings() else text_backbone.get_input_embeddings()
) )
model.lm_head = model.lm_head.tie_weights(embed_tokens) model.lm_head = model.lm_head.tie_weights(embed_tokens)
@@ -516,8 +538,9 @@ def seq_cls_model_loader(model, weights: Iterable[tuple[str, torch.Tensor]]):
# - GemmaForCausalLM # - GemmaForCausalLM
# - bge-reranker-v2-gemma # - bge-reranker-v2-gemma
text_config = model.vllm_config.model_config.hf_config.get_text_config() hf_config = model.vllm_config.model_config.hf_config
method = getattr(text_config, "method", None) text_config = hf_config.get_text_config()
method = getattr(hf_config, "method", getattr(text_config, "method", None))
assert method in SEQ_CLS_LOAD_METHODS, f"method {method} not supported" assert method in SEQ_CLS_LOAD_METHODS, f"method {method} not supported"
return SEQ_CLS_LOAD_METHODS[method](model, weights) return SEQ_CLS_LOAD_METHODS[method](model, weights)

View File

@@ -256,6 +256,10 @@ class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig):
model_config.hf_config.method = "from_2_way_softmax" model_config.hf_config.method = "from_2_way_softmax"
class Qwen3VLForSequenceClassificationConfig(Qwen3ForSequenceClassificationConfig):
    """Qwen3-VL reranker reuses the Qwen3 sequence-classification config update unchanged."""

    pass
class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig): class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig):
@staticmethod @staticmethod
def verify_and_update_model_config(model_config: "ModelConfig") -> None: def verify_and_update_model_config(model_config: "ModelConfig") -> None:
@@ -551,6 +555,7 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
"Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig, "Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig,
"Qwen2ForRewardModel": Qwen2ForRewardModelConfig, "Qwen2ForRewardModel": Qwen2ForRewardModelConfig,
"Qwen3ForSequenceClassification": Qwen3ForSequenceClassificationConfig, "Qwen3ForSequenceClassification": Qwen3ForSequenceClassificationConfig,
"Qwen3VLForSequenceClassification": Qwen3VLForSequenceClassificationConfig,
"XLMRobertaModel": JinaRobertaModelConfig, "XLMRobertaModel": JinaRobertaModelConfig,
"JinaVLForRanking": JinaVLForSequenceClassificationConfig, "JinaVLForRanking": JinaVLForSequenceClassificationConfig,
"JambaForSequenceClassification": JambaForSequenceClassificationConfig, "JambaForSequenceClassification": JambaForSequenceClassificationConfig,