[Models] Allow converting Qwen3-VL into Reranker model (#31890)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -800,6 +800,7 @@ The following table lists those that are tested in vLLM.
|
||||
| `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | |
|
||||
| `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ |
|
||||
| `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ |
|
||||
| `Qwen3VLForConditionalGeneration`<sup>C</sup> | Qwen3-VL | T + I + V | `Qwen/Qwen3-VL-Embedding-2B`, etc. | ✅︎ | ✅︎ |
|
||||
| `SiglipModel` | SigLIP, SigLIP2 | T / I | `google/siglip-base-patch16-224`, `google/siglip2-base-patch16-224` | | |
|
||||
| `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* |
|
||||
|
||||
@@ -816,10 +817,18 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A
|
||||
| Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) |
|
||||
|--------------|--------|--------|-------------------|----------------------|---------------------------|
|
||||
| `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | ✅︎ | ✅︎ |
|
||||
| `Qwen3VLForSequenceClassification` | Qwen3-VL-Reranker | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-Reranker-2B` (see note), etc. | ✅︎ | ✅︎ |
|
||||
|
||||
<sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion))
|
||||
\* Feature support is the same as that of the original model.
|
||||
|
||||
!!! note
|
||||
    Similar to Qwen3-Reranker, you need to pass the following `--hf_overrides` to load the original official `Qwen3-VL-Reranker` checkpoints.
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen3-VL-Reranker-2B --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}'
|
||||
```
|
||||
|
||||
## Model Support Policy
|
||||
|
||||
At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support:
|
||||
|
||||
@@ -133,6 +133,36 @@ def run_jinavl_reranker(query: Query) -> ModelRequestData:
|
||||
)
|
||||
|
||||
|
||||
def run_qwen3_vl(query: Query) -> ModelRequestData:
    """Build the prompt and engine configuration for Qwen3-VL embedding.

    The prompt depends on the query modality: plain text, an image
    placeholder, or a placeholder followed by the text on a new line.
    """
    image_placeholder = "<vision_start><|image_pad|><vision_end>"
    modality = query["modality"]

    if modality == "text":
        prompt, image = query["text"], None
    elif modality == "image":
        prompt, image = image_placeholder, query["image"]
    elif modality == "text+image":
        prompt = f"{image_placeholder}\n{query['text']}"
        image = query["image"]
    else:
        raise ValueError(f"Unsupported query modality: '{modality}'")

    engine_args = EngineArgs(
        model="Qwen/Qwen3-VL-Embedding-2B",
        runner="pooling",
        max_model_len=8192,
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image=image,
    )
|
||||
|
||||
|
||||
def run_siglip(query: Query) -> ModelRequestData:
|
||||
if query["modality"] == "text":
|
||||
prompt = query["text"]
|
||||
@@ -353,6 +383,7 @@ model_example_map = {
|
||||
"clip": run_clip,
|
||||
"e5_v": run_e5_v,
|
||||
"jinavl_reranker": run_jinavl_reranker,
|
||||
"qwen3_vl": run_qwen3_vl,
|
||||
"siglip": run_siglip,
|
||||
"vlm2vec_phi3v": run_vlm2vec_phi3v,
|
||||
"vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
|
||||
|
||||
23
examples/pooling/score/template/qwen3_vl_reranker.jinja
Normal file
23
examples/pooling/score/template/qwen3_vl_reranker.jinja
Normal file
@@ -0,0 +1,23 @@
|
||||
<|im_start|>system
|
||||
Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|>
|
||||
<|im_start|>user
|
||||
<Instruct>: {{
|
||||
messages
|
||||
| selectattr("role", "eq", "system")
|
||||
| map(attribute="content")
|
||||
| first
|
||||
| default("Given a search query, retrieve relevant candidates that answer the query.")
|
||||
}}<Query>:{{
|
||||
messages
|
||||
| selectattr("role", "eq", "query")
|
||||
| map(attribute="content")
|
||||
| first
|
||||
}}
|
||||
<Document>:{{
|
||||
messages
|
||||
| selectattr("role", "eq", "document")
|
||||
| map(attribute="content")
|
||||
| first
|
||||
}}<|im_end|>
|
||||
<|im_start|>assistant
|
||||
|
||||
172
examples/pooling/score/vision_language_reranker.py
Normal file
172
examples/pooling/score/vision_language_reranker.py
Normal file
@@ -0,0 +1,172 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This example shows how to use vLLM for running offline inference with
|
||||
vision language reranker models for multimodal scoring tasks.
|
||||
|
||||
Vision language rerankers score the relevance between a text query and
|
||||
multimodal documents (text + images/videos).
|
||||
"""
|
||||
|
||||
from argparse import Namespace
|
||||
from collections.abc import Callable
|
||||
from dataclasses import asdict
|
||||
from pathlib import Path
|
||||
from typing import NamedTuple
|
||||
|
||||
from vllm import LLM, EngineArgs
|
||||
from vllm.entrypoints.score_utils import ScoreMultiModalParam
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
TEMPLATE_HOME = Path(__file__).parent / "template"
|
||||
|
||||
|
||||
class RerankModelData(NamedTuple):
    """Per-model demo setup: engine arguments plus an optional chat template."""

    # Arguments used to construct the vLLM engine for this model.
    engine_args: EngineArgs
    # Scoring chat template text loaded from the template directory; left as
    # None when the model does not need a custom template.
    chat_template: str | None = None
|
||||
|
||||
|
||||
def run_jinavl_reranker(modality: str) -> RerankModelData:
    """Engine configuration for the ``jinaai/jina-reranker-m0`` reranker."""
    # This example only supports image documents for the JinaVL reranker.
    assert modality == "image"

    args = EngineArgs(
        model="jinaai/jina-reranker-m0",
        runner="pooling",
        max_model_len=32768,
        trust_remote_code=True,
        # Bound the image resolution seen by the multimodal processor.
        mm_processor_kwargs={"min_pixels": 3136, "max_pixels": 602112},
        limit_mm_per_prompt={modality: 1},
    )
    # No custom chat template is needed; the model default is used.
    return RerankModelData(engine_args=args)
|
||||
|
||||
|
||||
def run_qwen3_vl_reranker(modality: str) -> RerankModelData:
    """Engine configuration and chat template for Qwen3-VL-Reranker."""
    # The official checkpoint ships with a generative architecture, so the
    # HF config must be overridden to route it through sequence classification.
    overrides = {
        # Use Qwen3VLForSequenceClassification instead of the default
        # Qwen3VLForConditionalGeneration architecture.
        "architectures": ["Qwen3VLForSequenceClassification"],
        # The original reranker scores from the "no"/"yes" token logits of
        # the language-model head.
        "classifier_from_token": ["no", "yes"],
        # Enables the conversion logic that folds the two token vectors into
        # a single classification vector (original Qwen3-Reranker handling).
        "is_original_qwen3_reranker": True,
    }

    engine_args = EngineArgs(
        model="Qwen/Qwen3-VL-Reranker-2B",
        runner="pooling",
        max_model_len=16384,
        limit_mm_per_prompt={modality: 1},
        hf_overrides=overrides,
    )

    template_text = (TEMPLATE_HOME / "qwen3_vl_reranker.jinja").read_text()
    return RerankModelData(
        engine_args=engine_args,
        chat_template=template_text,
    )
|
||||
|
||||
|
||||
# Registry mapping the --model-name CLI choice to the factory that builds
# that model's engine configuration and chat template.
model_example_map: dict[str, Callable[[str], RerankModelData]] = {
    "jinavl_reranker": run_jinavl_reranker,
    "qwen3_vl_reranker": run_qwen3_vl_reranker,
}
|
||||
|
||||
|
||||
def parse_args():
    """Parse CLI options selecting the reranker model and input modality."""
    parser = FlexibleArgumentParser(
        description="Demo on using vLLM for offline inference with "
        "vision language reranker models for multimodal scoring tasks."
    )
    model_opts = dict(
        type=str,
        default="jinavl_reranker",
        choices=model_example_map.keys(),
        help="The name of the reranker model.",
    )
    parser.add_argument("--model-name", "-m", **model_opts)
    modality_opts = dict(
        type=str,
        default="image",
        choices=["image", "video"],
        help="Modality of the multimodal input (image or video).",
    )
    parser.add_argument("--modality", **modality_opts)
    return parser.parse_args()
|
||||
|
||||
|
||||
def get_multi_modal_input(modality: str) -> tuple[str, ScoreMultiModalParam]:
    """Return a sample (query, documents) pair for the given modality.

    The documents contain one text candidate and one media (image/video) URL
    candidate to be fetched and scored against the query.

    Raises:
        ValueError: if *modality* is neither "image" nor "video".
    """
    if modality == "image":
        query = "A woman playing with her dog on a beach at sunset."
        text_doc = {
            "type": "text",
            "text": (
                "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, "  # noqa: E501
                "as the dog offers its paw in a heartwarming display of companionship and trust."  # noqa: E501
            ),
        }
        media_doc = {
            "type": "image_url",
            "image_url": {
                "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
            },
        }
    elif modality == "video":
        query = "A girl is drawing pictures on an ipad."
        text_doc = {
            "type": "text",
            "text": "A girl is drawing a guitar on her ipad with Apple Pencil.",
        }
        media_doc = {
            "type": "video_url",
            "video_url": {
                "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
            },
        }
    else:
        raise ValueError(f"Unsupported modality: {modality}")

    documents: ScoreMultiModalParam = {"content": [text_doc, media_doc]}
    return query, documents
|
||||
|
||||
|
||||
def main(args: Namespace):
    """Run the selected reranker model and print the relevance scores."""
    modality = args.modality
    request = model_example_map[args.model_name](modality)

    # Construct the engine from the example's EngineArgs.
    llm = LLM(**asdict(request.engine_args))

    query, documents = get_multi_modal_input(modality)
    outputs = llm.score(query, documents, chat_template=request.chat_template)

    separator = "-" * 50
    print(separator)
    print(f"Model: {request.engine_args.model}")
    print(f"Modality: {modality}")
    print(f"Query: {query}")
    scores = [output.outputs.score for output in outputs]
    print("Relevance scores:", scores)
    print(separator)
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point for the offline multimodal reranker demo.
    args = parse_args()
    main(args)
|
||||
@@ -612,6 +612,15 @@ _AUTOMATIC_CONVERTED_MODELS = {
|
||||
"tomaarsen/Qwen3-Reranker-0.6B-seq-cls"
|
||||
),
|
||||
"Qwen3ForTokenClassification": _HfExamplesInfo("bd2lcco/Qwen3-0.6B-finetuned"),
|
||||
"Qwen3VLForSequenceClassification": _HfExamplesInfo(
|
||||
"Qwen/Qwen3-VL-Reranker-2B",
|
||||
is_available_online=False,
|
||||
hf_overrides={
|
||||
"architectures": ["Qwen3VLForSequenceClassification"],
|
||||
"classifier_from_token": ["no", "yes"],
|
||||
"is_original_qwen3_reranker": True,
|
||||
},
|
||||
),
|
||||
}
|
||||
|
||||
_MULTIMODAL_EXAMPLE_MODELS = {
|
||||
|
||||
@@ -11,6 +11,7 @@ from vllm.entrypoints.chat_utils import (
|
||||
ChatCompletionContentPartImageEmbedsParam,
|
||||
ChatCompletionContentPartImageParam,
|
||||
ChatCompletionContentPartTextParam,
|
||||
ChatCompletionContentPartVideoParam,
|
||||
ChatTemplateResolutionError,
|
||||
MultiModalItemTracker,
|
||||
_ContentPart,
|
||||
@@ -27,6 +28,7 @@ ScoreContentPartParam: TypeAlias = (
|
||||
ChatCompletionContentPartImageParam
|
||||
| ChatCompletionContentPartImageEmbedsParam
|
||||
| ChatCompletionContentPartTextParam
|
||||
| ChatCompletionContentPartVideoParam
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -333,9 +333,14 @@ def as_seq_cls_model(cls: _T) -> _T:
|
||||
)
|
||||
|
||||
def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
text_config = self.config.get_text_config()
|
||||
tokens = getattr(text_config, "classifier_from_token", None)
|
||||
method = getattr(text_config, "method", None)
|
||||
hf_config = self.config
|
||||
text_config = hf_config.get_text_config()
|
||||
tokens = getattr(
|
||||
hf_config,
|
||||
"classifier_from_token",
|
||||
getattr(text_config, "classifier_from_token", None),
|
||||
)
|
||||
method = getattr(hf_config, "method", getattr(text_config, "method", None))
|
||||
|
||||
def auto_set_score_bias(weights):
|
||||
for name, weight in weights:
|
||||
@@ -366,9 +371,14 @@ def as_seq_cls_model(cls: _T) -> _T:
|
||||
class SequenceClassificationConfig(VerifyAndUpdateConfig):
|
||||
@staticmethod
|
||||
def verify_and_update_config(vllm_config: "VllmConfig") -> None:
|
||||
text_config = vllm_config.model_config.hf_config.get_text_config()
|
||||
method = getattr(text_config, "method", None)
|
||||
tokens = getattr(text_config, "classifier_from_token", None)
|
||||
hf_config = vllm_config.model_config.hf_config
|
||||
text_config = hf_config.get_text_config()
|
||||
method = getattr(hf_config, "method", getattr(text_config, "method", None))
|
||||
tokens = getattr(
|
||||
hf_config,
|
||||
"classifier_from_token",
|
||||
getattr(text_config, "classifier_from_token", None),
|
||||
)
|
||||
|
||||
if method is None:
|
||||
return
|
||||
@@ -378,8 +388,10 @@ class SequenceClassificationConfig(VerifyAndUpdateConfig):
|
||||
|
||||
if method == "from_2_way_softmax":
|
||||
assert len(tokens) == 2
|
||||
hf_config.num_labels = 1
|
||||
text_config.num_labels = 1
|
||||
else:
|
||||
hf_config.num_labels = len(tokens)
|
||||
text_config.num_labels = len(tokens)
|
||||
|
||||
# `llm as reranker` defaults to not using separating token.
|
||||
@@ -396,9 +408,14 @@ def load_weights_using_from_2_way_softmax(
|
||||
|
||||
model_config = model.vllm_config.model_config
|
||||
quant_config = model.vllm_config.quant_config
|
||||
text_config = model.config.get_text_config()
|
||||
hf_config = model.config
|
||||
text_config = hf_config.get_text_config()
|
||||
|
||||
tokens = getattr(text_config, "classifier_from_token", [])
|
||||
tokens = getattr(
|
||||
hf_config,
|
||||
"classifier_from_token",
|
||||
getattr(text_config, "classifier_from_token", []),
|
||||
)
|
||||
tokens = cast(list[int], tokens)
|
||||
assert len(tokens) == 2
|
||||
|
||||
@@ -409,10 +426,15 @@ def load_weights_using_from_2_way_softmax(
|
||||
# embed_tokens is the assumed name for input embeddings. If the model does not
|
||||
# have this attribute, we fall back to get_input_embeddings(), which is used by
|
||||
# the Transformers modeling backend.
|
||||
text_backbone = (
|
||||
model.get_language_model().model
|
||||
if hasattr(model, "get_language_model")
|
||||
else model.model
|
||||
)
|
||||
embed_tokens = (
|
||||
model.model.embed_tokens
|
||||
if hasattr(model.model, "embed_tokens")
|
||||
else model.model.get_input_embeddings()
|
||||
text_backbone.embed_tokens
|
||||
if hasattr(text_backbone, "embed_tokens")
|
||||
else text_backbone.get_input_embeddings()
|
||||
)
|
||||
model.lm_head = model.lm_head.tie_weights(embed_tokens)
|
||||
|
||||
@@ -516,8 +538,9 @@ def seq_cls_model_loader(model, weights: Iterable[tuple[str, torch.Tensor]]):
|
||||
# - GemmaForCausalLM
|
||||
# - bge-reranker-v2-gemma
|
||||
|
||||
text_config = model.vllm_config.model_config.hf_config.get_text_config()
|
||||
method = getattr(text_config, "method", None)
|
||||
hf_config = model.vllm_config.model_config.hf_config
|
||||
text_config = hf_config.get_text_config()
|
||||
method = getattr(hf_config, "method", getattr(text_config, "method", None))
|
||||
assert method in SEQ_CLS_LOAD_METHODS, f"method {method} not supported"
|
||||
return SEQ_CLS_LOAD_METHODS[method](model, weights)
|
||||
|
||||
|
||||
@@ -256,6 +256,10 @@ class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig):
|
||||
model_config.hf_config.method = "from_2_way_softmax"
|
||||
|
||||
|
||||
class Qwen3VLForSequenceClassificationConfig(Qwen3ForSequenceClassificationConfig):
    # Qwen3-VL rerankers reuse the Qwen3 sequence-classification config
    # handling unchanged (including the "from_2_way_softmax" method the
    # parent sets on the HF config).
    pass
|
||||
|
||||
|
||||
class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig):
|
||||
@staticmethod
|
||||
def verify_and_update_model_config(model_config: "ModelConfig") -> None:
|
||||
@@ -551,6 +555,7 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = {
|
||||
"Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig,
|
||||
"Qwen2ForRewardModel": Qwen2ForRewardModelConfig,
|
||||
"Qwen3ForSequenceClassification": Qwen3ForSequenceClassificationConfig,
|
||||
"Qwen3VLForSequenceClassification": Qwen3VLForSequenceClassificationConfig,
|
||||
"XLMRobertaModel": JinaRobertaModelConfig,
|
||||
"JinaVLForRanking": JinaVLForSequenceClassificationConfig,
|
||||
"JambaForSequenceClassification": JambaForSequenceClassificationConfig,
|
||||
|
||||
Reference in New Issue
Block a user