173 lines
5.8 KiB
Python
173 lines
5.8 KiB
Python
# SPDX-License-Identifier: Apache-2.0
|
|
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
|
"""
|
|
This example shows how to use vLLM for running offline inference with
|
|
vision language reranker models for multimodal scoring tasks.
|
|
|
|
Vision language rerankers score the relevance between a text query and
|
|
multimodal documents (text + images/videos).
|
|
"""
|
|
|
|
from argparse import Namespace
|
|
from collections.abc import Callable
|
|
from dataclasses import asdict
|
|
from pathlib import Path
|
|
from typing import NamedTuple
|
|
|
|
from vllm import LLM, EngineArgs
|
|
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
|
|
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
|
|
|
TEMPLATE_HOME = Path(__file__).parent / "template"
|
|
|
|
|
|
class RerankModelData(NamedTuple):
|
|
engine_args: EngineArgs
|
|
chat_template: str | None = None
|
|
|
|
|
|
def run_jinavl_reranker(modality: str) -> RerankModelData:
|
|
assert modality == "image"
|
|
|
|
engine_args = EngineArgs(
|
|
model="jinaai/jina-reranker-m0",
|
|
runner="pooling",
|
|
max_model_len=32768,
|
|
trust_remote_code=True,
|
|
mm_processor_kwargs={
|
|
"min_pixels": 3136,
|
|
"max_pixels": 602112,
|
|
},
|
|
limit_mm_per_prompt={modality: 1},
|
|
)
|
|
return RerankModelData(
|
|
engine_args=engine_args,
|
|
)
|
|
|
|
|
|
def run_qwen3_vl_reranker(modality: str) -> RerankModelData:
|
|
engine_args = EngineArgs(
|
|
model="Qwen/Qwen3-VL-Reranker-2B",
|
|
runner="pooling",
|
|
max_model_len=16384,
|
|
limit_mm_per_prompt={modality: 1},
|
|
# HuggingFace model configuration overrides required for compatibility
|
|
hf_overrides={
|
|
# Manually route to sequence classification architecture
|
|
# This tells vLLM to use Qwen3VLForSequenceClassification instead of
|
|
# the default Qwen3VLForConditionalGeneration
|
|
"architectures": ["Qwen3VLForSequenceClassification"],
|
|
# Specify which token logits to extract from the language model head
|
|
# The original reranker uses "no" and "yes" token logits for scoring
|
|
"classifier_from_token": ["no", "yes"],
|
|
# Enable special handling for original Qwen3-Reranker models
|
|
# This flag triggers conversion logic that transforms the two token
|
|
# vectors into a single classification vector
|
|
"is_original_qwen3_reranker": True,
|
|
},
|
|
)
|
|
chat_template_path = "qwen3_vl_reranker.jinja"
|
|
chat_template = (TEMPLATE_HOME / chat_template_path).read_text()
|
|
return RerankModelData(
|
|
engine_args=engine_args,
|
|
chat_template=chat_template,
|
|
)
|
|
|
|
|
|
model_example_map: dict[str, Callable[[str], RerankModelData]] = {
|
|
"jinavl_reranker": run_jinavl_reranker,
|
|
"qwen3_vl_reranker": run_qwen3_vl_reranker,
|
|
}
|
|
|
|
|
|
def parse_args():
|
|
parser = FlexibleArgumentParser(
|
|
description="Demo on using vLLM for offline inference with "
|
|
"vision language reranker models for multimodal scoring tasks."
|
|
)
|
|
parser.add_argument(
|
|
"--model-name",
|
|
"-m",
|
|
type=str,
|
|
default="jinavl_reranker",
|
|
choices=model_example_map.keys(),
|
|
help="The name of the reranker model.",
|
|
)
|
|
parser.add_argument(
|
|
"--modality",
|
|
type=str,
|
|
default="image",
|
|
choices=["image", "video"],
|
|
help="Modality of the multimodal input (image or video).",
|
|
)
|
|
return parser.parse_args()
|
|
|
|
|
|
def get_multi_modal_input(modality: str) -> tuple[str, ScoreMultiModalParam]:
|
|
# Sample query for testing the reranker
|
|
if modality == "image":
|
|
query = "A woman playing with her dog on a beach at sunset."
|
|
# Sample multimodal documents to be scored against the query
|
|
# Each document contains an image URL that will be fetched and processed
|
|
documents: ScoreMultiModalParam = {
|
|
"content": [
|
|
{
|
|
"type": "text",
|
|
"text": (
|
|
"A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, " # noqa: E501
|
|
"as the dog offers its paw in a heartwarming display of companionship and trust." # noqa: E501
|
|
),
|
|
},
|
|
{
|
|
"type": "image_url",
|
|
"image_url": {
|
|
"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
|
|
},
|
|
},
|
|
]
|
|
}
|
|
elif modality == "video":
|
|
query = "A girl is drawing pictures on an ipad."
|
|
# Sample video documents to be scored against the query
|
|
documents: ScoreMultiModalParam = {
|
|
"content": [
|
|
{
|
|
"type": "text",
|
|
"text": "A girl is drawing a guitar on her ipad with Apple Pencil.",
|
|
},
|
|
{
|
|
"type": "video_url",
|
|
"video_url": {
|
|
"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
|
|
},
|
|
},
|
|
]
|
|
}
|
|
else:
|
|
raise ValueError(f"Unsupported modality: {modality}")
|
|
return query, documents
|
|
|
|
|
|
def main(args: Namespace):
|
|
# Run the selected reranker model
|
|
modality = args.modality
|
|
model_request = model_example_map[args.model_name](modality)
|
|
engine_args = model_request.engine_args
|
|
|
|
llm = LLM(**asdict(engine_args))
|
|
|
|
query, documents = get_multi_modal_input(modality)
|
|
outputs = llm.score(query, documents, chat_template=model_request.chat_template)
|
|
|
|
print("-" * 50)
|
|
print(f"Model: {engine_args.model}")
|
|
print(f"Modality: {modality}")
|
|
print(f"Query: {query}")
|
|
print("Relevance scores:", [output.outputs.score for output in outputs])
|
|
print("-" * 50)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
args = parse_args()
|
|
main(args)
|