diff --git a/docs/models/supported_models.md b/docs/models/supported_models.md index 07b1ced5c..5c3668392 100644 --- a/docs/models/supported_models.md +++ b/docs/models/supported_models.md @@ -800,6 +800,7 @@ The following table lists those that are tested in vLLM. | `CLIPModel` | CLIP | T / I | `openai/clip-vit-base-patch32`, `openai/clip-vit-large-patch14`, etc. | | | | `LlavaNextForConditionalGeneration`<sup>C</sup> | LLaVA-NeXT-based | T / I | `royokong/e5-v` | | ✅︎ | | `Phi3VForCausalLM`<sup>C</sup> | Phi-3-Vision-based | T + I | `TIGER-Lab/VLM2Vec-Full` | | ✅︎ | +| `Qwen3VLForConditionalGeneration`<sup>C</sup> | Qwen3-VL | T + I + V | `Qwen/Qwen3-VL-Embedding-2B`, etc. | ✅︎ | ✅︎ | | `SiglipModel` | SigLIP, SigLIP2 | T / I | `google/siglip-base-patch16-224`, `google/siglip2-base-patch16-224` | | | | `*ForConditionalGeneration`<sup>C</sup>, `*ForCausalLM`<sup>C</sup>, etc. | Generative models | \* | N/A | \* | \* | @@ -816,10 +817,18 @@ These models primarily support the [`LLM.score`](./pooling_models.md#llmscore) A | Architecture | Models | Inputs | Example HF Models | [LoRA](../features/lora.md) | [PP](../serving/parallelism_scaling.md) | |--------------|--------|--------|-------------------|----------------------|---------------------------| | `JinaVLForSequenceClassification` | JinaVL-based | T + I<sup>E+</sup> | `jinaai/jina-reranker-m0`, etc. | ✅︎ | ✅︎ | +| `Qwen3VLForSequenceClassification` | Qwen3-VL-Reranker | T + I<sup>E+</sup> + V<sup>E+</sup> | `Qwen/Qwen3-VL-Reranker-2B`(see note), etc. | ✅︎ | ✅︎ | <sup>C</sup> Automatically converted into a classification model via `--convert classify`. ([details](./pooling_models.md#model-conversion)) \* Feature support is the same as that of the original model. +!!! note + Similar to Qwen3-Reranker, you need to use the following `--hf_overrides` to load the official original `Qwen3-VL-Reranker`. 
+ + ```bash + vllm serve Qwen/Qwen3-VL-Reranker-2B --hf_overrides '{"architectures": ["Qwen3VLForSequenceClassification"],"classifier_from_token": ["no", "yes"],"is_original_qwen3_reranker": true}' + ``` + ## Model Support Policy At vLLM, we are committed to facilitating the integration and support of third-party models within our ecosystem. Our approach is designed to balance the need for robustness and the practical limitations of supporting a wide range of models. Here’s how we manage third-party model support: diff --git a/examples/pooling/pooling/vision_language_pooling.py b/examples/pooling/pooling/vision_language_pooling.py index dda56bc34..e2149a7a6 100644 --- a/examples/pooling/pooling/vision_language_pooling.py +++ b/examples/pooling/pooling/vision_language_pooling.py @@ -133,6 +133,36 @@ def run_jinavl_reranker(query: Query) -> ModelRequestData: ) +def run_qwen3_vl(query: Query) -> ModelRequestData: + image_placeholder = "<|image_pad|>" + if query["modality"] == "text": + prompt = query["text"] + image = None + elif query["modality"] == "image": + prompt = image_placeholder + image = query["image"] + elif query["modality"] == "text+image": + text = query["text"] + prompt = f"{image_placeholder}\n{text}" + image = query["image"] + else: + modality = query["modality"] + raise ValueError(f"Unsupported query modality: '{modality}'") + + engine_args = EngineArgs( + model="Qwen/Qwen3-VL-Embedding-2B", + runner="pooling", + max_model_len=8192, + limit_mm_per_prompt={"image": 1}, + ) + + return ModelRequestData( + engine_args=engine_args, + prompt=prompt, + image=image, + ) + + def run_siglip(query: Query) -> ModelRequestData: if query["modality"] == "text": prompt = query["text"] @@ -353,6 +383,7 @@ model_example_map = { "clip": run_clip, "e5_v": run_e5_v, "jinavl_reranker": run_jinavl_reranker, + "qwen3_vl": run_qwen3_vl, "siglip": run_siglip, "vlm2vec_phi3v": run_vlm2vec_phi3v, "vlm2vec_qwen2vl": run_vlm2vec_qwen2vl, diff --git 
a/examples/pooling/score/template/qwen3_vl_reranker.jinja b/examples/pooling/score/template/qwen3_vl_reranker.jinja new file mode 100644 index 000000000..ed89f2a54 --- /dev/null +++ b/examples/pooling/score/template/qwen3_vl_reranker.jinja @@ -0,0 +1,23 @@ +<|im_start|>system +Judge whether the Document meets the requirements based on the Query and the Instruct provided. Note that the answer can only be "yes" or "no".<|im_end|> +<|im_start|>user +<Instruct>: {{ + messages + | selectattr("role", "eq", "system") + | map(attribute="content") + | first + | default("Given a search query, retrieve relevant candidates that answer the query.") +}}<Query>:{{ + messages + | selectattr("role", "eq", "query") + | map(attribute="content") + | first +}} +<Document>:{{ + messages + | selectattr("role", "eq", "document") + | map(attribute="content") + | first +}}<|im_end|> +<|im_start|>assistant + diff --git a/examples/pooling/score/vision_language_reranker.py b/examples/pooling/score/vision_language_reranker.py new file mode 100644 index 000000000..657aced98 --- /dev/null +++ b/examples/pooling/score/vision_language_reranker.py @@ -0,0 +1,172 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +""" +This example shows how to use vLLM for running offline inference with +vision language reranker models for multimodal scoring tasks. + +Vision language rerankers score the relevance between a text query and +multimodal documents (text + images/videos). 
+""" + +from argparse import Namespace +from collections.abc import Callable +from dataclasses import asdict +from pathlib import Path +from typing import NamedTuple + +from vllm import LLM, EngineArgs +from vllm.entrypoints.score_utils import ScoreMultiModalParam +from vllm.utils.argparse_utils import FlexibleArgumentParser + +TEMPLATE_HOME = Path(__file__).parent / "template" + + +class RerankModelData(NamedTuple): + engine_args: EngineArgs + chat_template: str | None = None + + +def run_jinavl_reranker(modality: str) -> RerankModelData: + assert modality == "image" + + engine_args = EngineArgs( + model="jinaai/jina-reranker-m0", + runner="pooling", + max_model_len=32768, + trust_remote_code=True, + mm_processor_kwargs={ + "min_pixels": 3136, + "max_pixels": 602112, + }, + limit_mm_per_prompt={modality: 1}, + ) + return RerankModelData( + engine_args=engine_args, + ) + + +def run_qwen3_vl_reranker(modality: str) -> RerankModelData: + engine_args = EngineArgs( + model="Qwen/Qwen3-VL-Reranker-2B", + runner="pooling", + max_model_len=16384, + limit_mm_per_prompt={modality: 1}, + # HuggingFace model configuration overrides required for compatibility + hf_overrides={ + # Manually route to sequence classification architecture + # This tells vLLM to use Qwen3VLForSequenceClassification instead of + # the default Qwen3VLForConditionalGeneration + "architectures": ["Qwen3VLForSequenceClassification"], + # Specify which token logits to extract from the language model head + # The original reranker uses "no" and "yes" token logits for scoring + "classifier_from_token": ["no", "yes"], + # Enable special handling for original Qwen3-Reranker models + # This flag triggers conversion logic that transforms the two token + # vectors into a single classification vector + "is_original_qwen3_reranker": True, + }, + ) + chat_template_path = "qwen3_vl_reranker.jinja" + chat_template = (TEMPLATE_HOME / chat_template_path).read_text() + return RerankModelData( + engine_args=engine_args, 
+ chat_template=chat_template, + ) + + +model_example_map: dict[str, Callable[[str], RerankModelData]] = { + "jinavl_reranker": run_jinavl_reranker, + "qwen3_vl_reranker": run_qwen3_vl_reranker, +} + + +def parse_args(): + parser = FlexibleArgumentParser( + description="Demo on using vLLM for offline inference with " + "vision language reranker models for multimodal scoring tasks." + ) + parser.add_argument( + "--model-name", + "-m", + type=str, + default="jinavl_reranker", + choices=model_example_map.keys(), + help="The name of the reranker model.", + ) + parser.add_argument( + "--modality", + type=str, + default="image", + choices=["image", "video"], + help="Modality of the multimodal input (image or video).", + ) + return parser.parse_args() + + +def get_multi_modal_input(modality: str) -> tuple[str, ScoreMultiModalParam]: + # Sample query for testing the reranker + if modality == "image": + query = "A woman playing with her dog on a beach at sunset." + # Sample multimodal documents to be scored against the query + # Each document contains an image URL that will be fetched and processed + documents: ScoreMultiModalParam = { + "content": [ + { + "type": "text", + "text": ( + "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, " # noqa: E501 + "as the dog offers its paw in a heartwarming display of companionship and trust." # noqa: E501 + ), + }, + { + "type": "image_url", + "image_url": { + "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" + }, + }, + ] + } + elif modality == "video": + query = "A girl is drawing pictures on an ipad." 
+ # Sample video documents to be scored against the query + documents: ScoreMultiModalParam = { + "content": [ + { + "type": "text", + "text": "A girl is drawing a guitar on her ipad with Apple Pencil.", + }, + { + "type": "video_url", + "video_url": { + "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4" + }, + }, + ] + } + else: + raise ValueError(f"Unsupported modality: {modality}") + return query, documents + + +def main(args: Namespace): + # Run the selected reranker model + modality = args.modality + model_request = model_example_map[args.model_name](modality) + engine_args = model_request.engine_args + + llm = LLM(**asdict(engine_args)) + + query, documents = get_multi_modal_input(modality) + outputs = llm.score(query, documents, chat_template=model_request.chat_template) + + print("-" * 50) + print(f"Model: {engine_args.model}") + print(f"Modality: {modality}") + print(f"Query: {query}") + print("Relevance scores:", [output.outputs.score for output in outputs]) + print("-" * 50) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/tests/models/registry.py b/tests/models/registry.py index 570bcc734..9778678b3 100644 --- a/tests/models/registry.py +++ b/tests/models/registry.py @@ -612,6 +612,15 @@ _AUTOMATIC_CONVERTED_MODELS = { "tomaarsen/Qwen3-Reranker-0.6B-seq-cls" ), "Qwen3ForTokenClassification": _HfExamplesInfo("bd2lcco/Qwen3-0.6B-finetuned"), + "Qwen3VLForSequenceClassification": _HfExamplesInfo( + "Qwen/Qwen3-VL-Reranker-2B", + is_available_online=False, + hf_overrides={ + "architectures": ["Qwen3VLForSequenceClassification"], + "classifier_from_token": ["no", "yes"], + "is_original_qwen3_reranker": True, + }, + ), } _MULTIMODAL_EXAMPLE_MODELS = { diff --git a/vllm/entrypoints/score_utils.py b/vllm/entrypoints/score_utils.py index a3837d9d3..09ef8781b 100644 --- a/vllm/entrypoints/score_utils.py +++ b/vllm/entrypoints/score_utils.py @@ -11,6 +11,7 @@ from vllm.entrypoints.chat_utils import ( 
ChatCompletionContentPartImageEmbedsParam, ChatCompletionContentPartImageParam, ChatCompletionContentPartTextParam, + ChatCompletionContentPartVideoParam, ChatTemplateResolutionError, MultiModalItemTracker, _ContentPart, @@ -27,6 +28,7 @@ ScoreContentPartParam: TypeAlias = ( ChatCompletionContentPartImageParam | ChatCompletionContentPartImageEmbedsParam | ChatCompletionContentPartTextParam + | ChatCompletionContentPartVideoParam ) diff --git a/vllm/model_executor/models/adapters.py b/vllm/model_executor/models/adapters.py index 43303aa76..07fa72561 100644 --- a/vllm/model_executor/models/adapters.py +++ b/vllm/model_executor/models/adapters.py @@ -333,9 +333,14 @@ def as_seq_cls_model(cls: _T) -> _T: ) def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]): - text_config = self.config.get_text_config() - tokens = getattr(text_config, "classifier_from_token", None) - method = getattr(text_config, "method", None) + hf_config = self.config + text_config = hf_config.get_text_config() + tokens = getattr( + hf_config, + "classifier_from_token", + getattr(text_config, "classifier_from_token", None), + ) + method = getattr(hf_config, "method", getattr(text_config, "method", None)) def auto_set_score_bias(weights): for name, weight in weights: @@ -366,9 +371,14 @@ def as_seq_cls_model(cls: _T) -> _T: class SequenceClassificationConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_config(vllm_config: "VllmConfig") -> None: - text_config = vllm_config.model_config.hf_config.get_text_config() - method = getattr(text_config, "method", None) - tokens = getattr(text_config, "classifier_from_token", None) + hf_config = vllm_config.model_config.hf_config + text_config = hf_config.get_text_config() + method = getattr(hf_config, "method", getattr(text_config, "method", None)) + tokens = getattr( + hf_config, + "classifier_from_token", + getattr(text_config, "classifier_from_token", None), + ) if method is None: return @@ -378,8 +388,10 @@ class 
SequenceClassificationConfig(VerifyAndUpdateConfig): if method == "from_2_way_softmax": assert len(tokens) == 2 + hf_config.num_labels = 1 text_config.num_labels = 1 else: + hf_config.num_labels = len(tokens) text_config.num_labels = len(tokens) # `llm as reranker` defaults to not using separating token. @@ -396,9 +408,14 @@ def load_weights_using_from_2_way_softmax( model_config = model.vllm_config.model_config quant_config = model.vllm_config.quant_config - text_config = model.config.get_text_config() + hf_config = model.config + text_config = hf_config.get_text_config() - tokens = getattr(text_config, "classifier_from_token", []) + tokens = getattr( + hf_config, + "classifier_from_token", + getattr(text_config, "classifier_from_token", []), + ) tokens = cast(list[int], tokens) assert len(tokens) == 2 @@ -409,10 +426,15 @@ def load_weights_using_from_2_way_softmax( # embed_tokens is the assumed name for input embeddings. If the model does not # have this attribute, we fall back to get_input_embeddings(), which is used by # the Transformers modeling backend. 
+ text_backbone = ( + model.get_language_model().model + if hasattr(model, "get_language_model") + else model.model + ) embed_tokens = ( - model.model.embed_tokens - if hasattr(model.model, "embed_tokens") - else model.model.get_input_embeddings() + text_backbone.embed_tokens + if hasattr(text_backbone, "embed_tokens") + else text_backbone.get_input_embeddings() ) model.lm_head = model.lm_head.tie_weights(embed_tokens) @@ -516,8 +538,9 @@ def seq_cls_model_loader(model, weights: Iterable[tuple[str, torch.Tensor]]): # - GemmaForCausalLM # - bge-reranker-v2-gemma - text_config = model.vllm_config.model_config.hf_config.get_text_config() - method = getattr(text_config, "method", None) + hf_config = model.vllm_config.model_config.hf_config + text_config = hf_config.get_text_config() + method = getattr(hf_config, "method", getattr(text_config, "method", None)) assert method in SEQ_CLS_LOAD_METHODS, f"method {method} not supported" return SEQ_CLS_LOAD_METHODS[method](model, weights) diff --git a/vllm/model_executor/models/config.py b/vllm/model_executor/models/config.py index 362c194d8..9ef038d84 100644 --- a/vllm/model_executor/models/config.py +++ b/vllm/model_executor/models/config.py @@ -256,6 +256,10 @@ class Qwen3ForSequenceClassificationConfig(VerifyAndUpdateConfig): model_config.hf_config.method = "from_2_way_softmax" +class Qwen3VLForSequenceClassificationConfig(Qwen3ForSequenceClassificationConfig): + pass + + class JinaVLForSequenceClassificationConfig(VerifyAndUpdateConfig): @staticmethod def verify_and_update_model_config(model_config: "ModelConfig") -> None: @@ -551,6 +555,7 @@ MODELS_CONFIG_MAP: dict[str, type[VerifyAndUpdateConfig]] = { "Qwen2ForProcessRewardModel": Qwen2ForProcessRewardModelConfig, "Qwen2ForRewardModel": Qwen2ForRewardModelConfig, "Qwen3ForSequenceClassification": Qwen3ForSequenceClassificationConfig, + "Qwen3VLForSequenceClassification": Qwen3VLForSequenceClassificationConfig, "XLMRobertaModel": JinaRobertaModelConfig, 
"JinaVLForRanking": JinaVLForSequenceClassificationConfig, "JambaForSequenceClassification": JambaForSequenceClassificationConfig,