From 22b64948f6f4381bce7ac8ec0487020f0129e1cb Mon Sep 17 00:00:00 2001 From: "wang.yuqi" Date: Mon, 9 Feb 2026 14:42:38 +0800 Subject: [PATCH] [Frontend][last/5] Make pooling entrypoints request schema consensus. (#31127) Signed-off-by: wang.yuqi --- .buildkite/test-amd.yaml | 2 +- .buildkite/test-pipeline.yaml | 2 +- .buildkite/test_areas/misc.yaml | 2 +- docs/features/multimodal_inputs.md | 2 +- docs/serving/openai_compatible_server.md | 60 +-- .../classify/vision_classification_online.py | 110 +++++ .../embed/template/dse_qwen2_vl.jinja} | 0 .../embed/template/vlm2vec_phi3v.jinja} | 0 .../embed/template/vlm2vec_qwen2vl.jinja} | 0 .../pooling/embed/vision_embedding_offline.py | 249 +++++++++- .../pooling/embed/vision_embedding_online.py | 11 +- .../pooling/vision_language_pooling.py | 441 ------------------ .../pooling/score/vision_rerank_api_online.py | 14 + .../pooling/score/vision_reranker_offline.py | 159 ++++--- .../pooling/score/vision_score_api_online.py | 14 + .../pooling/classify/test_offline.py | 57 ++- .../pooling/embed/test_online_vision.py | 83 +++- tests/renderers/test_hf.py | 6 +- vllm/entrypoints/pooling/base/protocol.py | 15 + vllm/entrypoints/pooling/classify/protocol.py | 8 +- vllm/entrypoints/pooling/embed/protocol.py | 7 +- vllm/entrypoints/pooling/pooling/protocol.py | 7 +- vllm/entrypoints/pooling/score/protocol.py | 16 +- vllm/utils/print_utils.py | 7 + 24 files changed, 659 insertions(+), 613 deletions(-) create mode 100644 examples/pooling/classify/vision_classification_online.py rename examples/{template_dse_qwen2_vl.jinja => pooling/embed/template/dse_qwen2_vl.jinja} (100%) rename examples/{template_vlm2vec_phi3v.jinja => pooling/embed/template/vlm2vec_phi3v.jinja} (100%) rename examples/{template_vlm2vec_qwen2vl.jinja => pooling/embed/template/vlm2vec_qwen2vl.jinja} (100%) delete mode 100644 examples/pooling/pooling/vision_language_pooling.py create mode 100644 vllm/utils/print_utils.py diff --git a/.buildkite/test-amd.yaml 
b/.buildkite/test-amd.yaml index 587bc4443..e78cdd7f8 100644 --- a/.buildkite/test-amd.yaml +++ b/.buildkite/test-amd.yaml @@ -514,7 +514,7 @@ steps: - python3 offline_inference/vision_language_multi_image.py --seed 0 - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 # for pooling models - - python3 pooling/pooling/vision_language_pooling.py --seed 0 + - python3 pooling/embed/vision_embedding_offline.py --seed 0 # for features demo - python3 offline_inference/prefix_caching.py - python3 offline_inference/llm_engine_example.py diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml index 1e28f520d..73d4cf80c 100644 --- a/.buildkite/test-pipeline.yaml +++ b/.buildkite/test-pipeline.yaml @@ -453,7 +453,7 @@ steps: - python3 offline_inference/vision_language_multi_image.py --seed 0 - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 # for pooling models - - python3 pooling/pooling/vision_language_pooling.py --seed 0 + - python3 pooling/embed/vision_embedding_offline.py --seed 0 # for features demo - python3 offline_inference/prefix_caching.py - python3 offline_inference/llm_engine_example.py diff --git a/.buildkite/test_areas/misc.yaml b/.buildkite/test_areas/misc.yaml index a01c2296f..1e9318796 100644 --- a/.buildkite/test_areas/misc.yaml +++ b/.buildkite/test_areas/misc.yaml @@ -72,7 +72,7 @@ steps: - python3 offline_inference/vision_language_multi_image.py --seed 0 - python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0 # for pooling models - - python3 pooling/pooling/vision_language_pooling.py --seed 0 + - python3 pooling/embed/vision_embedding_offline.py --seed 0 # for features demo - python3 offline_inference/prefix_caching.py - python3 offline_inference/llm_engine_example.py diff --git a/docs/features/multimodal_inputs.md b/docs/features/multimodal_inputs.md index 264fd8c48..3c1028929 100644 --- a/docs/features/multimodal_inputs.md +++ 
b/docs/features/multimodal_inputs.md @@ -510,7 +510,7 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument. For certain models, we provide alternative chat templates inside [examples](../../examples). - For example, VLM2Vec uses [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja) which is different from the default one for Phi-3-Vision. + For example, VLM2Vec uses [examples/pooling/embed/template/vlm2vec_phi3v.jinja](../../examples/pooling/embed/template/vlm2vec_phi3v.jinja) which is different from the default one for Phi-3-Vision. ### Image Inputs diff --git a/docs/serving/openai_compatible_server.md b/docs/serving/openai_compatible_server.md index bdd1784c8..97ed7d45f 100644 --- a/docs/serving/openai_compatible_server.md +++ b/docs/serving/openai_compatible_server.md @@ -311,7 +311,7 @@ and passing a list of `messages` in the request. Refer to the examples below for vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling \ --trust-remote-code \ --max-model-len 4096 \ - --chat-template examples/template_vlm2vec_phi3v.jinja + --chat-template examples/pooling/embed/template/vlm2vec_phi3v.jinja ``` !!! important @@ -319,7 +319,7 @@ and passing a list of `messages` in the request. Refer to the examples below for to run this model in embedding mode instead of text generation mode. 
The custom chat template is completely different from the original one for this model, - and can be found here: [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja) + and can be found here: [examples/pooling/embed/template/vlm2vec_phi3v.jinja](../../examples/pooling/embed/template/vlm2vec_phi3v.jinja) Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library: @@ -359,14 +359,14 @@ and passing a list of `messages` in the request. Refer to the examples below for vllm serve MrLight/dse-qwen2-2b-mrl-v1 --runner pooling \ --trust-remote-code \ --max-model-len 8192 \ - --chat-template examples/template_dse_qwen2_vl.jinja + --chat-template examples/pooling/embed/template/dse_qwen2_vl.jinja ``` !!! important Like with VLM2Vec, we have to explicitly pass `--runner pooling`. Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled - by a custom chat template: [examples/template_dse_qwen2_vl.jinja](../../examples/template_dse_qwen2_vl.jinja) + by a custom chat template: [examples/pooling/embed/template/dse_qwen2_vl.jinja](../../examples/pooling/embed/template/dse_qwen2_vl.jinja) !!! important `MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code @@ -532,7 +532,7 @@ The following [sampling parameters](../api/README.md#inference-parameters) are s ??? code ```python - --8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params" + --8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:transcription-sampling-params" ``` The following extra parameters are supported: @@ -540,7 +540,7 @@ The following extra parameters are supported: ??? 
code ```python - --8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params" + --8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:transcription-extra-params" ``` ### Translations API @@ -560,13 +560,13 @@ Code example: [examples/online_serving/openai_translation_client.py](../../examp The following [sampling parameters](../api/README.md#inference-parameters) are supported. ```python ---8<-- "vllm/entrypoints/openai/protocol.py:translation-sampling-params" +--8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:translation-sampling-params" ``` The following extra parameters are supported: ```python ---8<-- "vllm/entrypoints/openai/protocol.py:translation-extra-params" +--8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:translation-extra-params" ``` ### Realtime API @@ -954,28 +954,34 @@ You can pass multi-modal inputs to scoring models by passing `content` including ```python import requests - + response = requests.post( "http://localhost:8000/v1/score", json={ "model": "jinaai/jina-reranker-m0", "queries": "slm markdown", - "documents": { - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png" - }, - }, - { - "type": "image_url", - "image_url": { - "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" - }, - }, - ], - }, + "documents": [ + { + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png" + }, + } + ], + }, + { + "content": [ + { + "type": "image_url", + "image_url": { + "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" + }, + } + ] + }, + ], }, ) response.raise_for_status() @@ -1001,7 +1007,6 @@ The following Score API parameters are supported: ```python --8<--
"vllm/entrypoints/pooling/base/protocol.py:pooling-common-params" ---8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params" ``` The following extra parameters are supported: @@ -1009,7 +1014,6 @@ The following extra parameters are supported: ```python --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params" --8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params" ---8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params" ``` ### Re-rank API @@ -1092,7 +1096,6 @@ The following Re-rank API parameters are supported: ```python --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params" --8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params" ---8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params" ``` The following extra parameters are supported: @@ -1100,7 +1103,6 @@ The following extra parameters are supported: ```python --8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params" --8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params" ---8<-- "vllm/entrypoints/pooling/score/protocol.py:rerank-extra-params" ``` ## Ray Serve LLM diff --git a/examples/pooling/classify/vision_classification_online.py b/examples/pooling/classify/vision_classification_online.py new file mode 100644 index 000000000..64dc5d4ae --- /dev/null +++ b/examples/pooling/classify/vision_classification_online.py @@ -0,0 +1,110 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# ruff: noqa: E501 +"""Example Python client for multimodal classification API using vLLM API server +NOTE: + start a supported multimodal classification model server with `vllm serve`, e.g. 
+ vllm serve muziyongshixin/Qwen2.5-VL-7B-for-VideoCls \ + --runner pooling \ + --max-model-len 5000 \ + --limit-mm-per-prompt '{"video": 1}' \ + --hf-overrides '{"text_config": {"architectures": ["Qwen2_5_VLForSequenceClassification"]}}' +""" + +import argparse +import pprint + +import requests + +from vllm.multimodal.utils import encode_image_url, fetch_image + +input_text = "This product was excellent and exceeded my expectations" +image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg" +image_base64 = {"url": encode_image_url(fetch_image(image_url))} +video_url = "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4" + + +def parse_args(): + parse = argparse.ArgumentParser() + parse.add_argument("--host", type=str, default="localhost") + parse.add_argument("--port", type=int, default=8000) + return parse.parse_args() + + +def main(args): + base_url = f"http://{args.host}:{args.port}" + models_url = base_url + "/v1/models" + classify_url = base_url + "/classify" + + response = requests.get(models_url) + model_name = response.json()["data"][0]["id"] + + print("Text classification output:") + messages = [ + { + "role": "assistant", + "content": "Please classify this text request.", + }, + { + "role": "user", + "content": input_text, + }, + ] + response = requests.post( + classify_url, + json={"model": model_name, "messages": messages}, + ) + pprint.pprint(response.json()) + + print("Image url classification output:") + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Please classify this image."}, + {"type": "image_url", "image_url": {"url": image_url}}, + ], + } + ] + response = requests.post( + classify_url, + json={"model": model_name, "messages": messages}, + ) + pprint.pprint(response.json()) + + print("Image base64 classification output:") + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Please classify this 
image."}, + {"type": "image_url", "image_url": image_base64}, + ], + } + ] + response = requests.post( + classify_url, + json={"model": model_name, "messages": messages}, + ) + pprint.pprint(response.json()) + + print("Video url classification output:") + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Please classify this video."}, + {"type": "video_url", "video_url": {"url": video_url}}, + ], + } + ] + response = requests.post( + classify_url, + json={"model": model_name, "messages": messages}, + ) + pprint.pprint(response.json()) + + +if __name__ == "__main__": + args = parse_args() + main(args) diff --git a/examples/template_dse_qwen2_vl.jinja b/examples/pooling/embed/template/dse_qwen2_vl.jinja similarity index 100% rename from examples/template_dse_qwen2_vl.jinja rename to examples/pooling/embed/template/dse_qwen2_vl.jinja diff --git a/examples/template_vlm2vec_phi3v.jinja b/examples/pooling/embed/template/vlm2vec_phi3v.jinja similarity index 100% rename from examples/template_vlm2vec_phi3v.jinja rename to examples/pooling/embed/template/vlm2vec_phi3v.jinja diff --git a/examples/template_vlm2vec_qwen2vl.jinja b/examples/pooling/embed/template/vlm2vec_qwen2vl.jinja similarity index 100% rename from examples/template_vlm2vec_qwen2vl.jinja rename to examples/pooling/embed/template/vlm2vec_qwen2vl.jinja diff --git a/examples/pooling/embed/vision_embedding_offline.py b/examples/pooling/embed/vision_embedding_offline.py index cfce047dc..a5f0d35af 100644 --- a/examples/pooling/embed/vision_embedding_offline.py +++ b/examples/pooling/embed/vision_embedding_offline.py @@ -11,23 +11,79 @@ on HuggingFace model repository. 
import argparse from dataclasses import asdict +from pathlib import Path from PIL.Image import Image from vllm import LLM, EngineArgs from vllm.multimodal.utils import fetch_image +from vllm.utils.print_utils import print_embeddings + +ROOT_DIR = Path(__file__).parent.parent.parent +EMBED_TEMPLATE_DIR = ROOT_DIR / "pooling/embed/template/" image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg" text = "A cat standing in the snow." multi_modal_data = {"image": fetch_image(image_url)} -def print_embeddings(embeds: list[float]): - embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds - print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})") +def run_clip(seed: int): + engine_args = EngineArgs( + model="openai/clip-vit-base-patch32", + runner="pooling", + limit_mm_per_prompt={"image": 1}, + ) + + llm = LLM(**asdict(engine_args) | {"seed": seed}) + + print("Text embedding output:") + outputs = llm.embed(text, use_tqdm=False) + print_embeddings(outputs[0].outputs.embedding) + + print("Image embedding output:") + prompt = "" # For image input, make sure that the prompt text is empty + outputs = llm.embed( + { + "prompt": prompt, + "multi_modal_data": multi_modal_data, + }, + use_tqdm=False, + ) + print_embeddings(outputs[0].outputs.embedding) -def run_qwen3_vl(): +def run_e5_v(seed: int): + engine_args = EngineArgs( + model="royokong/e5-v", + runner="pooling", + max_model_len=4096, + limit_mm_per_prompt={"image": 1}, + ) + + llm = LLM(**asdict(engine_args) | {"seed": seed}) + + llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501 + + print("Text embedding output:") + prompt_text = llama3_template.format( + f"{text}\nSummary above sentence in one word: " + ) + outputs = llm.embed(prompt_text, use_tqdm=False) + print_embeddings(outputs[0].outputs.embedding) + + print("Image embedding output:") + prompt_image = 
llama3_template.format("\nSummary above image in one word: ") + outputs = llm.embed( + { + "prompt": prompt_image, + "multi_modal_data": multi_modal_data, + }, + use_tqdm=False, + ) + print_embeddings(outputs[0].outputs.embedding) + + +def run_qwen3_vl(seed: int): try: from qwen_vl_utils import smart_resize except ModuleNotFoundError: @@ -61,20 +117,20 @@ def run_qwen3_vl(): ) default_instruction = "Represent the user's input." image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>" - text_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n" - image_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}<|im_end|>\n<|im_start|>assistant\n" - image_text_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}{text}<|im_end|>\n<|im_start|>assistant\n" + prompt_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n" + prompt_image = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}<|im_end|>\n<|im_start|>assistant\n" + prompt_image_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}{text}<|im_end|>\n<|im_start|>assistant\n" - llm = LLM(**asdict(engine_args)) + llm = LLM(**asdict(engine_args) | {"seed": seed}) print("Text embedding output:") - outputs = llm.embed(text_prompt, use_tqdm=False) + outputs = llm.embed(prompt_text, use_tqdm=False) print_embeddings(outputs[0].outputs.embedding) print("Image embedding output:") outputs = llm.embed( { - "prompt": image_prompt, + "prompt": prompt_image, "multi_modal_data": multi_modal_data, }, use_tqdm=False, @@ -84,7 +140,162 @@ def run_qwen3_vl(): print("Image+Text embedding output:") outputs = llm.embed( { - "prompt": image_text_prompt, + "prompt": prompt_image_text, + "multi_modal_data": 
multi_modal_data, + }, + use_tqdm=False, + ) + print_embeddings(outputs[0].outputs.embedding) + + +def run_siglip(seed: int): + engine_args = EngineArgs( + model="google/siglip-base-patch16-224", + runner="pooling", + limit_mm_per_prompt={"image": 1}, + ) + + llm = LLM(**asdict(engine_args) | {"seed": seed}) + + print("Text embedding output:") + outputs = llm.embed(text, use_tqdm=False) + print_embeddings(outputs[0].outputs.embedding) + + print("Image embedding output:") + prompt = "" # For image input, make sure that the prompt text is empty + outputs = llm.embed( + { + "prompt": prompt, + "multi_modal_data": multi_modal_data, + }, + use_tqdm=False, + ) + print_embeddings(outputs[0].outputs.embedding) + + +def run_vlm2vec_phi3v(seed: int): + engine_args = EngineArgs( + model="TIGER-Lab/VLM2Vec-Full", + runner="pooling", + max_model_len=4096, + trust_remote_code=True, + mm_processor_kwargs={"num_crops": 4}, + limit_mm_per_prompt={"image": 1}, + ) + + llm = LLM(**asdict(engine_args) | {"seed": seed}) + image_token = "<|image_1|>" + + print("Text embedding output:") + prompt_text = f"Find me an everyday image that matches the given caption: {text}" + outputs = llm.embed(prompt_text, use_tqdm=False) + print_embeddings(outputs[0].outputs.embedding) + + print("Image embedding output:") + prompt_image = f"{image_token} Find a day-to-day image that looks similar to the provided image." 
# noqa: E501 + outputs = llm.embed( + { + "prompt": prompt_image, + "multi_modal_data": multi_modal_data, + }, + use_tqdm=False, + ) + print_embeddings(outputs[0].outputs.embedding) + + print("Image+Text embedding output:") + prompt_image_text = ( + f"{image_token} Represent the given image with the following question: {text}" # noqa: E501 + ) + outputs = llm.embed( + { + "prompt": prompt_image_text, + "multi_modal_data": multi_modal_data, + }, + use_tqdm=False, + ) + print_embeddings(outputs[0].outputs.embedding) + + +def run_vlm2vec_qwen2vl(seed: int): + # vLLM does not support LoRA adapters on multi-modal encoder, + # so we merge the weights first + from huggingface_hub.constants import HF_HUB_CACHE + from peft import PeftConfig, PeftModel + from transformers import AutoModelForImageTextToText, AutoProcessor + + from vllm.entrypoints.chat_utils import load_chat_template + + model_id = "TIGER-Lab/VLM2Vec-Qwen2VL-2B" + + base_model = AutoModelForImageTextToText.from_pretrained(model_id) + lora_model = PeftModel.from_pretrained( + base_model, + model_id, + config=PeftConfig.from_pretrained(model_id), + ) + model = lora_model.merge_and_unload().to(dtype=base_model.dtype) + model._hf_peft_config_loaded = False # Needed to save the merged model + + processor = AutoProcessor.from_pretrained( + model_id, + # `min_pixels` and `max_pixels` are deprecated for + # transformers `preprocessor_config.json` + size={"shortest_edge": 3136, "longest_edge": 12845056}, + ) + processor.chat_template = load_chat_template( + # The original chat template is not correct + EMBED_TEMPLATE_DIR / "vlm2vec_qwen2vl.jinja", + ) + + merged_path = str( + Path(HF_HUB_CACHE) / ("models--" + model_id.replace("/", "--") + "-vllm") + ) + print(f"Saving merged model to {merged_path}...") + print( + "NOTE: This directory is not tracked by `huggingface_hub` " + "so you have to delete this manually if you don't want it anymore." 
+ ) + model.save_pretrained(merged_path) + processor.save_pretrained(merged_path) + print("Done!") + + engine_args = EngineArgs( + model=merged_path, + runner="pooling", + max_model_len=4096, + mm_processor_kwargs={ + "min_pixels": 3136, + "max_pixels": 12845056, + }, + limit_mm_per_prompt={"image": 1}, + ) + + llm = LLM(**asdict(engine_args) | {"seed": seed}) + image_token = "<|image_pad|>" + + print("Text embedding output:") + prompt_text = f"Find me an everyday image that matches the given caption: {text}" + outputs = llm.embed(prompt_text, use_tqdm=False) + print_embeddings(outputs[0].outputs.embedding) + + print("Image embedding output:") + prompt_image = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501 + outputs = llm.embed( + { + "prompt": prompt_image, + "multi_modal_data": multi_modal_data, + }, + use_tqdm=False, + ) + print_embeddings(outputs[0].outputs.embedding) + + print("Image+Text embedding output:") + prompt_image_text = ( + f"{image_token} Represent the given image with the following question: {text}" # noqa: E501 + ) + outputs = llm.embed( + { + "prompt": prompt_image_text, "multi_modal_data": multi_modal_data, }, use_tqdm=False, @@ -93,7 +304,12 @@ def run_qwen3_vl(): model_example_map = { + "clip": run_clip, + "e5_v": run_e5_v, "qwen3_vl": run_qwen3_vl, + "siglip": run_siglip, + "vlm2vec_phi3v": run_vlm2vec_phi3v, + "vlm2vec_qwen2vl": run_vlm2vec_qwen2vl, } @@ -103,16 +319,23 @@ def parse_args(): ) parser.add_argument( "--model", + "-m", type=str, + default="vlm2vec_phi3v", choices=model_example_map.keys(), - required=True, help="The name of the embedding model.", ) + parser.add_argument( + "--seed", + type=int, + default=0, + help="Set the seed when initializing `vllm.LLM`.", + ) return parser.parse_args() def main(args): - model_example_map[args.model]() + model_example_map[args.model](args.seed) if __name__ == "__main__": diff --git a/examples/pooling/embed/vision_embedding_online.py 
b/examples/pooling/embed/vision_embedding_online.py index 66c824739..522ce1fcb 100644 --- a/examples/pooling/embed/vision_embedding_online.py +++ b/examples/pooling/embed/vision_embedding_online.py @@ -17,6 +17,8 @@ from openai.types.chat import ChatCompletionMessageParam from openai.types.create_embedding_response import CreateEmbeddingResponse from PIL import Image +from vllm.utils.print_utils import print_embeddings + # Modify OpenAI's API key and API base to use vLLM's API server. openai_api_key = "EMPTY" openai_api_base = "http://localhost:8000/v1" @@ -51,11 +53,6 @@ def create_chat_embeddings( ) -def print_embeddings(embeds): - embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds - print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})") - - def run_clip(client: OpenAI, model: str): """ Start the server using: @@ -105,7 +102,7 @@ def run_dse_qwen2_vl(client: OpenAI, model: str): --runner pooling \ --trust-remote-code \ --max-model-len 8192 \ - --chat-template examples/template_dse_qwen2_vl.jinja + --chat-template examples/pooling/embed/template/dse_qwen2_vl.jinja """ response = create_chat_embeddings( client, @@ -316,7 +313,7 @@ def run_vlm2vec(client: OpenAI, model: str): --runner pooling \ --trust-remote-code \ --max-model-len 4096 \ - --chat-template examples/template_vlm2vec_phi3v.jinja + --chat-template examples/pooling/embed/template/vlm2vec_phi3v.jinja """ response = create_chat_embeddings( diff --git a/examples/pooling/pooling/vision_language_pooling.py b/examples/pooling/pooling/vision_language_pooling.py deleted file mode 100644 index 3954cf88d..000000000 --- a/examples/pooling/pooling/vision_language_pooling.py +++ /dev/null @@ -1,441 +0,0 @@ -# SPDX-License-Identifier: Apache-2.0 -# SPDX-FileCopyrightText: Copyright contributors to the vLLM project -""" -This example shows how to use vLLM for running offline inference with -the correct prompt format on vision language models for multimodal pooling. 
- -For most models, the prompt format should follow corresponding examples -on HuggingFace model repository. -""" - -from argparse import Namespace -from dataclasses import asdict -from pathlib import Path -from typing import Literal, NamedTuple, TypeAlias, TypedDict, get_args - -from PIL.Image import Image - -from vllm import LLM, EngineArgs -from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam -from vllm.multimodal.utils import fetch_image -from vllm.utils.argparse_utils import FlexibleArgumentParser - -ROOT_DIR = Path(__file__).parent.parent.parent -EXAMPLES_DIR = ROOT_DIR / "examples" - - -class TextQuery(TypedDict): - modality: Literal["text"] - text: str - - -class ImageQuery(TypedDict): - modality: Literal["image"] - image: Image - - -class TextImageQuery(TypedDict): - modality: Literal["text+image"] - text: str - image: Image - - -class TextImagesQuery(TypedDict): - modality: Literal["text+images"] - text: str - image: ScoreMultiModalParam - - -QueryModality = Literal["text", "image", "text+image", "text+images"] -Query: TypeAlias = TextQuery | ImageQuery | TextImageQuery | TextImagesQuery - - -class ModelRequestData(NamedTuple): - engine_args: EngineArgs - prompt: str | None = None - image: Image | None = None - query: str | None = None - documents: ScoreMultiModalParam | None = None - - -def run_clip(query: Query) -> ModelRequestData: - if query["modality"] == "text": - prompt = query["text"] - image = None - elif query["modality"] == "image": - prompt = "" # For image input, make sure that the prompt text is empty - image = query["image"] - else: - modality = query["modality"] - raise ValueError(f"Unsupported query modality: '{modality}'") - - engine_args = EngineArgs( - model="openai/clip-vit-base-patch32", - runner="pooling", - limit_mm_per_prompt={"image": 1}, - ) - - return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image=image, - ) - - -def run_e5_v(query: Query) -> ModelRequestData: - llama3_template = 
"<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501 - - if query["modality"] == "text": - text = query["text"] - prompt = llama3_template.format(f"{text}\nSummary above sentence in one word: ") - image = None - elif query["modality"] == "image": - prompt = llama3_template.format("\nSummary above image in one word: ") - image = query["image"] - else: - modality = query["modality"] - raise ValueError(f"Unsupported query modality: '{modality}'") - - engine_args = EngineArgs( - model="royokong/e5-v", - runner="pooling", - max_model_len=4096, - limit_mm_per_prompt={"image": 1}, - ) - - return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image=image, - ) - - -def run_jinavl_reranker(query: Query) -> ModelRequestData: - if query["modality"] != "text+images": - raise ValueError(f"Unsupported query modality: '{query['modality']}'") - - engine_args = EngineArgs( - model="jinaai/jina-reranker-m0", - runner="pooling", - max_model_len=32768, - trust_remote_code=True, - mm_processor_kwargs={ - "min_pixels": 3136, - "max_pixels": 602112, - }, - limit_mm_per_prompt={"image": 1}, - ) - - return ModelRequestData( - engine_args=engine_args, - query=query["text"], - documents=query["image"], - ) - - -def run_qwen3_vl(query: Query) -> ModelRequestData: - image_placeholder = "<|image_pad|>" - if query["modality"] == "text": - prompt = query["text"] - image = None - elif query["modality"] == "image": - prompt = image_placeholder - image = query["image"] - elif query["modality"] == "text+image": - text = query["text"] - prompt = f"{image_placeholder}\n{text}" - image = query["image"] - else: - modality = query["modality"] - raise ValueError(f"Unsupported query modality: '{modality}'") - - engine_args = EngineArgs( - model="Qwen/Qwen3-VL-Embedding-2B", - runner="pooling", - max_model_len=8192, - limit_mm_per_prompt={"image": 1}, - ) - - return ModelRequestData( - engine_args=engine_args, - 
prompt=prompt, - image=image, - ) - - -def run_siglip(query: Query) -> ModelRequestData: - if query["modality"] == "text": - prompt = query["text"] - image = None - elif query["modality"] == "image": - prompt = "" # For image input, make sure that the prompt text is empty - image = query["image"] - else: - modality = query["modality"] - raise ValueError(f"Unsupported query modality: '{modality}'") - - engine_args = EngineArgs( - model="google/siglip-base-patch16-224", - runner="pooling", - limit_mm_per_prompt={"image": 1}, - ) - - return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image=image, - ) - - -def _get_vlm2vec_prompt_image(query: Query, image_token: str): - if query["modality"] == "text": - text = query["text"] - prompt = f"Find me an everyday image that matches the given caption: {text}" - image = None - elif query["modality"] == "image": - prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501 - image = query["image"] - elif query["modality"] == "text+image": - text = query["text"] - prompt = f"{image_token} Represent the given image with the following question: {text}" # noqa: E501 - image = query["image"] - else: - modality = query["modality"] - raise ValueError(f"Unsupported query modality: {modality!r}") - - return prompt, image - - -def run_vlm2vec_phi3v(query: Query) -> ModelRequestData: - prompt, image = _get_vlm2vec_prompt_image(query, "<|image_1|>") - - engine_args = EngineArgs( - model="TIGER-Lab/VLM2Vec-Full", - runner="pooling", - max_model_len=4096, - trust_remote_code=True, - mm_processor_kwargs={"num_crops": 4}, - limit_mm_per_prompt={"image": 1}, - ) - - return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image=image, - ) - - -def run_vlm2vec_qwen2vl(query: Query) -> ModelRequestData: - # vLLM does not support LoRA adapters on multi-modal encoder, - # so we merge the weights first - from huggingface_hub.constants import HF_HUB_CACHE - from peft import 
PeftConfig, PeftModel - from transformers import AutoModelForImageTextToText, AutoProcessor - - from vllm.entrypoints.chat_utils import load_chat_template - - model_id = "TIGER-Lab/VLM2Vec-Qwen2VL-2B" - - base_model = AutoModelForImageTextToText.from_pretrained(model_id) - lora_model = PeftModel.from_pretrained( - base_model, - model_id, - config=PeftConfig.from_pretrained(model_id), - ) - model = lora_model.merge_and_unload().to(dtype=base_model.dtype) - model._hf_peft_config_loaded = False # Needed to save the merged model - - processor = AutoProcessor.from_pretrained( - model_id, - # `min_pixels` and `max_pixels` are deprecated for - # transformers `preprocessor_config.json` - size={"shortest_edge": 3136, "longest_edge": 12845056}, - ) - processor.chat_template = load_chat_template( - # The original chat template is not correct - EXAMPLES_DIR / "template_vlm2vec_qwen2vl.jinja", - ) - - merged_path = str( - Path(HF_HUB_CACHE) / ("models--" + model_id.replace("/", "--") + "-vllm") - ) - print(f"Saving merged model to {merged_path}...") - print( - "NOTE: This directory is not tracked by `huggingface_hub` " - "so you have to delete this manually if you don't want it anymore." 
- ) - model.save_pretrained(merged_path) - processor.save_pretrained(merged_path) - print("Done!") - - prompt, image = _get_vlm2vec_prompt_image(query, "<|image_pad|>") - - engine_args = EngineArgs( - model=merged_path, - runner="pooling", - max_model_len=4096, - mm_processor_kwargs={ - "min_pixels": 3136, - "max_pixels": 12845056, - }, - limit_mm_per_prompt={"image": 1}, - ) - - return ModelRequestData( - engine_args=engine_args, - prompt=prompt, - image=image, - ) - - -def get_query(modality: QueryModality): - if modality == "text": - return TextQuery(modality="text", text="A dog sitting in the grass") - - if modality == "image": - return ImageQuery( - modality="image", - image=fetch_image( - "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/eskimo.jpg" # noqa: E501 - ), - ) - - if modality == "text+image": - return TextImageQuery( - modality="text+image", - text="A cat standing in the snow.", - image=fetch_image( - "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg" # noqa: E501 - ), - ) - - if modality == "text+images": - return TextImagesQuery( - modality="text+images", - text="slm markdown", - image={ - "content": [ - { - "type": "image_url", - "image_url": { - "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png" - }, - }, - { - "type": "image_url", - "image_url": { - "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png" - }, - }, - ] - }, - ) - - msg = f"Modality {modality} is not supported." 
- raise ValueError(msg) - - -def run_encode(model: str, modality: QueryModality, seed: int): - query = get_query(modality) - req_data = model_example_map[model](query) - - # Disable other modalities to save memory - default_limits = {"image": 0, "video": 0, "audio": 0} - req_data.engine_args.limit_mm_per_prompt = default_limits | dict( - req_data.engine_args.limit_mm_per_prompt or {} - ) - - engine_args = asdict(req_data.engine_args) | {"seed": seed} - llm = LLM(**engine_args) - - mm_data = {} - if req_data.image is not None: - mm_data["image"] = req_data.image - - outputs = llm.embed( - { - "prompt": req_data.prompt, - "multi_modal_data": mm_data, - } - ) - - print("-" * 50) - for output in outputs: - print(output.outputs.embedding) - print("-" * 50) - - -def run_score(model: str, modality: QueryModality, seed: int): - query = get_query(modality) - req_data = model_example_map[model](query) - - engine_args = asdict(req_data.engine_args) | {"seed": seed} - llm = LLM(**engine_args) - - outputs = llm.score(req_data.query, req_data.documents) - - print("-" * 30) - print([output.outputs.score for output in outputs]) - print("-" * 30) - - -model_example_map = { - "clip": run_clip, - "e5_v": run_e5_v, - "jinavl_reranker": run_jinavl_reranker, - "qwen3_vl": run_qwen3_vl, - "siglip": run_siglip, - "vlm2vec_phi3v": run_vlm2vec_phi3v, - "vlm2vec_qwen2vl": run_vlm2vec_qwen2vl, -} - - -def parse_args(): - parser = FlexibleArgumentParser( - description="Demo on using vLLM for offline inference with " - "vision language models for multimodal pooling tasks." 
- ) - parser.add_argument( - "--model-name", - "-m", - type=str, - default="vlm2vec_phi3v", - choices=model_example_map.keys(), - help="The name of the embedding model.", - ) - parser.add_argument( - "--task", - "-t", - type=str, - default="embedding", - choices=["embedding", "scoring"], - help="The task type.", - ) - parser.add_argument( - "--modality", - type=str, - default="image", - choices=get_args(QueryModality), - help="Modality of the input.", - ) - parser.add_argument( - "--seed", - type=int, - default=0, - help="Set the seed when initializing `vllm.LLM`.", - ) - return parser.parse_args() - - -def main(args: Namespace): - if args.task == "embedding": - run_encode(args.model_name, args.modality, args.seed) - elif args.task == "scoring": - run_score(args.model_name, args.modality, args.seed) - else: - raise ValueError(f"Unsupported task: {args.task}") - - -if __name__ == "__main__": - args = parse_args() - main(args) diff --git a/examples/pooling/score/vision_rerank_api_online.py b/examples/pooling/score/vision_rerank_api_online.py index f5a7c1018..dce2efd1d 100644 --- a/examples/pooling/score/vision_rerank_api_online.py +++ b/examples/pooling/score/vision_rerank_api_online.py @@ -30,6 +30,7 @@ document = ( "as the dog offers its paw in a heartwarming display of companionship and trust." 
) image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" +video_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4" documents = [ { "type": "text", @@ -43,6 +44,10 @@ documents = [ "type": "image_url", "image_url": {"url": encode_image_url(fetch_image(image_url))}, }, + { + "type": "video_url", + "video_url": {"url": video_url}, + }, ] @@ -89,6 +94,15 @@ def main(args): response = requests.post(rerank_url, json=prompt) pprint.pprint(response.json()) + print("Query: string & Document: video url") + prompt = { + "model": model, + "query": query, + "documents": {"content": [documents[3]]}, + } + response = requests.post(rerank_url, json=prompt) + pprint.pprint(response.json()) + print("Query: string & Document: text + image url") prompt = { "model": model, diff --git a/examples/pooling/score/vision_reranker_offline.py b/examples/pooling/score/vision_reranker_offline.py index 0b9207a2e..19bb98177 100644 --- a/examples/pooling/score/vision_reranker_offline.py +++ b/examples/pooling/score/vision_reranker_offline.py @@ -15,20 +15,47 @@ from pathlib import Path from typing import NamedTuple from vllm import LLM, EngineArgs -from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam +from vllm.multimodal.utils import encode_image_url, fetch_image from vllm.utils.argparse_utils import FlexibleArgumentParser TEMPLATE_HOME = Path(__file__).parent / "template" +query = "A woman playing with her dog on a beach at sunset." +document = ( + "A woman shares a joyful moment with her golden retriever on a sun-drenched " + "beach at sunset, as the dog offers its paw in a heartwarming display of " + "companionship and trust." 
+) +image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" +video_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4" +documents = [ + { + "type": "text", + "text": document, + }, + { + "type": "image_url", + "image_url": {"url": image_url}, + }, + { + "type": "image_url", + "image_url": {"url": encode_image_url(fetch_image(image_url))}, + }, + { + "type": "video_url", + "video_url": {"url": video_url}, + }, +] + + class RerankModelData(NamedTuple): engine_args: EngineArgs chat_template: str | None = None + modality: set[str] = {} -def run_jinavl_reranker(modality: str) -> RerankModelData: - assert modality == "image" - +def run_jinavl_reranker() -> RerankModelData: engine_args = EngineArgs( model="jinaai/jina-reranker-m0", runner="pooling", @@ -38,19 +65,15 @@ def run_jinavl_reranker(modality: str) -> RerankModelData: "min_pixels": 3136, "max_pixels": 602112, }, - limit_mm_per_prompt={modality: 1}, - ) - return RerankModelData( - engine_args=engine_args, ) + return RerankModelData(engine_args=engine_args, modality={"image"}) -def run_qwen3_vl_reranker(modality: str) -> RerankModelData: +def run_qwen3_vl_reranker() -> RerankModelData: engine_args = EngineArgs( model="Qwen/Qwen3-VL-Reranker-2B", runner="pooling", max_model_len=16384, - limit_mm_per_prompt={modality: 1}, # HuggingFace model configuration overrides required for compatibility hf_overrides={ # Manually route to sequence classification architecture @@ -71,10 +94,11 @@ def run_qwen3_vl_reranker(modality: str) -> RerankModelData: return RerankModelData( engine_args=engine_args, chat_template=chat_template, + modality={"image", "video"}, ) -model_example_map: dict[str, Callable[[str], RerankModelData]] = { +model_example_map: dict[str, Callable[[], RerankModelData]] = { "jinavl_reranker": run_jinavl_reranker, "qwen3_vl_reranker": run_qwen3_vl_reranker, } @@ -93,78 +117,67 @@ def parse_args(): choices=model_example_map.keys(), help="The name of the 
reranker model.", ) - parser.add_argument( - "--modality", - type=str, - default="image", - choices=["image", "video"], - help="Modality of the multimodal input (image or video).", - ) return parser.parse_args() -def get_multi_modal_input(modality: str) -> tuple[str, ScoreMultiModalParam]: - # Sample query for testing the reranker - if modality == "image": - query = "A woman playing with her dog on a beach at sunset." - # Sample multimodal documents to be scored against the query - # Each document contains an image URL that will be fetched and processed - documents: ScoreMultiModalParam = { - "content": [ - { - "type": "text", - "text": ( - "A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, " # noqa: E501 - "as the dog offers its paw in a heartwarming display of companionship and trust." # noqa: E501 - ), - }, - { - "type": "image_url", - "image_url": { - "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" - }, - }, - ] - } - elif modality == "video": - query = "A girl is drawing pictures on an ipad." 
- # Sample video documents to be scored against the query - documents: ScoreMultiModalParam = { - "content": [ - { - "type": "text", - "text": "A girl is drawing a guitar on her ipad with Apple Pencil.", - }, - { - "type": "video_url", - "video_url": { - "url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4" - }, - }, - ] - } - else: - raise ValueError(f"Unsupported modality: {modality}") - return query, documents - - def main(args: Namespace): # Run the selected reranker model - modality = args.modality - model_request = model_example_map[args.model_name](modality) + model_request = model_example_map[args.model_name]() engine_args = model_request.engine_args llm = LLM(**asdict(engine_args)) - query, documents = get_multi_modal_input(modality) - outputs = llm.score(query, documents, chat_template=model_request.chat_template) - - print("-" * 50) - print(f"Model: {engine_args.model}") - print(f"Modality: {modality}") - print(f"Query: {query}") + print("Query: string & Document: string") + outputs = llm.score(query, document) + print("Relevance scores:", [output.outputs.score for output in outputs]) + + print("Query: string & Document: text") + outputs = llm.score( + query, {"content": [documents[0]]}, chat_template=model_request.chat_template + ) + print("Relevance scores:", [output.outputs.score for output in outputs]) + + print("Query: string & Document: image url") + outputs = llm.score( + query, {"content": [documents[1]]}, chat_template=model_request.chat_template + ) + print("Relevance scores:", [output.outputs.score for output in outputs]) + + print("Query: string & Document: image base64") + outputs = llm.score( + query, {"content": [documents[2]]}, chat_template=model_request.chat_template + ) + print("Relevance scores:", [output.outputs.score for output in outputs]) + + if "video" in model_request.modality: + print("Query: string & Document: video url") + outputs = llm.score( + query, + {"content": [documents[3]]}, + 
chat_template=model_request.chat_template, + ) + print("Relevance scores:", [output.outputs.score for output in outputs]) + + print("Query: string & Document: text + image url") + outputs = llm.score( + query, + {"content": [documents[0], documents[1]]}, + chat_template=model_request.chat_template, + ) + print("Relevance scores:", [output.outputs.score for output in outputs]) + + print("Query: string & Document: list") + outputs = llm.score( + query, + [ + document, + {"content": [documents[0]]}, + {"content": [documents[1]]}, + {"content": [documents[0], documents[1]]}, + ], + chat_template=model_request.chat_template, + ) print("Relevance scores:", [output.outputs.score for output in outputs]) - print("-" * 50) if __name__ == "__main__": diff --git a/examples/pooling/score/vision_score_api_online.py b/examples/pooling/score/vision_score_api_online.py index 7942ddaed..543d4bfa2 100644 --- a/examples/pooling/score/vision_score_api_online.py +++ b/examples/pooling/score/vision_score_api_online.py @@ -29,6 +29,7 @@ document = ( "as the dog offers its paw in a heartwarming display of companionship and trust." 
) image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" +video_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4" documents = [ { "type": "text", @@ -42,6 +43,10 @@ documents = [ "type": "image_url", "image_url": {"url": encode_image_url(fetch_image(image_url))}, }, + { + "type": "video_url", + "video_url": {"url": video_url}, + }, ] @@ -92,6 +97,15 @@ def main(args): response = requests.post(score_url, json=prompt) pprint.pprint(response.json()) + print("Query: string & Document: video url") + prompt = { + "model": model, + "queries": query, + "documents": {"content": [documents[3]]}, + } + response = requests.post(score_url, json=prompt) + pprint.pprint(response.json()) + print("Query: string & Document: text + image url") prompt = { "model": model, diff --git a/tests/entrypoints/pooling/classify/test_offline.py b/tests/entrypoints/pooling/classify/test_offline.py index a07fcd372..a02d07ab0 100644 --- a/tests/entrypoints/pooling/classify/test_offline.py +++ b/tests/entrypoints/pooling/classify/test_offline.py @@ -7,12 +7,15 @@ import pytest import torch from tests.models.utils import softmax -from vllm import LLM, PoolingParams +from vllm import LLM, ClassificationRequestOutput, PoolingParams, PoolingRequestOutput from vllm.distributed import cleanup_dist_env_and_memory +from vllm.tasks import PoolingTask MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach" -prompts = ["The chef prepared a delicious meal."] +prompt = "The chef prepared a delicious meal." 
+prompt_token_ids = [785, 29706, 10030, 264, 17923, 15145, 13] +num_labels = 2 @pytest.fixture(scope="module") @@ -35,11 +38,48 @@ def llm(): cleanup_dist_env_and_memory() +@pytest.mark.skip_global_cleanup +def test_str_prompts(llm: LLM): + outputs = llm.classify(prompt, use_tqdm=False) + assert len(outputs) == 1 + assert isinstance(outputs[0], ClassificationRequestOutput) + assert outputs[0].prompt_token_ids == prompt_token_ids + assert len(outputs[0].outputs.probs) == num_labels + + +@pytest.mark.skip_global_cleanup +def test_token_ids_prompts(llm: LLM): + outputs = llm.classify([prompt_token_ids], use_tqdm=False) + assert len(outputs) == 1 + assert isinstance(outputs[0], ClassificationRequestOutput) + assert outputs[0].prompt_token_ids == prompt_token_ids + assert len(outputs[0].outputs.probs) == num_labels + + +@pytest.mark.skip_global_cleanup +def test_list_prompts(llm: LLM): + outputs = llm.classify([prompt, prompt_token_ids], use_tqdm=False) + assert len(outputs) == 2 + for i in range(len(outputs)): + assert isinstance(outputs[i], ClassificationRequestOutput) + assert outputs[i].prompt_token_ids == prompt_token_ids + assert len(outputs[i].outputs.probs) == num_labels + + +@pytest.mark.skip_global_cleanup +def test_token_classify(llm: LLM): + outputs = llm.encode(prompt, pooling_task="token_classify", use_tqdm=False) + assert len(outputs) == 1 + assert isinstance(outputs[0], PoolingRequestOutput) + assert outputs[0].prompt_token_ids == prompt_token_ids + assert outputs[0].outputs.data.shape == (len(prompt_token_ids), num_labels) + + @pytest.mark.skip_global_cleanup def test_pooling_params(llm: LLM): def get_outputs(use_activation): outputs = llm.classify( - prompts, + prompt, pooling_params=PoolingParams(use_activation=use_activation), use_tqdm=False, ) @@ -61,11 +101,14 @@ def test_pooling_params(llm: LLM): @pytest.mark.skip_global_cleanup -def test_token_classify(llm: LLM): - llm.encode(prompts, pooling_task="token_classify", use_tqdm=False) - - def 
test_score_api(llm: LLM): err_msg = "Score API is only enabled for num_labels == 1." with pytest.raises(ValueError, match=err_msg): llm.score("ping", "pong", use_tqdm=False) + + +@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"]) +def test_unsupported_tasks(llm: LLM, task: PoolingTask): + err_msg = f"Unsupported task: '{task}' Supported tasks.+" + with pytest.raises(ValueError, match=err_msg): + llm.encode(prompt, pooling_task=task, use_tqdm=False) diff --git a/tests/entrypoints/pooling/embed/test_online_vision.py b/tests/entrypoints/pooling/embed/test_online_vision.py index d4577188a..188f0ac86 100644 --- a/tests/entrypoints/pooling/embed/test_online_vision.py +++ b/tests/entrypoints/pooling/embed/test_online_vision.py @@ -10,12 +10,12 @@ from transformers import AutoProcessor from tests.utils import VLLM_PATH, RemoteOpenAIServer from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse from vllm.multimodal.media import MediaWithBytes -from vllm.multimodal.utils import fetch_image +from vllm.multimodal.utils import encode_image_url, fetch_image MODEL_NAME = "TIGER-Lab/VLM2Vec-Full" MAXIMUM_IMAGES = 2 -vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec_phi3v.jinja" +vlm2vec_jinja_path = VLLM_PATH / "examples/pooling/embed/template/vlm2vec_phi3v.jinja" assert vlm2vec_jinja_path.exists() # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA) @@ -26,6 +26,10 @@ TEST_IMAGE_ASSETS = [ "RGBA_comp.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png", ] +input_text = "The best thing about vLLM is that it supports many different models" +image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg" +image_base64 = {"url": encode_image_url(fetch_image(image_url))} + @pytest.fixture(scope="module") def server(): @@ -48,6 +52,81 @@ def server(): yield remote_server +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def 
test_chat_text_request(server: RemoteOpenAIServer, model_name: str): + messages = [ + { + "role": "user", + "content": input_text, + }, + ] + + # note: vlm2vec_phi3v.jinja + # Embedding models should only embed one message at a time. + + response = requests.post( + server.url_for("v1/embeddings"), + json={"model": model_name, "messages": messages}, + ) + response.raise_for_status() + + output = EmbeddingResponse.model_validate(response.json()) + assert len(output.data) == 1 + assert output.model == MODEL_NAME + assert len(output.data[0].embedding) == 3072 + assert output.usage.prompt_tokens == 14 + + +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_chat_image_url_request(server: RemoteOpenAIServer, model_name: str): + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Represent the user's input."}, + {"type": "image_url", "image_url": {"url": image_url}}, + ], + } + ] + + response = requests.post( + server.url_for("v1/embeddings"), + json={"model": model_name, "messages": messages}, + ) + response.raise_for_status() + + output = EmbeddingResponse.model_validate(response.json()) + assert len(output.data) == 1 + assert output.model == MODEL_NAME + assert len(output.data[0].embedding) == 3072 + assert output.usage.prompt_tokens == 767 + + +@pytest.mark.parametrize("model_name", [MODEL_NAME]) +def test_chat_image_base64_request(server: RemoteOpenAIServer, model_name: str): + messages = [ + { + "role": "user", + "content": [ + {"type": "text", "text": "Represent the user's input."}, + {"type": "image_url", "image_url": image_base64}, + ], + } + ] + + response = requests.post( + server.url_for("v1/embeddings"), + json={"model": model_name, "messages": messages}, + ) + response.raise_for_status() + + output = EmbeddingResponse.model_validate(response.json()) + assert len(output.data) == 1 + assert output.model == MODEL_NAME + assert len(output.data[0].embedding) == 3072 + assert output.usage.prompt_tokens == 767 + + def 
get_hf_prompt_tokens(model_name, content, image_url): processor = AutoProcessor.from_pretrained( model_name, trust_remote_code=True, num_crops=4 diff --git a/tests/renderers/test_hf.py b/tests/renderers/test_hf.py index e262e1f55..b6afcc559 100644 --- a/tests/renderers/test_hf.py +++ b/tests/renderers/test_hf.py @@ -428,13 +428,13 @@ def test_resolve_content_format_fallbacks(model, expected_format): ("template_chatglm.jinja", "string"), ("template_chatglm2.jinja", "string"), ("template_chatml.jinja", "string"), - ("template_dse_qwen2_vl.jinja", "openai"), ("template_falcon_180b.jinja", "string"), ("template_falcon.jinja", "string"), ("template_inkbot.jinja", "string"), ("template_teleflm.jinja", "string"), - ("template_vlm2vec_phi3v.jinja", "openai"), - ("template_vlm2vec_qwen2vl.jinja", "openai"), + ("pooling/embed/template/dse_qwen2_vl.jinja", "openai"), + ("pooling/embed/template/vlm2vec_phi3v.jinja", "openai"), + ("pooling/embed/template/vlm2vec_qwen2vl.jinja", "openai"), ("tool_chat_template_granite_20b_fc.jinja", "string"), ("tool_chat_template_hermes.jinja", "string"), ("tool_chat_template_internlm2_tool.jinja", "string"), diff --git a/vllm/entrypoints/pooling/base/protocol.py b/vllm/entrypoints/pooling/base/protocol.py index 19654db50..86dc12cbd 100644 --- a/vllm/entrypoints/pooling/base/protocol.py +++ b/vllm/entrypoints/pooling/base/protocol.py @@ -40,6 +40,21 @@ class PoolingBasicRequestMixin(OpenAIBaseModel): "if the served model does not use priority scheduling." ), ) + mm_processor_kwargs: dict[str, Any] | None = Field( + default=None, + description="Additional kwargs to pass to the HF processor.", + ) + cache_salt: str | None = Field( + default=None, + description=( + "If specified, the prefix cache will be salted with the provided " + "string to prevent an attacker to guess prompts in multi-user " + "environments. 
The salt should be random, protected from " + "access by 3rd parties, and long enough to be " + "unpredictable (e.g., 43 characters base64-encoded, corresponding " + "to 256 bit)." + ), + ) # --8<-- [end:pooling-common-extra-params] diff --git a/vllm/entrypoints/pooling/classify/protocol.py b/vllm/entrypoints/pooling/classify/protocol.py index 55641561d..3c4bbd8c2 100644 --- a/vllm/entrypoints/pooling/classify/protocol.py +++ b/vllm/entrypoints/pooling/classify/protocol.py @@ -2,7 +2,7 @@ # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time -from typing import Any, TypeAlias +from typing import TypeAlias from pydantic import Field @@ -48,12 +48,6 @@ class ClassificationCompletionRequest( class ClassificationChatRequest( PoolingBasicRequestMixin, ChatRequestMixin, ClassifyRequestMixin ): - # --8<-- [start:chat-classification-extra-params] - mm_processor_kwargs: dict[str, Any] | None = Field( - default=None, - description=("Additional kwargs to pass to the HF processor."), - ) - def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams: encoder_config = model_config.encoder_config or {} diff --git a/vllm/entrypoints/pooling/embed/protocol.py b/vllm/entrypoints/pooling/embed/protocol.py index 61bec5ae0..4f83105f2 100644 --- a/vllm/entrypoints/pooling/embed/protocol.py +++ b/vllm/entrypoints/pooling/embed/protocol.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time -from typing import Any, TypeAlias +from typing import TypeAlias from pydantic import Field @@ -78,11 +78,6 @@ class EmbeddingCompletionRequest( class EmbeddingChatRequest( PoolingBasicRequestMixin, ChatRequestMixin, EmbedRequestMixin ): - mm_processor_kwargs: dict[str, Any] | None = Field( - default=None, - description=("Additional kwargs to pass to the HF processor."), - ) - def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams: encoder_config = 
model_config.encoder_config or {} diff --git a/vllm/entrypoints/pooling/pooling/protocol.py b/vllm/entrypoints/pooling/pooling/protocol.py index 50d769e1f..ab2d82d8e 100644 --- a/vllm/entrypoints/pooling/pooling/protocol.py +++ b/vllm/entrypoints/pooling/pooling/protocol.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time -from typing import Any, Generic, TypeAlias, TypeVar +from typing import Generic, TypeAlias, TypeVar from pydantic import Field @@ -65,11 +65,6 @@ class PoolingChatRequest( ): task: PoolingTask | None = None - mm_processor_kwargs: dict[str, Any] | None = Field( - default=None, - description=("Additional kwargs to pass to the HF processor."), - ) - def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams: encoder_config = model_config.encoder_config or {} diff --git a/vllm/entrypoints/pooling/score/protocol.py b/vllm/entrypoints/pooling/score/protocol.py index 9fe9f2544..a85ed5d70 100644 --- a/vllm/entrypoints/pooling/score/protocol.py +++ b/vllm/entrypoints/pooling/score/protocol.py @@ -1,7 +1,7 @@ # SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project import time -from typing import Any, TypeAlias +from typing import TypeAlias from pydantic import BaseModel, Field @@ -23,13 +23,6 @@ from vllm.utils import random_uuid class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin): - # --8<-- [start:score-extra-params] - mm_processor_kwargs: dict[str, Any] | None = Field( - default=None, - description=("Additional kwargs to pass to the HF processor."), - ) - # --8<-- [end:score-extra-params] - def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams: encoder_config = model_config.encoder_config or {} @@ -106,13 +99,6 @@ class RerankRequest(PoolingBasicRequestMixin, ClassifyRequestMixin): documents: ScoreInputs top_n: int = Field(default_factory=lambda: 0) - # --8<-- 
[start:rerank-extra-params] - mm_processor_kwargs: dict[str, Any] | None = Field( - default=None, - description=("Additional kwargs to pass to the HF processor."), - ) - # --8<-- [end:rerank-extra-params] - def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams: encoder_config = model_config.encoder_config or {} diff --git a/vllm/utils/print_utils.py b/vllm/utils/print_utils.py new file mode 100644 index 000000000..8f8af6032 --- /dev/null +++ b/vllm/utils/print_utils.py @@ -0,0 +1,7 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project + + +def print_embeddings(embeds: list[float]): + embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds + print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")