[Frontend][last/5] Make pooling entrypoints' request schemas consistent. (#31127)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Authored by: wang.yuqi
Date: 2026-02-09 14:42:38 +08:00
Committed by: GitHub
parent 7c233dbb36
commit 22b64948f6
24 changed files with 659 additions and 613 deletions

View File

@@ -514,7 +514,7 @@ steps:
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/pooling/vision_language_pooling.py --seed 0
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py

View File

@@ -453,7 +453,7 @@ steps:
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/pooling/vision_language_pooling.py --seed 0
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py

View File

@@ -72,7 +72,7 @@ steps:
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/pooling/vision_language_pooling.py --seed 0
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py

View File

@@ -510,7 +510,7 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
For certain models, we provide alternative chat templates inside [examples](../../examples).
For example, VLM2Vec uses [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja) which is different from the default one for Phi-3-Vision.
For example, VLM2Vec uses [examples/pooling/embed/template/vlm2vec_phi3v.jinja](../../examples/pooling/embed/template/vlm2vec_phi3v.jinja) which is different from the default one for Phi-3-Vision.
### Image Inputs

View File

@@ -311,7 +311,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling \
--trust-remote-code \
--max-model-len 4096 \
--chat-template examples/template_vlm2vec_phi3v.jinja
--chat-template examples/pooling/embed/template/vlm2vec_phi3v.jinja
```
!!! important
@@ -319,7 +319,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
to run this model in embedding mode instead of text generation mode.
The custom chat template is completely different from the original one for this model,
and can be found here: [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja)
and can be found here: [examples/pooling/embed/template/vlm2vec_phi3v.jinja](../../examples/pooling/embed/template/vlm2vec_phi3v.jinja)
Since this request schema is not defined by the OpenAI client, we post a request to the server using the lower-level `requests` library:
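A minimal sketch of such a request, assuming the VLM2Vec server from the command above is running on `localhost:8000` (the prompt text and image URL are illustrative):

```python
import requests

image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"

response = requests.post(
    "http://localhost:8000/v1/embeddings",
    json={
        "model": "TIGER-Lab/VLM2Vec-Full",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Represent the user's input."},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ],
    },
)
response.raise_for_status()

# The response follows the OpenAI embeddings schema: data[0].embedding
embedding = response.json()["data"][0]["embedding"]
print(f"Embedding size: {len(embedding)}")
```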
@@ -359,14 +359,14 @@ and passing a list of `messages` in the request. Refer to the examples below for
vllm serve MrLight/dse-qwen2-2b-mrl-v1 --runner pooling \
--trust-remote-code \
--max-model-len 8192 \
--chat-template examples/template_dse_qwen2_vl.jinja
--chat-template examples/pooling/embed/template/dse_qwen2_vl.jinja
```
!!! important
Like with VLM2Vec, we have to explicitly pass `--runner pooling`.
Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled
by a custom chat template: [examples/template_dse_qwen2_vl.jinja](../../examples/template_dse_qwen2_vl.jinja)
by a custom chat template: [examples/pooling/embed/template/dse_qwen2_vl.jinja](../../examples/pooling/embed/template/dse_qwen2_vl.jinja)
!!! important
`MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
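As a rough sketch of satisfying that requirement for a text-only query (the placeholder size, prompt wording, and host are illustrative; take the exact values from the linked example):

```python
import base64
import io

import requests
from PIL import Image

# Tiny placeholder image: the model expects an image slot even for text-only
# queries. The size used here is illustrative; use the model's minimum size.
buffer = io.BytesIO()
Image.new("RGB", (28, 28)).save(buffer, "PNG")
placeholder_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")

response = requests.post(
    "http://localhost:8000/v1/embeddings",
    json={
        "model": "MrLight/dse-qwen2-2b-mrl-v1",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{placeholder_b64}"},
                    },
                    {"type": "text", "text": "Query: What is vLLM?"},
                ],
            }
        ],
    },
)
response.raise_for_status()
```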
@@ -532,7 +532,7 @@ The following [sampling parameters](../api/README.md#inference-parameters) are s
??? code
```python
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params"
--8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:transcription-sampling-params"
```
The following extra parameters are supported:
@@ -540,7 +540,7 @@ The following extra parameters are supported:
??? code
```python
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
--8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:transcription-extra-params"
```
### Translations API
@@ -560,13 +560,13 @@ Code example: [examples/online_serving/openai_translation_client.py](../../examp
The following [sampling parameters](../api/README.md#inference-parameters) are supported.
```python
--8<-- "vllm/entrypoints/openai/protocol.py:translation-sampling-params"
--8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:translation-sampling-params"
```
The following extra parameters are supported:
```python
--8<-- "vllm/entrypoints/openai/protocol.py:translation-extra-params"
--8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:translation-extra-params"
```
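As a quick illustration, translations can be requested through the standard OpenAI audio client; the model name and audio file below are placeholders:

```python
from openai import OpenAI

# vLLM's OpenAI-compatible server; adjust host/port to your deployment.
client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

with open("sample_audio.wav", "rb") as audio_file:
    translation = client.audio.translations.create(
        model="openai/whisper-large-v3",
        file=audio_file,
    )

print(translation.text)
```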
### Realtime API
@@ -960,23 +960,29 @@ You can pass multi-modal inputs to scoring models by passing `content` including
json={
"model": "jinaai/jina-reranker-m0",
"queries": "slm markdown",
"documents": {
"documents": [
{
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
}
],
},
{
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
}
]
},
],
},
},
)
response.raise_for_status()
response_json = response.json()
@@ -1001,7 +1007,6 @@ The following Score API parameters are supported:
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
--8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params"
```
The following extra parameters are supported:
@@ -1009,7 +1014,6 @@ The following extra parameters are supported:
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
--8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params"
```
### Re-rank API
@@ -1092,7 +1096,6 @@ The following Re-rank API parameters are supported:
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
--8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params"
```
The following extra parameters are supported:
@@ -1100,7 +1103,6 @@ The following extra parameters are supported:
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
--8<-- "vllm/entrypoints/pooling/score/protocol.py:rerank-extra-params"
```
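For reference, a minimal re-rank request sketch with plain-text documents (the model name, host, and response field names are illustrative; multimodal documents follow the `content` structure shown in the Score API example above):

```python
import requests

response = requests.post(
    "http://localhost:8000/v1/rerank",
    json={
        "model": "BAAI/bge-reranker-v2-m3",
        "query": "What is the capital of France?",
        "documents": [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris.",
        ],
        "top_n": 2,
    },
)
response.raise_for_status()

# Each result carries the document index and its relevance score.
for result in response.json()["results"]:
    print(result["index"], result["relevance_score"])
```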
## Ray Serve LLM

View File

@@ -0,0 +1,110 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""Example Python client for multimodal classification API using vLLM API server
NOTE:
start a supported multimodal classification model server with `vllm serve`, e.g.
vllm serve muziyongshixin/Qwen2.5-VL-7B-for-VideoCls \
--runner pooling \
--max-model-len 5000 \
--limit-mm-per-prompt '{"video": 1}' \
--hf-overrides '{"text_config": {"architectures": ["Qwen2_5_VLForSequenceClassification"]}}'
"""
import argparse
import pprint
import requests
from vllm.multimodal.utils import encode_image_url, fetch_image
input_text = "This product was excellent and exceeded my expectations"
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
image_base64 = {"url": encode_image_url(fetch_image(image_url))}
video_url = "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"
def parse_args():
parse = argparse.ArgumentParser()
parse.add_argument("--host", type=str, default="localhost")
parse.add_argument("--port", type=int, default=8000)
return parse.parse_args()
def main(args):
base_url = f"http://{args.host}:{args.port}"
models_url = base_url + "/v1/models"
classify_url = base_url + "/classify"
response = requests.get(models_url)
model_name = response.json()["data"][0]["id"]
print("Text classification output:")
messages = [
{
"role": "assistant",
"content": "Please classify this text request.",
},
{
"role": "user",
"content": input_text,
},
]
response = requests.post(
classify_url,
json={"model": model_name, "messages": messages},
)
pprint.pprint(response.json())
print("Image url classification output:")
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Please classify this image."},
{"type": "image_url", "image_url": {"url": image_url}},
],
}
]
response = requests.post(
classify_url,
json={"model": model_name, "messages": messages},
)
pprint.pprint(response.json())
print("Image base64 classification output:")
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Please classify this image."},
{"type": "image_url", "image_url": image_base64},
],
}
]
response = requests.post(
classify_url,
json={"model": model_name, "messages": messages},
)
pprint.pprint(response.json())
print("Video url classification output:")
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Please classify this video."},
{"type": "video_url", "video_url": {"url": video_url}},
],
}
]
response = requests.post(
classify_url,
json={"model": model_name, "messages": messages},
)
pprint.pprint(response.json())
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@@ -11,23 +11,79 @@ on HuggingFace model repository.
import argparse
from dataclasses import asdict
from pathlib import Path
from PIL.Image import Image
from vllm import LLM, EngineArgs
from vllm.multimodal.utils import fetch_image
from vllm.utils.print_utils import print_embeddings
ROOT_DIR = Path(__file__).parent.parent.parent
EMBED_TEMPLATE_DIR = ROOT_DIR / "pooling/embed/template/"
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
text = "A cat standing in the snow."
multi_modal_data = {"image": fetch_image(image_url)}
def print_embeddings(embeds: list[float]):
embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
def run_clip(seed: int):
engine_args = EngineArgs(
model="openai/clip-vit-base-patch32",
runner="pooling",
limit_mm_per_prompt={"image": 1},
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
print("Text embedding output:")
outputs = llm.embed(text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
print("Image embedding output:")
prompt = "" # For image input, make sure that the prompt text is empty
outputs = llm.embed(
{
"prompt": prompt,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
def run_qwen3_vl():
def run_e5_v(seed: int):
engine_args = EngineArgs(
model="royokong/e5-v",
runner="pooling",
max_model_len=4096,
limit_mm_per_prompt={"image": 1},
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501
print("Text embedding output:")
prompt_text = llama3_template.format(
f"{text}\nSummary above sentence in one word: "
)
outputs = llm.embed(prompt_text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
print("Image embedding output:")
prompt_image = llama3_template.format("<image>\nSummary above image in one word: ")
outputs = llm.embed(
{
"prompt": prompt_image,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
def run_qwen3_vl(seed: int):
try:
from qwen_vl_utils import smart_resize
except ModuleNotFoundError:
@@ -61,20 +117,20 @@ def run_qwen3_vl():
)
default_instruction = "Represent the user's input."
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
text_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n"
image_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}<|im_end|>\n<|im_start|>assistant\n"
image_text_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}{text}<|im_end|>\n<|im_start|>assistant\n"
prompt_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n"
prompt_image = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}<|im_end|>\n<|im_start|>assistant\n"
prompt_image_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}{text}<|im_end|>\n<|im_start|>assistant\n"
llm = LLM(**asdict(engine_args))
llm = LLM(**asdict(engine_args) | {"seed": seed})
print("Text embedding output:")
outputs = llm.embed(text_prompt, use_tqdm=False)
outputs = llm.embed(prompt_text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
print("Image embedding output:")
outputs = llm.embed(
{
"prompt": image_prompt,
"prompt": prompt_image,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
@@ -84,7 +140,162 @@ def run_qwen3_vl():
print("Image+Text embedding output:")
outputs = llm.embed(
{
"prompt": image_text_prompt,
"prompt": prompt_image_text,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
def run_siglip(seed: int):
engine_args = EngineArgs(
model="google/siglip-base-patch16-224",
runner="pooling",
limit_mm_per_prompt={"image": 1},
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
print("Text embedding output:")
outputs = llm.embed(text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
print("Image embedding output:")
prompt = "" # For image input, make sure that the prompt text is empty
outputs = llm.embed(
{
"prompt": prompt,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
def run_vlm2vec_phi3v(seed: int):
engine_args = EngineArgs(
model="TIGER-Lab/VLM2Vec-Full",
runner="pooling",
max_model_len=4096,
trust_remote_code=True,
mm_processor_kwargs={"num_crops": 4},
limit_mm_per_prompt={"image": 1},
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
image_token = "<|image_1|>"
print("Text embedding output:")
prompt_text = f"Find me an everyday image that matches the given caption: {text}"
outputs = llm.embed(prompt_text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
print("Image embedding output:")
prompt_image = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501
outputs = llm.embed(
{
"prompt": prompt_image,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
print("Image+Text embedding output:")
prompt_image_text = (
f"{image_token} Represent the given image with the following question: {text}" # noqa: E501
)
outputs = llm.embed(
{
"prompt": prompt_image_text,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
def run_vlm2vec_qwen2vl(seed: int):
# vLLM does not support LoRA adapters on multi-modal encoder,
# so we merge the weights first
from huggingface_hub.constants import HF_HUB_CACHE
from peft import PeftConfig, PeftModel
from transformers import AutoModelForImageTextToText, AutoProcessor
from vllm.entrypoints.chat_utils import load_chat_template
model_id = "TIGER-Lab/VLM2Vec-Qwen2VL-2B"
base_model = AutoModelForImageTextToText.from_pretrained(model_id)
lora_model = PeftModel.from_pretrained(
base_model,
model_id,
config=PeftConfig.from_pretrained(model_id),
)
model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
model._hf_peft_config_loaded = False # Needed to save the merged model
processor = AutoProcessor.from_pretrained(
model_id,
# `min_pixels` and `max_pixels` are deprecated for
# transformers `preprocessor_config.json`
size={"shortest_edge": 3136, "longest_edge": 12845056},
)
processor.chat_template = load_chat_template(
# The original chat template is not correct
EMBED_TEMPLATE_DIR / "vlm2vec_qwen2vl.jinja",
)
merged_path = str(
Path(HF_HUB_CACHE) / ("models--" + model_id.replace("/", "--") + "-vllm")
)
print(f"Saving merged model to {merged_path}...")
print(
"NOTE: This directory is not tracked by `huggingface_hub` "
"so you have to delete this manually if you don't want it anymore."
)
model.save_pretrained(merged_path)
processor.save_pretrained(merged_path)
print("Done!")
engine_args = EngineArgs(
model=merged_path,
runner="pooling",
max_model_len=4096,
mm_processor_kwargs={
"min_pixels": 3136,
"max_pixels": 12845056,
},
limit_mm_per_prompt={"image": 1},
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
image_token = "<|image_pad|>"
print("Text embedding output:")
prompt_text = f"Find me an everyday image that matches the given caption: {text}"
outputs = llm.embed(prompt_text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
print("Image embedding output:")
prompt_image = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501
outputs = llm.embed(
{
"prompt": prompt_image,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
print("Image+Text embedding output:")
prompt_image_text = (
f"{image_token} Represent the given image with the following question: {text}" # noqa: E501
)
outputs = llm.embed(
{
"prompt": prompt_image_text,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
@@ -93,7 +304,12 @@ def run_qwen3_vl():
model_example_map = {
"clip": run_clip,
"e5_v": run_e5_v,
"qwen3_vl": run_qwen3_vl,
"siglip": run_siglip,
"vlm2vec_phi3v": run_vlm2vec_phi3v,
"vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
}
@@ -103,16 +319,23 @@ def parse_args():
)
parser.add_argument(
"--model",
"-m",
type=str,
default="vlm2vec_phi3v",
choices=model_example_map.keys(),
required=True,
help="The name of the embedding model.",
)
parser.add_argument(
"--seed",
type=int,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args()
def main(args):
model_example_map[args.model]()
model_example_map[args.model](args.seed)
if __name__ == "__main__":

View File

@@ -17,6 +17,8 @@ from openai.types.chat import ChatCompletionMessageParam
from openai.types.create_embedding_response import CreateEmbeddingResponse
from PIL import Image
from vllm.utils.print_utils import print_embeddings
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
@@ -51,11 +53,6 @@ def create_chat_embeddings(
)
def print_embeddings(embeds):
embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
def run_clip(client: OpenAI, model: str):
"""
Start the server using:
@@ -105,7 +102,7 @@ def run_dse_qwen2_vl(client: OpenAI, model: str):
--runner pooling \
--trust-remote-code \
--max-model-len 8192 \
--chat-template examples/template_dse_qwen2_vl.jinja
--chat-template examples/pooling/embed/template/dse_qwen2_vl.jinja
"""
response = create_chat_embeddings(
client,
@@ -316,7 +313,7 @@ def run_vlm2vec(client: OpenAI, model: str):
--runner pooling \
--trust-remote-code \
--max-model-len 4096 \
--chat-template examples/template_vlm2vec_phi3v.jinja
--chat-template examples/pooling/embed/template/vlm2vec_phi3v.jinja
"""
response = create_chat_embeddings(

View File

@@ -1,441 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for multimodal pooling.
For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
from argparse import Namespace
from dataclasses import asdict
from pathlib import Path
from typing import Literal, NamedTuple, TypeAlias, TypedDict, get_args
from PIL.Image import Image
from vllm import LLM, EngineArgs
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
from vllm.multimodal.utils import fetch_image
from vllm.utils.argparse_utils import FlexibleArgumentParser
ROOT_DIR = Path(__file__).parent.parent.parent
EXAMPLES_DIR = ROOT_DIR / "examples"
class TextQuery(TypedDict):
modality: Literal["text"]
text: str
class ImageQuery(TypedDict):
modality: Literal["image"]
image: Image
class TextImageQuery(TypedDict):
modality: Literal["text+image"]
text: str
image: Image
class TextImagesQuery(TypedDict):
modality: Literal["text+images"]
text: str
image: ScoreMultiModalParam
QueryModality = Literal["text", "image", "text+image", "text+images"]
Query: TypeAlias = TextQuery | ImageQuery | TextImageQuery | TextImagesQuery
class ModelRequestData(NamedTuple):
engine_args: EngineArgs
prompt: str | None = None
image: Image | None = None
query: str | None = None
documents: ScoreMultiModalParam | None = None
def run_clip(query: Query) -> ModelRequestData:
if query["modality"] == "text":
prompt = query["text"]
image = None
elif query["modality"] == "image":
prompt = "" # For image input, make sure that the prompt text is empty
image = query["image"]
else:
modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs(
model="openai/clip-vit-base-patch32",
runner="pooling",
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image=image,
)
def run_e5_v(query: Query) -> ModelRequestData:
llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501
if query["modality"] == "text":
text = query["text"]
prompt = llama3_template.format(f"{text}\nSummary above sentence in one word: ")
image = None
elif query["modality"] == "image":
prompt = llama3_template.format("<image>\nSummary above image in one word: ")
image = query["image"]
else:
modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs(
model="royokong/e5-v",
runner="pooling",
max_model_len=4096,
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image=image,
)
def run_jinavl_reranker(query: Query) -> ModelRequestData:
if query["modality"] != "text+images":
raise ValueError(f"Unsupported query modality: '{query['modality']}'")
engine_args = EngineArgs(
model="jinaai/jina-reranker-m0",
runner="pooling",
max_model_len=32768,
trust_remote_code=True,
mm_processor_kwargs={
"min_pixels": 3136,
"max_pixels": 602112,
},
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
query=query["text"],
documents=query["image"],
)
def run_qwen3_vl(query: Query) -> ModelRequestData:
image_placeholder = "<vision_start><|image_pad|><vision_end>"
if query["modality"] == "text":
prompt = query["text"]
image = None
elif query["modality"] == "image":
prompt = image_placeholder
image = query["image"]
elif query["modality"] == "text+image":
text = query["text"]
prompt = f"{image_placeholder}\n{text}"
image = query["image"]
else:
modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs(
model="Qwen/Qwen3-VL-Embedding-2B",
runner="pooling",
max_model_len=8192,
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image=image,
)
def run_siglip(query: Query) -> ModelRequestData:
if query["modality"] == "text":
prompt = query["text"]
image = None
elif query["modality"] == "image":
prompt = "" # For image input, make sure that the prompt text is empty
image = query["image"]
else:
modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs(
model="google/siglip-base-patch16-224",
runner="pooling",
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image=image,
)
def _get_vlm2vec_prompt_image(query: Query, image_token: str):
if query["modality"] == "text":
text = query["text"]
prompt = f"Find me an everyday image that matches the given caption: {text}"
image = None
elif query["modality"] == "image":
prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501
image = query["image"]
elif query["modality"] == "text+image":
text = query["text"]
prompt = f"{image_token} Represent the given image with the following question: {text}" # noqa: E501
image = query["image"]
else:
modality = query["modality"]
raise ValueError(f"Unsupported query modality: {modality!r}")
return prompt, image
def run_vlm2vec_phi3v(query: Query) -> ModelRequestData:
prompt, image = _get_vlm2vec_prompt_image(query, "<|image_1|>")
engine_args = EngineArgs(
model="TIGER-Lab/VLM2Vec-Full",
runner="pooling",
max_model_len=4096,
trust_remote_code=True,
mm_processor_kwargs={"num_crops": 4},
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image=image,
)
def run_vlm2vec_qwen2vl(query: Query) -> ModelRequestData:
# vLLM does not support LoRA adapters on multi-modal encoder,
# so we merge the weights first
from huggingface_hub.constants import HF_HUB_CACHE
from peft import PeftConfig, PeftModel
from transformers import AutoModelForImageTextToText, AutoProcessor
from vllm.entrypoints.chat_utils import load_chat_template
model_id = "TIGER-Lab/VLM2Vec-Qwen2VL-2B"
base_model = AutoModelForImageTextToText.from_pretrained(model_id)
lora_model = PeftModel.from_pretrained(
base_model,
model_id,
config=PeftConfig.from_pretrained(model_id),
)
model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
model._hf_peft_config_loaded = False # Needed to save the merged model
processor = AutoProcessor.from_pretrained(
model_id,
# `min_pixels` and `max_pixels` are deprecated for
# transformers `preprocessor_config.json`
size={"shortest_edge": 3136, "longest_edge": 12845056},
)
processor.chat_template = load_chat_template(
# The original chat template is not correct
EXAMPLES_DIR / "template_vlm2vec_qwen2vl.jinja",
)
merged_path = str(
Path(HF_HUB_CACHE) / ("models--" + model_id.replace("/", "--") + "-vllm")
)
print(f"Saving merged model to {merged_path}...")
print(
"NOTE: This directory is not tracked by `huggingface_hub` "
"so you have to delete this manually if you don't want it anymore."
)
model.save_pretrained(merged_path)
processor.save_pretrained(merged_path)
print("Done!")
prompt, image = _get_vlm2vec_prompt_image(query, "<|image_pad|>")
engine_args = EngineArgs(
model=merged_path,
runner="pooling",
max_model_len=4096,
mm_processor_kwargs={
"min_pixels": 3136,
"max_pixels": 12845056,
},
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image=image,
)
def get_query(modality: QueryModality):
if modality == "text":
return TextQuery(modality="text", text="A dog sitting in the grass")
if modality == "image":
return ImageQuery(
modality="image",
image=fetch_image(
"https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/eskimo.jpg" # noqa: E501
),
)
if modality == "text+image":
return TextImageQuery(
modality="text+image",
text="A cat standing in the snow.",
image=fetch_image(
"https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg" # noqa: E501
),
)
if modality == "text+images":
return TextImagesQuery(
modality="text+images",
text="slm markdown",
image={
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
},
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
},
},
]
},
)
msg = f"Modality {modality} is not supported."
raise ValueError(msg)
def run_encode(model: str, modality: QueryModality, seed: int):
query = get_query(modality)
req_data = model_example_map[model](query)
# Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {}
)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args)
mm_data = {}
if req_data.image is not None:
mm_data["image"] = req_data.image
outputs = llm.embed(
{
"prompt": req_data.prompt,
"multi_modal_data": mm_data,
}
)
print("-" * 50)
for output in outputs:
print(output.outputs.embedding)
print("-" * 50)
def run_score(model: str, modality: QueryModality, seed: int):
query = get_query(modality)
req_data = model_example_map[model](query)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args)
outputs = llm.score(req_data.query, req_data.documents)
print("-" * 30)
print([output.outputs.score for output in outputs])
print("-" * 30)
model_example_map = {
"clip": run_clip,
"e5_v": run_e5_v,
"jinavl_reranker": run_jinavl_reranker,
"qwen3_vl": run_qwen3_vl,
"siglip": run_siglip,
"vlm2vec_phi3v": run_vlm2vec_phi3v,
"vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
}
def parse_args():
parser = FlexibleArgumentParser(
description="Demo on using vLLM for offline inference with "
"vision language models for multimodal pooling tasks."
)
parser.add_argument(
"--model-name",
"-m",
type=str,
default="vlm2vec_phi3v",
choices=model_example_map.keys(),
help="The name of the embedding model.",
)
parser.add_argument(
"--task",
"-t",
type=str,
default="embedding",
choices=["embedding", "scoring"],
help="The task type.",
)
parser.add_argument(
"--modality",
type=str,
default="image",
choices=get_args(QueryModality),
help="Modality of the input.",
)
parser.add_argument(
"--seed",
type=int,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args()
def main(args: Namespace):
if args.task == "embedding":
run_encode(args.model_name, args.modality, args.seed)
elif args.task == "scoring":
run_score(args.model_name, args.modality, args.seed)
else:
raise ValueError(f"Unsupported task: {args.task}")
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@@ -30,6 +30,7 @@ document = (
"as the dog offers its paw in a heartwarming display of companionship and trust."
)
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
video_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
documents = [
{
"type": "text",
@@ -43,6 +44,10 @@ documents = [
"type": "image_url",
"image_url": {"url": encode_image_url(fetch_image(image_url))},
},
{
"type": "video_url",
"video_url": {"url": video_url},
},
]
@@ -89,6 +94,15 @@ def main(args):
response = requests.post(rerank_url, json=prompt)
pprint.pprint(response.json())
print("Query: string & Document: video url")
prompt = {
"model": model,
"query": query,
"documents": {"content": [documents[3]]},
}
response = requests.post(rerank_url, json=prompt)
pprint.pprint(response.json())
print("Query: string & Document: text + image url")
prompt = {
"model": model,

View File

@@ -15,20 +15,47 @@ from pathlib import Path
from typing import NamedTuple
from vllm import LLM, EngineArgs
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
from vllm.multimodal.utils import encode_image_url, fetch_image
from vllm.utils.argparse_utils import FlexibleArgumentParser
TEMPLATE_HOME = Path(__file__).parent / "template"
query = "A woman playing with her dog on a beach at sunset."
document = (
"A woman shares a joyful moment with her golden retriever on a sun-drenched "
"beach at sunset, as the dog offers its paw in a heartwarming display of "
"companionship and trust."
)
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
video_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
documents = [
{
"type": "text",
"text": document,
},
{
"type": "image_url",
"image_url": {"url": image_url},
},
{
"type": "image_url",
"image_url": {"url": encode_image_url(fetch_image(image_url))},
},
{
"type": "video_url",
"video_url": {"url": video_url},
},
]
class RerankModelData(NamedTuple):
engine_args: EngineArgs
chat_template: str | None = None
modality: set[str] = set()
def run_jinavl_reranker(modality: str) -> RerankModelData:
assert modality == "image"
def run_jinavl_reranker() -> RerankModelData:
engine_args = EngineArgs(
model="jinaai/jina-reranker-m0",
runner="pooling",
@@ -38,19 +65,15 @@ def run_jinavl_reranker(modality: str) -> RerankModelData:
"min_pixels": 3136,
"max_pixels": 602112,
},
limit_mm_per_prompt={modality: 1},
)
return RerankModelData(
engine_args=engine_args,
)
return RerankModelData(engine_args=engine_args, modality={"image"})
def run_qwen3_vl_reranker(modality: str) -> RerankModelData:
def run_qwen3_vl_reranker() -> RerankModelData:
engine_args = EngineArgs(
model="Qwen/Qwen3-VL-Reranker-2B",
runner="pooling",
max_model_len=16384,
limit_mm_per_prompt={modality: 1},
# HuggingFace model configuration overrides required for compatibility
hf_overrides={
# Manually route to sequence classification architecture
@@ -71,10 +94,11 @@ def run_qwen3_vl_reranker(modality: str) -> RerankModelData:
return RerankModelData(
engine_args=engine_args,
chat_template=chat_template,
modality={"image", "video"},
)
model_example_map: dict[str, Callable[[str], RerankModelData]] = {
model_example_map: dict[str, Callable[[], RerankModelData]] = {
"jinavl_reranker": run_jinavl_reranker,
"qwen3_vl_reranker": run_qwen3_vl_reranker,
}
@@ -93,78 +117,67 @@ def parse_args():
choices=model_example_map.keys(),
help="The name of the reranker model.",
)
parser.add_argument(
"--modality",
type=str,
default="image",
choices=["image", "video"],
help="Modality of the multimodal input (image or video).",
)
return parser.parse_args()
def get_multi_modal_input(modality: str) -> tuple[str, ScoreMultiModalParam]:
# Sample query for testing the reranker
if modality == "image":
query = "A woman playing with her dog on a beach at sunset."
# Sample multimodal documents to be scored against the query
# Each document contains an image URL that will be fetched and processed
documents: ScoreMultiModalParam = {
"content": [
{
"type": "text",
"text": (
"A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, " # noqa: E501
"as the dog offers its paw in a heartwarming display of companionship and trust." # noqa: E501
),
},
{
"type": "image_url",
"image_url": {
"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
},
},
]
}
elif modality == "video":
query = "A girl is drawing pictures on an ipad."
# Sample video documents to be scored against the query
documents: ScoreMultiModalParam = {
"content": [
{
"type": "text",
"text": "A girl is drawing a guitar on her ipad with Apple Pencil.",
},
{
"type": "video_url",
"video_url": {
"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
},
},
]
}
else:
raise ValueError(f"Unsupported modality: {modality}")
return query, documents
def main(args: Namespace):
# Run the selected reranker model
modality = args.modality
model_request = model_example_map[args.model_name](modality)
model_request = model_example_map[args.model_name]()
engine_args = model_request.engine_args
llm = LLM(**asdict(engine_args))
query, documents = get_multi_modal_input(modality)
outputs = llm.score(query, documents, chat_template=model_request.chat_template)
print("-" * 50)
print(f"Model: {engine_args.model}")
print(f"Modality: {modality}")
print(f"Query: {query}")
print("Query: string & Document: string")
outputs = llm.score(query, document)
print("Relevance scores:", [output.outputs.score for output in outputs])
print("Query: string & Document: text")
outputs = llm.score(
query, {"content": [documents[0]]}, chat_template=model_request.chat_template
)
print("Relevance scores:", [output.outputs.score for output in outputs])
print("Query: string & Document: image url")
outputs = llm.score(
query, {"content": [documents[1]]}, chat_template=model_request.chat_template
)
print("Relevance scores:", [output.outputs.score for output in outputs])
print("Query: string & Document: image base64")
outputs = llm.score(
query, {"content": [documents[2]]}, chat_template=model_request.chat_template
)
print("Relevance scores:", [output.outputs.score for output in outputs])
if "video" in model_request.modality:
print("Query: string & Document: video url")
outputs = llm.score(
query,
{"content": [documents[3]]},
chat_template=model_request.chat_template,
)
print("Relevance scores:", [output.outputs.score for output in outputs])
print("Query: string & Document: text + image url")
outputs = llm.score(
query,
{"content": [documents[0], documents[1]]},
chat_template=model_request.chat_template,
)
print("Relevance scores:", [output.outputs.score for output in outputs])
print("Query: string & Document: list")
outputs = llm.score(
query,
[
document,
{"content": [documents[0]]},
{"content": [documents[1]]},
{"content": [documents[0], documents[1]]},
],
chat_template=model_request.chat_template,
)
print("Relevance scores:", [output.outputs.score for output in outputs])
print("-" * 50)
if __name__ == "__main__":

View File

@@ -29,6 +29,7 @@ document = (
"as the dog offers its paw in a heartwarming display of companionship and trust."
)
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
video_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
documents = [
{
"type": "text",
@@ -42,6 +43,10 @@ documents = [
"type": "image_url",
"image_url": {"url": encode_image_url(fetch_image(image_url))},
},
{
"type": "video_url",
"video_url": {"url": video_url},
},
]
@@ -92,6 +97,15 @@ def main(args):
response = requests.post(score_url, json=prompt)
pprint.pprint(response.json())
print("Query: string & Document: video url")
prompt = {
"model": model,
"queries": query,
"documents": {"content": [documents[3]]},
}
response = requests.post(score_url, json=prompt)
pprint.pprint(response.json())
print("Query: string & Document: text + image url")
prompt = {
"model": model,

View File

@@ -7,12 +7,15 @@ import pytest
import torch
from tests.models.utils import softmax
from vllm import LLM, PoolingParams
from vllm import LLM, ClassificationRequestOutput, PoolingParams, PoolingRequestOutput
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.tasks import PoolingTask
MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
prompts = ["The chef prepared a delicious meal."]
prompt = "The chef prepared a delicious meal."
prompt_token_ids = [785, 29706, 10030, 264, 17923, 15145, 13]
num_labels = 2
@pytest.fixture(scope="module")
@@ -35,11 +38,48 @@ def llm():
cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_str_prompts(llm: LLM):
outputs = llm.classify(prompt, use_tqdm=False)
assert len(outputs) == 1
assert isinstance(outputs[0], ClassificationRequestOutput)
assert outputs[0].prompt_token_ids == prompt_token_ids
assert len(outputs[0].outputs.probs) == num_labels
@pytest.mark.skip_global_cleanup
def test_token_ids_prompts(llm: LLM):
outputs = llm.classify([prompt_token_ids], use_tqdm=False)
assert len(outputs) == 1
assert isinstance(outputs[0], ClassificationRequestOutput)
assert outputs[0].prompt_token_ids == prompt_token_ids
assert len(outputs[0].outputs.probs) == num_labels
@pytest.mark.skip_global_cleanup
def test_list_prompts(llm: LLM):
outputs = llm.classify([prompt, prompt_token_ids], use_tqdm=False)
assert len(outputs) == 2
for i in range(len(outputs)):
assert isinstance(outputs[i], ClassificationRequestOutput)
assert outputs[i].prompt_token_ids == prompt_token_ids
assert len(outputs[i].outputs.probs) == num_labels
@pytest.mark.skip_global_cleanup
def test_token_classify(llm: LLM):
outputs = llm.encode(prompt, pooling_task="token_classify", use_tqdm=False)
assert len(outputs) == 1
assert isinstance(outputs[0], PoolingRequestOutput)
assert outputs[0].prompt_token_ids == prompt_token_ids
assert outputs[0].outputs.data.shape == (len(prompt_token_ids), num_labels)
@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):
def get_outputs(use_activation):
outputs = llm.classify(
prompts,
prompt,
pooling_params=PoolingParams(use_activation=use_activation),
use_tqdm=False,
)
@@ -61,11 +101,14 @@ def test_pooling_params(llm: LLM):
@pytest.mark.skip_global_cleanup
def test_token_classify(llm: LLM):
llm.encode(prompts, pooling_task="token_classify", use_tqdm=False)
def test_score_api(llm: LLM):
err_msg = "Score API is only enabled for num_labels == 1."
with pytest.raises(ValueError, match=err_msg):
llm.score("ping", "pong", use_tqdm=False)
@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"])
def test_unsupported_tasks(llm: LLM, task: PoolingTask):
err_msg = f"Unsupported task: '{task}' Supported tasks.+"
with pytest.raises(ValueError, match=err_msg):
llm.encode(prompt, pooling_task=task, use_tqdm=False)

View File

@@ -10,12 +10,12 @@ from transformers import AutoProcessor
from tests.utils import VLLM_PATH, RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.multimodal.media import MediaWithBytes
from vllm.multimodal.utils import fetch_image
from vllm.multimodal.utils import encode_image_url, fetch_image
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
MAXIMUM_IMAGES = 2
vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec_phi3v.jinja"
vlm2vec_jinja_path = VLLM_PATH / "examples/pooling/embed/template/vlm2vec_phi3v.jinja"
assert vlm2vec_jinja_path.exists()
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
@@ -26,6 +26,10 @@ TEST_IMAGE_ASSETS = [
"RGBA_comp.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png",
]
input_text = "The best thing about vLLM is that it supports many different models"
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
image_base64 = {"url": encode_image_url(fetch_image(image_url))}
@pytest.fixture(scope="module")
def server():
@@ -48,6 +52,81 @@ def server():
yield remote_server
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_chat_text_request(server: RemoteOpenAIServer, model_name: str):
messages = [
{
"role": "user",
"content": input_text,
},
]
# note: vlm2vec_phi3v.jinja
# Embedding models should only embed one message at a time.
response = requests.post(
server.url_for("v1/embeddings"),
json={"model": model_name, "messages": messages},
)
response.raise_for_status()
output = EmbeddingResponse.model_validate(response.json())
assert len(output.data) == 1
assert output.model == MODEL_NAME
assert len(output.data[0].embedding) == 3072
assert output.usage.prompt_tokens == 14
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_chat_image_url_request(server: RemoteOpenAIServer, model_name: str):
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Represent the user's input."},
{"type": "image_url", "image_url": {"url": image_url}},
],
}
]
response = requests.post(
server.url_for("v1/embeddings"),
json={"model": model_name, "messages": messages},
)
response.raise_for_status()
output = EmbeddingResponse.model_validate(response.json())
assert len(output.data) == 1
assert output.model == MODEL_NAME
assert len(output.data[0].embedding) == 3072
assert output.usage.prompt_tokens == 767
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_chat_image_base64_request(server: RemoteOpenAIServer, model_name: str):
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Represent the user's input."},
{"type": "image_url", "image_url": image_base64},
],
}
]
response = requests.post(
server.url_for("v1/embeddings"),
json={"model": model_name, "messages": messages},
)
response.raise_for_status()
output = EmbeddingResponse.model_validate(response.json())
assert len(output.data) == 1
assert output.model == MODEL_NAME
assert len(output.data[0].embedding) == 3072
assert output.usage.prompt_tokens == 767
def get_hf_prompt_tokens(model_name, content, image_url):
processor = AutoProcessor.from_pretrained(
model_name, trust_remote_code=True, num_crops=4

View File

@@ -428,13 +428,13 @@ def test_resolve_content_format_fallbacks(model, expected_format):
("template_chatglm.jinja", "string"),
("template_chatglm2.jinja", "string"),
("template_chatml.jinja", "string"),
("template_dse_qwen2_vl.jinja", "openai"),
("template_falcon_180b.jinja", "string"),
("template_falcon.jinja", "string"),
("template_inkbot.jinja", "string"),
("template_teleflm.jinja", "string"),
("template_vlm2vec_phi3v.jinja", "openai"),
("template_vlm2vec_qwen2vl.jinja", "openai"),
("pooling/embed/template/dse_qwen2_vl.jinja", "openai"),
("pooling/embed/template/vlm2vec_phi3v.jinja", "openai"),
("pooling/embed/template/vlm2vec_qwen2vl.jinja", "openai"),
("tool_chat_template_granite_20b_fc.jinja", "string"),
("tool_chat_template_hermes.jinja", "string"),
("tool_chat_template_internlm2_tool.jinja", "string"),

View File

@@ -40,6 +40,21 @@ class PoolingBasicRequestMixin(OpenAIBaseModel):
"if the served model does not use priority scheduling."
),
)
mm_processor_kwargs: dict[str, Any] | None = Field(
default=None,
description="Additional kwargs to pass to the HF processor.",
)
cache_salt: str | None = Field(
default=None,
description=(
"If specified, the prefix cache will be salted with the provided "
"string to prevent an attacker to guess prompts in multi-user "
"environments. The salt should be random, protected from "
"access by 3rd parties, and long enough to be "
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
"to 256 bit)."
),
)
# --8<-- [end:pooling-common-extra-params]
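Since these fields now live on the shared `PoolingBasicRequestMixin`, any pooling endpoint request can carry them. A hedged sketch of a multimodal classification request that sets both (host, model, image URL, and processor kwargs are illustrative):

```python
import requests

image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"

response = requests.post(
    "http://localhost:8000/classify",
    json={
        "model": "muziyongshixin/Qwen2.5-VL-7B-for-VideoCls",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Please classify this image."},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ],
        # Extra params shared across pooling entrypoints:
        "mm_processor_kwargs": {"min_pixels": 3136, "max_pixels": 602112},
        "cache_salt": "a-long-random-unguessable-string",
    },
)
response.raise_for_status()
print(response.json())
```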

View File

@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
from typing import Any, TypeAlias
from typing import TypeAlias
from pydantic import Field
@@ -48,12 +48,6 @@ class ClassificationCompletionRequest(
class ClassificationChatRequest(
PoolingBasicRequestMixin, ChatRequestMixin, ClassifyRequestMixin
):
# --8<-- [start:chat-classification-extra-params]
mm_processor_kwargs: dict[str, Any] | None = Field(
default=None,
description=("Additional kwargs to pass to the HF processor."),
)
def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
encoder_config = model_config.encoder_config or {}

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
from typing import Any, TypeAlias
from typing import TypeAlias
from pydantic import Field
@@ -78,11 +78,6 @@ class EmbeddingCompletionRequest(
class EmbeddingChatRequest(
PoolingBasicRequestMixin, ChatRequestMixin, EmbedRequestMixin
):
mm_processor_kwargs: dict[str, Any] | None = Field(
default=None,
description=("Additional kwargs to pass to the HF processor."),
)
def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
encoder_config = model_config.encoder_config or {}

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
from typing import Any, Generic, TypeAlias, TypeVar
from typing import Generic, TypeAlias, TypeVar
from pydantic import Field
@@ -65,11 +65,6 @@ class PoolingChatRequest(
):
task: PoolingTask | None = None
mm_processor_kwargs: dict[str, Any] | None = Field(
default=None,
description=("Additional kwargs to pass to the HF processor."),
)
def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
encoder_config = model_config.encoder_config or {}

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
from typing import Any, TypeAlias
from typing import TypeAlias
from pydantic import BaseModel, Field
@@ -23,13 +23,6 @@ from vllm.utils import random_uuid
class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin):
# --8<-- [start:score-extra-params]
mm_processor_kwargs: dict[str, Any] | None = Field(
default=None,
description=("Additional kwargs to pass to the HF processor."),
)
# --8<-- [end:score-extra-params]
def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
encoder_config = model_config.encoder_config or {}
@@ -106,13 +99,6 @@ class RerankRequest(PoolingBasicRequestMixin, ClassifyRequestMixin):
documents: ScoreInputs
top_n: int = Field(default_factory=lambda: 0)
# --8<-- [start:rerank-extra-params]
mm_processor_kwargs: dict[str, Any] | None = Field(
default=None,
description=("Additional kwargs to pass to the HF processor."),
)
# --8<-- [end:rerank-extra-params]
def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
encoder_config = model_config.encoder_config or {}

View File

@@ -0,0 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
def print_embeddings(embeds: list[float]):
embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")