[Frontend][last/5] Make pooling entrypoints' request schemas consistent. (#31127)

Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
Authored by: wang.yuqi
Date: 2026-02-09 14:42:38 +08:00
Committed by: GitHub
parent 7c233dbb36
commit 22b64948f6
24 changed files with 659 additions and 613 deletions

View File

@@ -514,7 +514,7 @@ steps:
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/pooling/vision_language_pooling.py --seed 0
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py

View File

@@ -453,7 +453,7 @@ steps:
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/pooling/vision_language_pooling.py --seed 0
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py

View File

@@ -72,7 +72,7 @@ steps:
- python3 offline_inference/vision_language_multi_image.py --seed 0
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
# for pooling models
- python3 pooling/pooling/vision_language_pooling.py --seed 0
- python3 pooling/embed/vision_embedding_offline.py --seed 0
# for features demo
- python3 offline_inference/prefix_caching.py
- python3 offline_inference/llm_engine_example.py

View File

@@ -510,7 +510,7 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
For certain models, we provide alternative chat templates inside [examples](../../examples).
For example, VLM2Vec uses [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja) which is different from the default one for Phi-3-Vision.
For example, VLM2Vec uses [examples/pooling/embed/template/vlm2vec_phi3v.jinja](../../examples/pooling/embed/template/vlm2vec_phi3v.jinja) which is different from the default one for Phi-3-Vision.
### Image Inputs

View File

@@ -311,7 +311,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling \
--trust-remote-code \
--max-model-len 4096 \
--chat-template examples/template_vlm2vec_phi3v.jinja
--chat-template examples/pooling/embed/template/vlm2vec_phi3v.jinja
```
!!! important
@@ -319,7 +319,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
to run this model in embedding mode instead of text generation mode.
The custom chat template is completely different from the original one for this model,
and can be found here: [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja)
and can be found here: [examples/pooling/embed/template/vlm2vec_phi3v.jinja](../../examples/pooling/embed/template/vlm2vec_phi3v.jinja)
Since this request schema is not defined by the OpenAI client, we post a request to the server using the lower-level `requests` library:
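A minimal sketch of such a request, assuming the VLM2Vec server from the command above is running on `localhost:8000` (the prompt text and image URL are illustrative):

```python
import requests

image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"

response = requests.post(
    "http://localhost:8000/v1/embeddings",
    json={
        "model": "TIGER-Lab/VLM2Vec-Full",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Represent the user's input."},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ],
    },
)
response.raise_for_status()

# The response follows the OpenAI embeddings schema: data[0].embedding
embedding = response.json()["data"][0]["embedding"]
print(f"Embedding size: {len(embedding)}")
```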
@@ -359,14 +359,14 @@ and passing a list of `messages` in the request. Refer to the examples below for
vllm serve MrLight/dse-qwen2-2b-mrl-v1 --runner pooling \
--trust-remote-code \
--max-model-len 8192 \
--chat-template examples/template_dse_qwen2_vl.jinja
--chat-template examples/pooling/embed/template/dse_qwen2_vl.jinja
```
!!! important
Like with VLM2Vec, we have to explicitly pass `--runner pooling`.
Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled
by a custom chat template: [examples/template_dse_qwen2_vl.jinja](../../examples/template_dse_qwen2_vl.jinja)
by a custom chat template: [examples/pooling/embed/template/dse_qwen2_vl.jinja](../../examples/pooling/embed/template/dse_qwen2_vl.jinja)
!!! important
`MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
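As a rough sketch of satisfying that requirement for a text-only query (the placeholder size, prompt wording, and host are illustrative; take the exact values from the linked example):

```python
import base64
import io

import requests
from PIL import Image

# Tiny placeholder image: the model expects an image slot even for text-only
# queries. The size used here is illustrative; use the model's minimum size.
buffer = io.BytesIO()
Image.new("RGB", (28, 28)).save(buffer, "PNG")
placeholder_b64 = base64.b64encode(buffer.getvalue()).decode("utf-8")

response = requests.post(
    "http://localhost:8000/v1/embeddings",
    json={
        "model": "MrLight/dse-qwen2-2b-mrl-v1",
        "messages": [
            {
                "role": "user",
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {"url": f"data:image/png;base64,{placeholder_b64}"},
                    },
                    {"type": "text", "text": "Query: What is vLLM?"},
                ],
            }
        ],
    },
)
response.raise_for_status()
```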
@@ -532,7 +532,7 @@ The following [sampling parameters](../api/README.md#inference-parameters) are s
??? code
```python
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params"
--8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:transcription-sampling-params"
```
The following extra parameters are supported:
@@ -540,7 +540,7 @@ The following extra parameters are supported:
??? code
```python
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
--8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:transcription-extra-params"
```
### Translations API
@@ -560,13 +560,13 @@ Code example: [examples/online_serving/openai_translation_client.py](../../examp
The following [sampling parameters](../api/README.md#inference-parameters) are supported.
```python
--8<-- "vllm/entrypoints/openai/protocol.py:translation-sampling-params"
--8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:translation-sampling-params"
```
The following extra parameters are supported:
```python
--8<-- "vllm/entrypoints/openai/protocol.py:translation-extra-params"
--8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:translation-extra-params"
```
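As a quick illustration, translations can be requested through the standard OpenAI audio client; the model name and audio file below are placeholders:

```python
from openai import OpenAI

# vLLM's OpenAI-compatible server; adjust host/port to your deployment.
client = OpenAI(api_key="EMPTY", base_url="http://localhost:8000/v1")

with open("sample_audio.wav", "rb") as audio_file:
    translation = client.audio.translations.create(
        model="openai/whisper-large-v3",
        file=audio_file,
    )

print(translation.text)
```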
### Realtime API
@@ -960,23 +960,29 @@ You can pass multi-modal inputs to scoring models by passing `content` including
json={
"model": "jinaai/jina-reranker-m0",
"queries": "slm markdown",
"documents": {
"documents": [
{
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
}
],
},
{
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
}
]
},
],
},
},
)
response.raise_for_status()
response_json = response.json()
@@ -1001,7 +1007,6 @@ The following Score API parameters are supported:
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
--8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params"
```
The following extra parameters are supported:
@@ -1009,7 +1014,6 @@ The following extra parameters are supported:
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
--8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params"
```
### Re-rank API
@@ -1092,7 +1096,6 @@ The following Re-rank API parameters are supported:
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
--8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params"
```
The following extra parameters are supported:
@@ -1100,7 +1103,6 @@ The following extra parameters are supported:
```python
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
--8<-- "vllm/entrypoints/pooling/score/protocol.py:rerank-extra-params"
```
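For reference, a minimal re-rank request sketch with plain-text documents (the model name, host, and response field names are illustrative; multimodal documents follow the `content` structure shown in the Score API example above):

```python
import requests

response = requests.post(
    "http://localhost:8000/v1/rerank",
    json={
        "model": "BAAI/bge-reranker-v2-m3",
        "query": "What is the capital of France?",
        "documents": [
            "The capital of Brazil is Brasilia.",
            "The capital of France is Paris.",
        ],
        "top_n": 2,
    },
)
response.raise_for_status()

# Each result carries the document index and its relevance score.
for result in response.json()["results"]:
    print(result["index"], result["relevance_score"])
```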
## Ray Serve LLM

View File

@@ -0,0 +1,110 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# ruff: noqa: E501
"""Example Python client for multimodal classification API using vLLM API server
NOTE:
start a supported multimodal classification model server with `vllm serve`, e.g.
vllm serve muziyongshixin/Qwen2.5-VL-7B-for-VideoCls \
--runner pooling \
--max-model-len 5000 \
--limit-mm-per-prompt '{"video": 1}' \
--hf-overrides '{"text_config": {"architectures": ["Qwen2_5_VLForSequenceClassification"]}}'
"""
import argparse
import pprint
import requests
from vllm.multimodal.utils import encode_image_url, fetch_image
input_text = "This product was excellent and exceeded my expectations"
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
image_base64 = {"url": encode_image_url(fetch_image(image_url))}
video_url = "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"
def parse_args():
parse = argparse.ArgumentParser()
parse.add_argument("--host", type=str, default="localhost")
parse.add_argument("--port", type=int, default=8000)
return parse.parse_args()
def main(args):
base_url = f"http://{args.host}:{args.port}"
models_url = base_url + "/v1/models"
classify_url = base_url + "/classify"
response = requests.get(models_url)
model_name = response.json()["data"][0]["id"]
print("Text classification output:")
messages = [
{
"role": "assistant",
"content": "Please classify this text request.",
},
{
"role": "user",
"content": input_text,
},
]
response = requests.post(
classify_url,
json={"model": model_name, "messages": messages},
)
pprint.pprint(response.json())
print("Image url classification output:")
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Please classify this image."},
{"type": "image_url", "image_url": {"url": image_url}},
],
}
]
response = requests.post(
classify_url,
json={"model": model_name, "messages": messages},
)
pprint.pprint(response.json())
print("Image base64 classification output:")
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Please classify this image."},
{"type": "image_url", "image_url": image_base64},
],
}
]
response = requests.post(
classify_url,
json={"model": model_name, "messages": messages},
)
pprint.pprint(response.json())
print("Video url classification output:")
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Please classify this video."},
{"type": "video_url", "video_url": {"url": video_url}},
],
}
]
response = requests.post(
classify_url,
json={"model": model_name, "messages": messages},
)
pprint.pprint(response.json())
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@@ -11,23 +11,79 @@ on HuggingFace model repository.
import argparse
from dataclasses import asdict
from pathlib import Path
from PIL.Image import Image
from vllm import LLM, EngineArgs
from vllm.multimodal.utils import fetch_image
from vllm.utils.print_utils import print_embeddings
ROOT_DIR = Path(__file__).parent.parent.parent
EMBED_TEMPLATE_DIR = ROOT_DIR / "pooling/embed/template/"
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
text = "A cat standing in the snow."
multi_modal_data = {"image": fetch_image(image_url)}
def print_embeddings(embeds: list[float]):
embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
def run_clip(seed: int):
engine_args = EngineArgs(
model="openai/clip-vit-base-patch32",
runner="pooling",
limit_mm_per_prompt={"image": 1},
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
print("Text embedding output:")
outputs = llm.embed(text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
print("Image embedding output:")
prompt = "" # For image input, make sure that the prompt text is empty
outputs = llm.embed(
{
"prompt": prompt,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
def run_qwen3_vl():
def run_e5_v(seed: int):
engine_args = EngineArgs(
model="royokong/e5-v",
runner="pooling",
max_model_len=4096,
limit_mm_per_prompt={"image": 1},
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501
print("Text embedding output:")
prompt_text = llama3_template.format(
f"{text}\nSummary above sentence in one word: "
)
outputs = llm.embed(prompt_text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
print("Image embedding output:")
prompt_image = llama3_template.format("<image>\nSummary above image in one word: ")
outputs = llm.embed(
{
"prompt": prompt_image,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
def run_qwen3_vl(seed: int):
try:
from qwen_vl_utils import smart_resize
except ModuleNotFoundError:
@@ -61,20 +117,20 @@ def run_qwen3_vl():
)
default_instruction = "Represent the user's input."
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
text_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n"
image_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}<|im_end|>\n<|im_start|>assistant\n"
image_text_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}{text}<|im_end|>\n<|im_start|>assistant\n"
prompt_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n"
prompt_image = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}<|im_end|>\n<|im_start|>assistant\n"
prompt_image_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}{text}<|im_end|>\n<|im_start|>assistant\n"
llm = LLM(**asdict(engine_args))
llm = LLM(**asdict(engine_args) | {"seed": seed})
print("Text embedding output:")
outputs = llm.embed(text_prompt, use_tqdm=False)
outputs = llm.embed(prompt_text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
print("Image embedding output:")
outputs = llm.embed(
{
"prompt": image_prompt,
"prompt": prompt_image,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
@@ -84,7 +140,162 @@ def run_qwen3_vl():
print("Image+Text embedding output:")
outputs = llm.embed(
{
"prompt": image_text_prompt,
"prompt": prompt_image_text,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
def run_siglip(seed: int):
engine_args = EngineArgs(
model="google/siglip-base-patch16-224",
runner="pooling",
limit_mm_per_prompt={"image": 1},
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
print("Text embedding output:")
outputs = llm.embed(text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
print("Image embedding output:")
prompt = "" # For image input, make sure that the prompt text is empty
outputs = llm.embed(
{
"prompt": prompt,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
def run_vlm2vec_phi3v(seed: int):
engine_args = EngineArgs(
model="TIGER-Lab/VLM2Vec-Full",
runner="pooling",
max_model_len=4096,
trust_remote_code=True,
mm_processor_kwargs={"num_crops": 4},
limit_mm_per_prompt={"image": 1},
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
image_token = "<|image_1|>"
print("Text embedding output:")
prompt_text = f"Find me an everyday image that matches the given caption: {text}"
outputs = llm.embed(prompt_text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
print("Image embedding output:")
prompt_image = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501
outputs = llm.embed(
{
"prompt": prompt_image,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
print("Image+Text embedding output:")
prompt_image_text = (
f"{image_token} Represent the given image with the following question: {text}" # noqa: E501
)
outputs = llm.embed(
{
"prompt": prompt_image_text,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
def run_vlm2vec_qwen2vl(seed: int):
# vLLM does not support LoRA adapters on multi-modal encoder,
# so we merge the weights first
from huggingface_hub.constants import HF_HUB_CACHE
from peft import PeftConfig, PeftModel
from transformers import AutoModelForImageTextToText, AutoProcessor
from vllm.entrypoints.chat_utils import load_chat_template
model_id = "TIGER-Lab/VLM2Vec-Qwen2VL-2B"
base_model = AutoModelForImageTextToText.from_pretrained(model_id)
lora_model = PeftModel.from_pretrained(
base_model,
model_id,
config=PeftConfig.from_pretrained(model_id),
)
model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
model._hf_peft_config_loaded = False # Needed to save the merged model
processor = AutoProcessor.from_pretrained(
model_id,
# `min_pixels` and `max_pixels` are deprecated for
# transformers `preprocessor_config.json`
size={"shortest_edge": 3136, "longest_edge": 12845056},
)
processor.chat_template = load_chat_template(
# The original chat template is not correct
EMBED_TEMPLATE_DIR / "vlm2vec_qwen2vl.jinja",
)
merged_path = str(
Path(HF_HUB_CACHE) / ("models--" + model_id.replace("/", "--") + "-vllm")
)
print(f"Saving merged model to {merged_path}...")
print(
"NOTE: This directory is not tracked by `huggingface_hub` "
"so you have to delete this manually if you don't want it anymore."
)
model.save_pretrained(merged_path)
processor.save_pretrained(merged_path)
print("Done!")
engine_args = EngineArgs(
model=merged_path,
runner="pooling",
max_model_len=4096,
mm_processor_kwargs={
"min_pixels": 3136,
"max_pixels": 12845056,
},
limit_mm_per_prompt={"image": 1},
)
llm = LLM(**asdict(engine_args) | {"seed": seed})
image_token = "<|image_pad|>"
print("Text embedding output:")
prompt_text = f"Find me an everyday image that matches the given caption: {text}"
outputs = llm.embed(prompt_text, use_tqdm=False)
print_embeddings(outputs[0].outputs.embedding)
print("Image embedding output:")
prompt_image = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501
outputs = llm.embed(
{
"prompt": prompt_image,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
)
print_embeddings(outputs[0].outputs.embedding)
print("Image+Text embedding output:")
prompt_image_text = (
f"{image_token} Represent the given image with the following question: {text}" # noqa: E501
)
outputs = llm.embed(
{
"prompt": prompt_image_text,
"multi_modal_data": multi_modal_data,
},
use_tqdm=False,
@@ -93,7 +304,12 @@ def run_qwen3_vl():
model_example_map = {
"clip": run_clip,
"e5_v": run_e5_v,
"qwen3_vl": run_qwen3_vl,
"siglip": run_siglip,
"vlm2vec_phi3v": run_vlm2vec_phi3v,
"vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
}
@@ -103,16 +319,23 @@ def parse_args():
)
parser.add_argument(
"--model",
"-m",
type=str,
default="vlm2vec_phi3v",
choices=model_example_map.keys(),
required=True,
help="The name of the embedding model.",
)
parser.add_argument(
"--seed",
type=int,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args()
def main(args):
model_example_map[args.model]()
model_example_map[args.model](args.seed)
if __name__ == "__main__":

View File

@@ -17,6 +17,8 @@ from openai.types.chat import ChatCompletionMessageParam
from openai.types.create_embedding_response import CreateEmbeddingResponse
from PIL import Image
from vllm.utils.print_utils import print_embeddings
# Modify OpenAI's API key and API base to use vLLM's API server.
openai_api_key = "EMPTY"
openai_api_base = "http://localhost:8000/v1"
@@ -51,11 +53,6 @@ def create_chat_embeddings(
)
def print_embeddings(embeds):
embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
def run_clip(client: OpenAI, model: str):
"""
Start the server using:
@@ -105,7 +102,7 @@ def run_dse_qwen2_vl(client: OpenAI, model: str):
--runner pooling \
--trust-remote-code \
--max-model-len 8192 \
--chat-template examples/template_dse_qwen2_vl.jinja
--chat-template examples/pooling/embed/template/dse_qwen2_vl.jinja
"""
response = create_chat_embeddings(
client,
@@ -316,7 +313,7 @@ def run_vlm2vec(client: OpenAI, model: str):
--runner pooling \
--trust-remote-code \
--max-model-len 4096 \
--chat-template examples/template_vlm2vec_phi3v.jinja
--chat-template examples/pooling/embed/template/vlm2vec_phi3v.jinja
"""
response = create_chat_embeddings(

View File

@@ -1,441 +0,0 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
This example shows how to use vLLM for running offline inference with
the correct prompt format on vision language models for multimodal pooling.
For most models, the prompt format should follow corresponding examples
on HuggingFace model repository.
"""
from argparse import Namespace
from dataclasses import asdict
from pathlib import Path
from typing import Literal, NamedTuple, TypeAlias, TypedDict, get_args
from PIL.Image import Image
from vllm import LLM, EngineArgs
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
from vllm.multimodal.utils import fetch_image
from vllm.utils.argparse_utils import FlexibleArgumentParser
ROOT_DIR = Path(__file__).parent.parent.parent
EXAMPLES_DIR = ROOT_DIR / "examples"
class TextQuery(TypedDict):
modality: Literal["text"]
text: str
class ImageQuery(TypedDict):
modality: Literal["image"]
image: Image
class TextImageQuery(TypedDict):
modality: Literal["text+image"]
text: str
image: Image
class TextImagesQuery(TypedDict):
modality: Literal["text+images"]
text: str
image: ScoreMultiModalParam
QueryModality = Literal["text", "image", "text+image", "text+images"]
Query: TypeAlias = TextQuery | ImageQuery | TextImageQuery | TextImagesQuery
class ModelRequestData(NamedTuple):
engine_args: EngineArgs
prompt: str | None = None
image: Image | None = None
query: str | None = None
documents: ScoreMultiModalParam | None = None
def run_clip(query: Query) -> ModelRequestData:
if query["modality"] == "text":
prompt = query["text"]
image = None
elif query["modality"] == "image":
prompt = "" # For image input, make sure that the prompt text is empty
image = query["image"]
else:
modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs(
model="openai/clip-vit-base-patch32",
runner="pooling",
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image=image,
)
def run_e5_v(query: Query) -> ModelRequestData:
llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501
if query["modality"] == "text":
text = query["text"]
prompt = llama3_template.format(f"{text}\nSummary above sentence in one word: ")
image = None
elif query["modality"] == "image":
prompt = llama3_template.format("<image>\nSummary above image in one word: ")
image = query["image"]
else:
modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs(
model="royokong/e5-v",
runner="pooling",
max_model_len=4096,
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image=image,
)
def run_jinavl_reranker(query: Query) -> ModelRequestData:
if query["modality"] != "text+images":
raise ValueError(f"Unsupported query modality: '{query['modality']}'")
engine_args = EngineArgs(
model="jinaai/jina-reranker-m0",
runner="pooling",
max_model_len=32768,
trust_remote_code=True,
mm_processor_kwargs={
"min_pixels": 3136,
"max_pixels": 602112,
},
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
query=query["text"],
documents=query["image"],
)
def run_qwen3_vl(query: Query) -> ModelRequestData:
image_placeholder = "<vision_start><|image_pad|><vision_end>"
if query["modality"] == "text":
prompt = query["text"]
image = None
elif query["modality"] == "image":
prompt = image_placeholder
image = query["image"]
elif query["modality"] == "text+image":
text = query["text"]
prompt = f"{image_placeholder}\n{text}"
image = query["image"]
else:
modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs(
model="Qwen/Qwen3-VL-Embedding-2B",
runner="pooling",
max_model_len=8192,
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image=image,
)
def run_siglip(query: Query) -> ModelRequestData:
if query["modality"] == "text":
prompt = query["text"]
image = None
elif query["modality"] == "image":
prompt = "" # For image input, make sure that the prompt text is empty
image = query["image"]
else:
modality = query["modality"]
raise ValueError(f"Unsupported query modality: '{modality}'")
engine_args = EngineArgs(
model="google/siglip-base-patch16-224",
runner="pooling",
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image=image,
)
def _get_vlm2vec_prompt_image(query: Query, image_token: str):
if query["modality"] == "text":
text = query["text"]
prompt = f"Find me an everyday image that matches the given caption: {text}"
image = None
elif query["modality"] == "image":
prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501
image = query["image"]
elif query["modality"] == "text+image":
text = query["text"]
prompt = f"{image_token} Represent the given image with the following question: {text}" # noqa: E501
image = query["image"]
else:
modality = query["modality"]
raise ValueError(f"Unsupported query modality: {modality!r}")
return prompt, image
def run_vlm2vec_phi3v(query: Query) -> ModelRequestData:
prompt, image = _get_vlm2vec_prompt_image(query, "<|image_1|>")
engine_args = EngineArgs(
model="TIGER-Lab/VLM2Vec-Full",
runner="pooling",
max_model_len=4096,
trust_remote_code=True,
mm_processor_kwargs={"num_crops": 4},
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image=image,
)
def run_vlm2vec_qwen2vl(query: Query) -> ModelRequestData:
# vLLM does not support LoRA adapters on multi-modal encoder,
# so we merge the weights first
from huggingface_hub.constants import HF_HUB_CACHE
from peft import PeftConfig, PeftModel
from transformers import AutoModelForImageTextToText, AutoProcessor
from vllm.entrypoints.chat_utils import load_chat_template
model_id = "TIGER-Lab/VLM2Vec-Qwen2VL-2B"
base_model = AutoModelForImageTextToText.from_pretrained(model_id)
lora_model = PeftModel.from_pretrained(
base_model,
model_id,
config=PeftConfig.from_pretrained(model_id),
)
model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
model._hf_peft_config_loaded = False # Needed to save the merged model
processor = AutoProcessor.from_pretrained(
model_id,
# `min_pixels` and `max_pixels` are deprecated for
# transformers `preprocessor_config.json`
size={"shortest_edge": 3136, "longest_edge": 12845056},
)
processor.chat_template = load_chat_template(
# The original chat template is not correct
EXAMPLES_DIR / "template_vlm2vec_qwen2vl.jinja",
)
merged_path = str(
Path(HF_HUB_CACHE) / ("models--" + model_id.replace("/", "--") + "-vllm")
)
print(f"Saving merged model to {merged_path}...")
print(
"NOTE: This directory is not tracked by `huggingface_hub` "
"so you have to delete this manually if you don't want it anymore."
)
model.save_pretrained(merged_path)
processor.save_pretrained(merged_path)
print("Done!")
prompt, image = _get_vlm2vec_prompt_image(query, "<|image_pad|>")
engine_args = EngineArgs(
model=merged_path,
runner="pooling",
max_model_len=4096,
mm_processor_kwargs={
"min_pixels": 3136,
"max_pixels": 12845056,
},
limit_mm_per_prompt={"image": 1},
)
return ModelRequestData(
engine_args=engine_args,
prompt=prompt,
image=image,
)
def get_query(modality: QueryModality):
if modality == "text":
return TextQuery(modality="text", text="A dog sitting in the grass")
if modality == "image":
return ImageQuery(
modality="image",
image=fetch_image(
"https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/eskimo.jpg" # noqa: E501
),
)
if modality == "text+image":
return TextImageQuery(
modality="text+image",
text="A cat standing in the snow.",
image=fetch_image(
"https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg" # noqa: E501
),
)
if modality == "text+images":
return TextImagesQuery(
modality="text+images",
text="slm markdown",
image={
"content": [
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
},
},
{
"type": "image_url",
"image_url": {
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
},
},
]
},
)
msg = f"Modality {modality} is not supported."
raise ValueError(msg)
def run_encode(model: str, modality: QueryModality, seed: int):
query = get_query(modality)
req_data = model_example_map[model](query)
# Disable other modalities to save memory
default_limits = {"image": 0, "video": 0, "audio": 0}
req_data.engine_args.limit_mm_per_prompt = default_limits | dict(
req_data.engine_args.limit_mm_per_prompt or {}
)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args)
mm_data = {}
if req_data.image is not None:
mm_data["image"] = req_data.image
outputs = llm.embed(
{
"prompt": req_data.prompt,
"multi_modal_data": mm_data,
}
)
print("-" * 50)
for output in outputs:
print(output.outputs.embedding)
print("-" * 50)
def run_score(model: str, modality: QueryModality, seed: int):
query = get_query(modality)
req_data = model_example_map[model](query)
engine_args = asdict(req_data.engine_args) | {"seed": seed}
llm = LLM(**engine_args)
outputs = llm.score(req_data.query, req_data.documents)
print("-" * 30)
print([output.outputs.score for output in outputs])
print("-" * 30)
model_example_map = {
"clip": run_clip,
"e5_v": run_e5_v,
"jinavl_reranker": run_jinavl_reranker,
"qwen3_vl": run_qwen3_vl,
"siglip": run_siglip,
"vlm2vec_phi3v": run_vlm2vec_phi3v,
"vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
}
def parse_args():
parser = FlexibleArgumentParser(
description="Demo on using vLLM for offline inference with "
"vision language models for multimodal pooling tasks."
)
parser.add_argument(
"--model-name",
"-m",
type=str,
default="vlm2vec_phi3v",
choices=model_example_map.keys(),
help="The name of the embedding model.",
)
parser.add_argument(
"--task",
"-t",
type=str,
default="embedding",
choices=["embedding", "scoring"],
help="The task type.",
)
parser.add_argument(
"--modality",
type=str,
default="image",
choices=get_args(QueryModality),
help="Modality of the input.",
)
parser.add_argument(
"--seed",
type=int,
default=0,
help="Set the seed when initializing `vllm.LLM`.",
)
return parser.parse_args()
def main(args: Namespace):
if args.task == "embedding":
run_encode(args.model_name, args.modality, args.seed)
elif args.task == "scoring":
run_score(args.model_name, args.modality, args.seed)
else:
raise ValueError(f"Unsupported task: {args.task}")
if __name__ == "__main__":
args = parse_args()
main(args)

View File

@@ -30,6 +30,7 @@ document = (
"as the dog offers its paw in a heartwarming display of companionship and trust."
)
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
video_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
documents = [
{
"type": "text",
@@ -43,6 +44,10 @@ documents = [
"type": "image_url",
"image_url": {"url": encode_image_url(fetch_image(image_url))},
},
{
"type": "video_url",
"video_url": {"url": video_url},
},
]
@@ -89,6 +94,15 @@ def main(args):
response = requests.post(rerank_url, json=prompt)
pprint.pprint(response.json())
print("Query: string & Document: video url")
prompt = {
"model": model,
"query": query,
"documents": {"content": [documents[3]]},
}
response = requests.post(rerank_url, json=prompt)
pprint.pprint(response.json())
print("Query: string & Document: text + image url")
prompt = {
"model": model,

View File

@@ -15,20 +15,47 @@ from pathlib import Path
from typing import NamedTuple
from vllm import LLM, EngineArgs
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
from vllm.multimodal.utils import encode_image_url, fetch_image
from vllm.utils.argparse_utils import FlexibleArgumentParser
TEMPLATE_HOME = Path(__file__).parent / "template"
query = "A woman playing with her dog on a beach at sunset."
document = (
"A woman shares a joyful moment with her golden retriever on a sun-drenched "
"beach at sunset, as the dog offers its paw in a heartwarming display of "
"companionship and trust."
)
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
video_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
documents = [
{
"type": "text",
"text": document,
},
{
"type": "image_url",
"image_url": {"url": image_url},
},
{
"type": "image_url",
"image_url": {"url": encode_image_url(fetch_image(image_url))},
},
{
"type": "video_url",
"video_url": {"url": video_url},
},
]
class RerankModelData(NamedTuple):
engine_args: EngineArgs
chat_template: str | None = None
modality: set[str] = set()
def run_jinavl_reranker(modality: str) -> RerankModelData:
assert modality == "image"
def run_jinavl_reranker() -> RerankModelData:
engine_args = EngineArgs(
model="jinaai/jina-reranker-m0",
runner="pooling",
@@ -38,19 +65,15 @@ def run_jinavl_reranker(modality: str) -> RerankModelData:
"min_pixels": 3136,
"max_pixels": 602112,
},
limit_mm_per_prompt={modality: 1},
)
return RerankModelData(
engine_args=engine_args,
)
return RerankModelData(engine_args=engine_args, modality={"image"})
def run_qwen3_vl_reranker(modality: str) -> RerankModelData:
def run_qwen3_vl_reranker() -> RerankModelData:
engine_args = EngineArgs(
model="Qwen/Qwen3-VL-Reranker-2B",
runner="pooling",
max_model_len=16384,
limit_mm_per_prompt={modality: 1},
# HuggingFace model configuration overrides required for compatibility
hf_overrides={
# Manually route to sequence classification architecture
@@ -71,10 +94,11 @@ def run_qwen3_vl_reranker(modality: str) -> RerankModelData:
return RerankModelData(
engine_args=engine_args,
chat_template=chat_template,
modality={"image", "video"},
)
model_example_map: dict[str, Callable[[str], RerankModelData]] = {
model_example_map: dict[str, Callable[[], RerankModelData]] = {
"jinavl_reranker": run_jinavl_reranker,
"qwen3_vl_reranker": run_qwen3_vl_reranker,
}
@@ -93,78 +117,67 @@ def parse_args():
choices=model_example_map.keys(),
help="The name of the reranker model.",
)
parser.add_argument(
"--modality",
type=str,
default="image",
choices=["image", "video"],
help="Modality of the multimodal input (image or video).",
)
return parser.parse_args()
def get_multi_modal_input(modality: str) -> tuple[str, ScoreMultiModalParam]:
# Sample query for testing the reranker
if modality == "image":
query = "A woman playing with her dog on a beach at sunset."
# Sample multimodal documents to be scored against the query
# Each document contains an image URL that will be fetched and processed
documents: ScoreMultiModalParam = {
"content": [
{
"type": "text",
"text": (
"A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, " # noqa: E501
"as the dog offers its paw in a heartwarming display of companionship and trust." # noqa: E501
),
},
{
"type": "image_url",
"image_url": {
"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
},
},
]
}
elif modality == "video":
query = "A girl is drawing pictures on an ipad."
# Sample video documents to be scored against the query
documents: ScoreMultiModalParam = {
"content": [
{
"type": "text",
"text": "A girl is drawing a guitar on her ipad with Apple Pencil.",
},
{
"type": "video_url",
"video_url": {
"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
},
},
]
}
else:
raise ValueError(f"Unsupported modality: {modality}")
return query, documents
def main(args: Namespace):
# Run the selected reranker model
modality = args.modality
model_request = model_example_map[args.model_name](modality)
model_request = model_example_map[args.model_name]()
engine_args = model_request.engine_args
llm = LLM(**asdict(engine_args))
query, documents = get_multi_modal_input(modality)
outputs = llm.score(query, documents, chat_template=model_request.chat_template)
print("-" * 50)
print(f"Model: {engine_args.model}")
print(f"Modality: {modality}")
print(f"Query: {query}")
print("Query: string & Document: string")
outputs = llm.score(query, document)
print("Relevance scores:", [output.outputs.score for output in outputs])
print("Query: string & Document: text")
outputs = llm.score(
query, {"content": [documents[0]]}, chat_template=model_request.chat_template
)
print("Relevance scores:", [output.outputs.score for output in outputs])
print("Query: string & Document: image url")
outputs = llm.score(
query, {"content": [documents[1]]}, chat_template=model_request.chat_template
)
print("Relevance scores:", [output.outputs.score for output in outputs])
print("Query: string & Document: image base64")
outputs = llm.score(
query, {"content": [documents[2]]}, chat_template=model_request.chat_template
)
print("Relevance scores:", [output.outputs.score for output in outputs])
if "video" in model_request.modality:
print("Query: string & Document: video url")
outputs = llm.score(
query,
{"content": [documents[3]]},
chat_template=model_request.chat_template,
)
print("Relevance scores:", [output.outputs.score for output in outputs])
print("Query: string & Document: text + image url")
outputs = llm.score(
query,
{"content": [documents[0], documents[1]]},
chat_template=model_request.chat_template,
)
print("Relevance scores:", [output.outputs.score for output in outputs])
print("Query: string & Document: list")
outputs = llm.score(
query,
[
document,
{"content": [documents[0]]},
{"content": [documents[1]]},
{"content": [documents[0], documents[1]]},
],
chat_template=model_request.chat_template,
)
print("Relevance scores:", [output.outputs.score for output in outputs])
print("-" * 50)
if __name__ == "__main__":

View File

@@ -29,6 +29,7 @@ document = (
"as the dog offers its paw in a heartwarming display of companionship and trust."
)
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
video_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
documents = [
{
"type": "text",
@@ -42,6 +43,10 @@ documents = [
"type": "image_url",
"image_url": {"url": encode_image_url(fetch_image(image_url))},
},
{
"type": "video_url",
"video_url": {"url": video_url},
},
]
@@ -92,6 +97,15 @@ def main(args):
response = requests.post(score_url, json=prompt)
pprint.pprint(response.json())
print("Query: string & Document: video url")
prompt = {
"model": model,
"queries": query,
"documents": {"content": [documents[3]]},
}
response = requests.post(score_url, json=prompt)
pprint.pprint(response.json())
print("Query: string & Document: text + image url")
prompt = {
"model": model,

View File

@@ -7,12 +7,15 @@ import pytest
import torch
from tests.models.utils import softmax
from vllm import LLM, PoolingParams
from vllm import LLM, ClassificationRequestOutput, PoolingParams, PoolingRequestOutput
from vllm.distributed import cleanup_dist_env_and_memory
from vllm.tasks import PoolingTask
MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
prompts = ["The chef prepared a delicious meal."]
prompt = "The chef prepared a delicious meal."
prompt_token_ids = [785, 29706, 10030, 264, 17923, 15145, 13]
num_labels = 2
@pytest.fixture(scope="module")
@@ -35,11 +38,48 @@ def llm():
cleanup_dist_env_and_memory()
@pytest.mark.skip_global_cleanup
def test_str_prompts(llm: LLM):
outputs = llm.classify(prompt, use_tqdm=False)
assert len(outputs) == 1
assert isinstance(outputs[0], ClassificationRequestOutput)
assert outputs[0].prompt_token_ids == prompt_token_ids
assert len(outputs[0].outputs.probs) == num_labels
@pytest.mark.skip_global_cleanup
def test_token_ids_prompts(llm: LLM):
outputs = llm.classify([prompt_token_ids], use_tqdm=False)
assert len(outputs) == 1
assert isinstance(outputs[0], ClassificationRequestOutput)
assert outputs[0].prompt_token_ids == prompt_token_ids
assert len(outputs[0].outputs.probs) == num_labels
@pytest.mark.skip_global_cleanup
def test_list_prompts(llm: LLM):
outputs = llm.classify([prompt, prompt_token_ids], use_tqdm=False)
assert len(outputs) == 2
for i in range(len(outputs)):
assert isinstance(outputs[i], ClassificationRequestOutput)
assert outputs[i].prompt_token_ids == prompt_token_ids
assert len(outputs[i].outputs.probs) == num_labels
@pytest.mark.skip_global_cleanup
def test_token_classify(llm: LLM):
outputs = llm.encode(prompt, pooling_task="token_classify", use_tqdm=False)
assert len(outputs) == 1
assert isinstance(outputs[0], PoolingRequestOutput)
assert outputs[0].prompt_token_ids == prompt_token_ids
assert outputs[0].outputs.data.shape == (len(prompt_token_ids), num_labels)
@pytest.mark.skip_global_cleanup
def test_pooling_params(llm: LLM):
def get_outputs(use_activation):
outputs = llm.classify(
prompts,
prompt,
pooling_params=PoolingParams(use_activation=use_activation),
use_tqdm=False,
)
@@ -61,11 +101,14 @@ def test_pooling_params(llm: LLM):
@pytest.mark.skip_global_cleanup
def test_token_classify(llm: LLM):
llm.encode(prompts, pooling_task="token_classify", use_tqdm=False)
def test_score_api(llm: LLM):
err_msg = "Score API is only enabled for num_labels == 1."
with pytest.raises(ValueError, match=err_msg):
llm.score("ping", "pong", use_tqdm=False)
@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"])
def test_unsupported_tasks(llm: LLM, task: PoolingTask):
err_msg = f"Unsupported task: '{task}' Supported tasks.+"
with pytest.raises(ValueError, match=err_msg):
llm.encode(prompt, pooling_task=task, use_tqdm=False)

View File

@@ -10,12 +10,12 @@ from transformers import AutoProcessor
from tests.utils import VLLM_PATH, RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.multimodal.media import MediaWithBytes
from vllm.multimodal.utils import fetch_image
from vllm.multimodal.utils import encode_image_url, fetch_image
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
MAXIMUM_IMAGES = 2
vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec_phi3v.jinja"
vlm2vec_jinja_path = VLLM_PATH / "examples/pooling/embed/template/vlm2vec_phi3v.jinja"
assert vlm2vec_jinja_path.exists()
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
@@ -26,6 +26,10 @@ TEST_IMAGE_ASSETS = [
"RGBA_comp.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png",
]
input_text = "The best thing about vLLM is that it supports many different models"
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
image_base64 = {"url": encode_image_url(fetch_image(image_url))}
@pytest.fixture(scope="module")
def server():
@@ -48,6 +52,81 @@ def server():
yield remote_server
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_chat_text_request(server: RemoteOpenAIServer, model_name: str):
messages = [
{
"role": "user",
"content": input_text,
},
]
# note: vlm2vec_phi3v.jinja
# Embedding models should only embed one message at a time.
response = requests.post(
server.url_for("v1/embeddings"),
json={"model": model_name, "messages": messages},
)
response.raise_for_status()
output = EmbeddingResponse.model_validate(response.json())
assert len(output.data) == 1
assert output.model == MODEL_NAME
assert len(output.data[0].embedding) == 3072
assert output.usage.prompt_tokens == 14
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_chat_image_url_request(server: RemoteOpenAIServer, model_name: str):
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Represent the user's input."},
{"type": "image_url", "image_url": {"url": image_url}},
],
}
]
response = requests.post(
server.url_for("v1/embeddings"),
json={"model": model_name, "messages": messages},
)
response.raise_for_status()
output = EmbeddingResponse.model_validate(response.json())
assert len(output.data) == 1
assert output.model == MODEL_NAME
assert len(output.data[0].embedding) == 3072
assert output.usage.prompt_tokens == 767
@pytest.mark.parametrize("model_name", [MODEL_NAME])
def test_chat_image_base64_request(server: RemoteOpenAIServer, model_name: str):
messages = [
{
"role": "user",
"content": [
{"type": "text", "text": "Represent the user's input."},
{"type": "image_url", "image_url": image_base64},
],
}
]
response = requests.post(
server.url_for("v1/embeddings"),
json={"model": model_name, "messages": messages},
)
response.raise_for_status()
output = EmbeddingResponse.model_validate(response.json())
assert len(output.data) == 1
assert output.model == MODEL_NAME
assert len(output.data[0].embedding) == 3072
assert output.usage.prompt_tokens == 767
def get_hf_prompt_tokens(model_name, content, image_url):
processor = AutoProcessor.from_pretrained(
model_name, trust_remote_code=True, num_crops=4

View File

@@ -428,13 +428,13 @@ def test_resolve_content_format_fallbacks(model, expected_format):
("template_chatglm.jinja", "string"),
("template_chatglm2.jinja", "string"),
("template_chatml.jinja", "string"),
("template_dse_qwen2_vl.jinja", "openai"),
("template_falcon_180b.jinja", "string"),
("template_falcon.jinja", "string"),
("template_inkbot.jinja", "string"),
("template_teleflm.jinja", "string"),
("template_vlm2vec_phi3v.jinja", "openai"),
("template_vlm2vec_qwen2vl.jinja", "openai"),
("pooling/embed/template/dse_qwen2_vl.jinja", "openai"),
("pooling/embed/template/vlm2vec_phi3v.jinja", "openai"),
("pooling/embed/template/vlm2vec_qwen2vl.jinja", "openai"),
("tool_chat_template_granite_20b_fc.jinja", "string"),
("tool_chat_template_hermes.jinja", "string"),
("tool_chat_template_internlm2_tool.jinja", "string"),

View File

@@ -40,6 +40,21 @@ class PoolingBasicRequestMixin(OpenAIBaseModel):
"if the served model does not use priority scheduling."
),
)
mm_processor_kwargs: dict[str, Any] | None = Field(
default=None,
description="Additional kwargs to pass to the HF processor.",
)
cache_salt: str | None = Field(
default=None,
description=(
"If specified, the prefix cache will be salted with the provided "
"string to prevent an attacker to guess prompts in multi-user "
"environments. The salt should be random, protected from "
"access by 3rd parties, and long enough to be "
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
"to 256 bit)."
),
)
# --8<-- [end:pooling-common-extra-params]
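Since these fields now live on the shared `PoolingBasicRequestMixin`, any pooling endpoint request can carry them. A hedged sketch of a multimodal classification request that sets both (host, model, image URL, and processor kwargs are illustrative):

```python
import requests

image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"

response = requests.post(
    "http://localhost:8000/classify",
    json={
        "model": "muziyongshixin/Qwen2.5-VL-7B-for-VideoCls",
        "messages": [
            {
                "role": "user",
                "content": [
                    {"type": "text", "text": "Please classify this image."},
                    {"type": "image_url", "image_url": {"url": image_url}},
                ],
            }
        ],
        # Extra params shared across pooling entrypoints:
        "mm_processor_kwargs": {"min_pixels": 3136, "max_pixels": 602112},
        "cache_salt": "a-long-random-unguessable-string",
    },
)
response.raise_for_status()
print(response.json())
```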

View File

@@ -2,7 +2,7 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
from typing import Any, TypeAlias
from typing import TypeAlias
from pydantic import Field
@@ -48,12 +48,6 @@ class ClassificationCompletionRequest(
class ClassificationChatRequest(
PoolingBasicRequestMixin, ChatRequestMixin, ClassifyRequestMixin
):
# --8<-- [start:chat-classification-extra-params]
mm_processor_kwargs: dict[str, Any] | None = Field(
default=None,
description=("Additional kwargs to pass to the HF processor."),
)
def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
encoder_config = model_config.encoder_config or {}

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
from typing import Any, TypeAlias
from typing import TypeAlias
from pydantic import Field
@@ -78,11 +78,6 @@ class EmbeddingCompletionRequest(
class EmbeddingChatRequest(
PoolingBasicRequestMixin, ChatRequestMixin, EmbedRequestMixin
):
mm_processor_kwargs: dict[str, Any] | None = Field(
default=None,
description=("Additional kwargs to pass to the HF processor."),
)
def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
encoder_config = model_config.encoder_config or {}

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
from typing import Any, Generic, TypeAlias, TypeVar
from typing import Generic, TypeAlias, TypeVar
from pydantic import Field
@@ -65,11 +65,6 @@ class PoolingChatRequest(
):
task: PoolingTask | None = None
mm_processor_kwargs: dict[str, Any] | None = Field(
default=None,
description=("Additional kwargs to pass to the HF processor."),
)
def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
encoder_config = model_config.encoder_config or {}

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import time
from typing import Any, TypeAlias
from typing import TypeAlias
from pydantic import BaseModel, Field
@@ -23,13 +23,6 @@ from vllm.utils import random_uuid
class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin):
# --8<-- [start:score-extra-params]
mm_processor_kwargs: dict[str, Any] | None = Field(
default=None,
description=("Additional kwargs to pass to the HF processor."),
)
# --8<-- [end:score-extra-params]
def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
encoder_config = model_config.encoder_config or {}
@@ -106,13 +99,6 @@ class RerankRequest(PoolingBasicRequestMixin, ClassifyRequestMixin):
documents: ScoreInputs
top_n: int = Field(default_factory=lambda: 0)
# --8<-- [start:rerank-extra-params]
mm_processor_kwargs: dict[str, Any] | None = Field(
default=None,
description=("Additional kwargs to pass to the HF processor."),
)
# --8<-- [end:rerank-extra-params]
def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
encoder_config = model_config.encoder_config or {}

View File

@@ -0,0 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
def print_embeddings(embeds: list[float]):
embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")