[Frontend][last/5] Make pooling entrypoints request schema consensus. (#31127)
Signed-off-by: wang.yuqi <yuqi.wang@daocloud.io>
This commit is contained in:
@@ -514,7 +514,7 @@ steps:
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
# for pooling models
|
||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||
- python3 pooling/embed/vision_embedding_offline.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
|
||||
@@ -453,7 +453,7 @@ steps:
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
# for pooling models
|
||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||
- python3 pooling/embed/vision_embedding_offline.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
|
||||
@@ -72,7 +72,7 @@ steps:
|
||||
- python3 offline_inference/vision_language_multi_image.py --seed 0
|
||||
- python3 offline_inference/encoder_decoder_multimodal.py --model-type whisper --seed 0
|
||||
# for pooling models
|
||||
- python3 pooling/pooling/vision_language_pooling.py --seed 0
|
||||
- python3 pooling/embed/vision_embedding_offline.py --seed 0
|
||||
# for features demo
|
||||
- python3 offline_inference/prefix_caching.py
|
||||
- python3 offline_inference/llm_engine_example.py
|
||||
|
||||
@@ -510,7 +510,7 @@ Our OpenAI-compatible server accepts multi-modal data via the [Chat Completions
|
||||
If no fallback is available, an error is raised and you have to provide the chat template manually via the `--chat-template` argument.
|
||||
|
||||
For certain models, we provide alternative chat templates inside [examples](../../examples).
|
||||
For example, VLM2Vec uses [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja) which is different from the default one for Phi-3-Vision.
|
||||
For example, VLM2Vec uses [examples/pooling/embed/template/vlm2vec_phi3v.jinja](../../examples/pooling/embed/template/vlm2vec_phi3v.jinja) which is different from the default one for Phi-3-Vision.
|
||||
|
||||
### Image Inputs
|
||||
|
||||
|
||||
@@ -311,7 +311,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
|
||||
vllm serve TIGER-Lab/VLM2Vec-Full --runner pooling \
|
||||
--trust-remote-code \
|
||||
--max-model-len 4096 \
|
||||
--chat-template examples/template_vlm2vec_phi3v.jinja
|
||||
--chat-template examples/pooling/embed/template/vlm2vec_phi3v.jinja
|
||||
```
|
||||
|
||||
!!! important
|
||||
@@ -319,7 +319,7 @@ and passing a list of `messages` in the request. Refer to the examples below for
|
||||
to run this model in embedding mode instead of text generation mode.
|
||||
|
||||
The custom chat template is completely different from the original one for this model,
|
||||
and can be found here: [examples/template_vlm2vec_phi3v.jinja](../../examples/template_vlm2vec_phi3v.jinja)
|
||||
and can be found here: [examples/pooling/embed/template/vlm2vec_phi3v.jinja](../../examples/pooling/embed/template/vlm2vec_phi3v.jinja)
|
||||
|
||||
Since the request schema is not defined by OpenAI client, we post a request to the server using the lower-level `requests` library:
|
||||
|
||||
@@ -359,14 +359,14 @@ and passing a list of `messages` in the request. Refer to the examples below for
|
||||
vllm serve MrLight/dse-qwen2-2b-mrl-v1 --runner pooling \
|
||||
--trust-remote-code \
|
||||
--max-model-len 8192 \
|
||||
--chat-template examples/template_dse_qwen2_vl.jinja
|
||||
--chat-template examples/pooling/embed/template/dse_qwen2_vl.jinja
|
||||
```
|
||||
|
||||
!!! important
|
||||
Like with VLM2Vec, we have to explicitly pass `--runner pooling`.
|
||||
|
||||
Additionally, `MrLight/dse-qwen2-2b-mrl-v1` requires an EOS token for embeddings, which is handled
|
||||
by a custom chat template: [examples/template_dse_qwen2_vl.jinja](../../examples/template_dse_qwen2_vl.jinja)
|
||||
by a custom chat template: [examples/pooling/embed/template/dse_qwen2_vl.jinja](../../examples/pooling/embed/template/dse_qwen2_vl.jinja)
|
||||
|
||||
!!! important
|
||||
`MrLight/dse-qwen2-2b-mrl-v1` requires a placeholder image of the minimum image size for text query embeddings. See the full code
|
||||
@@ -532,7 +532,7 @@ The following [sampling parameters](../api/README.md#inference-parameters) are s
|
||||
??? code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-sampling-params"
|
||||
--8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:transcription-sampling-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
@@ -540,7 +540,7 @@ The following extra parameters are supported:
|
||||
??? code
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:transcription-extra-params"
|
||||
--8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:transcription-extra-params"
|
||||
```
|
||||
|
||||
### Translations API
|
||||
@@ -560,13 +560,13 @@ Code example: [examples/online_serving/openai_translation_client.py](../../examp
|
||||
The following [sampling parameters](../api/README.md#inference-parameters) are supported.
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:translation-sampling-params"
|
||||
--8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:translation-sampling-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/openai/protocol.py:translation-extra-params"
|
||||
--8<-- "vllm/entrypoints/openai/speech_to_text/protocol.py:translation-extra-params"
|
||||
```
|
||||
|
||||
### Realtime API
|
||||
@@ -960,22 +960,28 @@ You can pass multi-modal inputs to scoring models by passing `content` including
|
||||
json={
|
||||
"model": "jinaai/jina-reranker-m0",
|
||||
"queries": "slm markdown",
|
||||
"documents": {
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
|
||||
},
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
|
||||
},
|
||||
},
|
||||
],
|
||||
},
|
||||
"documents": [
|
||||
{
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
|
||||
},
|
||||
}
|
||||
],
|
||||
},
|
||||
{
|
||||
"content": [
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
|
||||
},
|
||||
}
|
||||
]
|
||||
},
|
||||
],
|
||||
},
|
||||
)
|
||||
response.raise_for_status()
|
||||
@@ -1001,7 +1007,6 @@ The following Score API parameters are supported:
|
||||
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
|
||||
--8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
@@ -1009,7 +1014,6 @@ The following extra parameters are supported:
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params"
|
||||
```
|
||||
|
||||
### Re-rank API
|
||||
@@ -1092,7 +1096,6 @@ The following Re-rank API parameters are supported:
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/score/protocol.py:score-extra-params"
|
||||
```
|
||||
|
||||
The following extra parameters are supported:
|
||||
@@ -1100,7 +1103,6 @@ The following extra parameters are supported:
|
||||
```python
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:pooling-common-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/base/protocol.py:classify-extra-params"
|
||||
--8<-- "vllm/entrypoints/pooling/score/protocol.py:rerank-extra-params"
|
||||
```
|
||||
|
||||
## Ray Serve LLM
|
||||
|
||||
110
examples/pooling/classify/vision_classification_online.py
Normal file
110
examples/pooling/classify/vision_classification_online.py
Normal file
@@ -0,0 +1,110 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# ruff: noqa: E501
|
||||
"""Example Python client for multimodal classification API using vLLM API server
|
||||
NOTE:
|
||||
start a supported multimodal classification model server with `vllm serve`, e.g.
|
||||
vllm serve muziyongshixin/Qwen2.5-VL-7B-for-VideoCls \
|
||||
--runner pooling \
|
||||
--max-model-len 5000 \
|
||||
--limit-mm-per-prompt '{"video": 1}' \
|
||||
--hf-overrides '{"text_config": {"architectures": ["Qwen2_5_VLForSequenceClassification"]}}'
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import pprint
|
||||
|
||||
import requests
|
||||
|
||||
from vllm.multimodal.utils import encode_image_url, fetch_image
|
||||
|
||||
input_text = "This product was excellent and exceeded my expectations"
|
||||
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
|
||||
image_base64 = {"url": encode_image_url(fetch_image(image_url))}
|
||||
video_url = "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4"
|
||||
|
||||
|
||||
def parse_args():
|
||||
parse = argparse.ArgumentParser()
|
||||
parse.add_argument("--host", type=str, default="localhost")
|
||||
parse.add_argument("--port", type=int, default=8000)
|
||||
return parse.parse_args()
|
||||
|
||||
|
||||
def main(args):
|
||||
base_url = f"http://{args.host}:{args.port}"
|
||||
models_url = base_url + "/v1/models"
|
||||
classify_url = base_url + "/classify"
|
||||
|
||||
response = requests.get(models_url)
|
||||
model_name = response.json()["data"][0]["id"]
|
||||
|
||||
print("Text classification output:")
|
||||
messages = [
|
||||
{
|
||||
"role": "assistant",
|
||||
"content": "Please classify this text request.",
|
||||
},
|
||||
{
|
||||
"role": "user",
|
||||
"content": input_text,
|
||||
},
|
||||
]
|
||||
response = requests.post(
|
||||
classify_url,
|
||||
json={"model": model_name, "messages": messages},
|
||||
)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Image url classification output:")
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Please classify this image."},
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
],
|
||||
}
|
||||
]
|
||||
response = requests.post(
|
||||
classify_url,
|
||||
json={"model": model_name, "messages": messages},
|
||||
)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Image base64 classification output:")
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Please classify this image."},
|
||||
{"type": "image_url", "image_url": image_base64},
|
||||
],
|
||||
}
|
||||
]
|
||||
response = requests.post(
|
||||
classify_url,
|
||||
json={"model": model_name, "messages": messages},
|
||||
)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Video url classification output:")
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Please classify this video."},
|
||||
{"type": "video_url", "video_url": {"url": video_url}},
|
||||
],
|
||||
}
|
||||
]
|
||||
response = requests.post(
|
||||
classify_url,
|
||||
json={"model": model_name, "messages": messages},
|
||||
)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
args = parse_args()
|
||||
main(args)
|
||||
@@ -11,23 +11,79 @@ on HuggingFace model repository.
|
||||
|
||||
import argparse
|
||||
from dataclasses import asdict
|
||||
from pathlib import Path
|
||||
|
||||
from PIL.Image import Image
|
||||
|
||||
from vllm import LLM, EngineArgs
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.utils.print_utils import print_embeddings
|
||||
|
||||
ROOT_DIR = Path(__file__).parent.parent.parent
|
||||
EMBED_TEMPLATE_DIR = ROOT_DIR / "pooling/embed/template/"
|
||||
|
||||
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
|
||||
text = "A cat standing in the snow."
|
||||
multi_modal_data = {"image": fetch_image(image_url)}
|
||||
|
||||
|
||||
def print_embeddings(embeds: list[float]):
|
||||
embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
|
||||
print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
|
||||
def run_clip(seed: int):
|
||||
engine_args = EngineArgs(
|
||||
model="openai/clip-vit-base-patch32",
|
||||
runner="pooling",
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
|
||||
llm = LLM(**asdict(engine_args) | {"seed": seed})
|
||||
|
||||
print("Text embedding output:")
|
||||
outputs = llm.embed(text, use_tqdm=False)
|
||||
print_embeddings(outputs[0].outputs.embedding)
|
||||
|
||||
print("Image embedding output:")
|
||||
prompt = "" # For image input, make sure that the prompt text is empty
|
||||
outputs = llm.embed(
|
||||
{
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": multi_modal_data,
|
||||
},
|
||||
use_tqdm=False,
|
||||
)
|
||||
print_embeddings(outputs[0].outputs.embedding)
|
||||
|
||||
|
||||
def run_qwen3_vl():
|
||||
def run_e5_v(seed: int):
|
||||
engine_args = EngineArgs(
|
||||
model="royokong/e5-v",
|
||||
runner="pooling",
|
||||
max_model_len=4096,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
|
||||
llm = LLM(**asdict(engine_args) | {"seed": seed})
|
||||
|
||||
llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501
|
||||
|
||||
print("Text embedding output:")
|
||||
prompt_text = llama3_template.format(
|
||||
f"{text}\nSummary above sentence in one word: "
|
||||
)
|
||||
outputs = llm.embed(prompt_text, use_tqdm=False)
|
||||
print_embeddings(outputs[0].outputs.embedding)
|
||||
|
||||
print("Image embedding output:")
|
||||
prompt_image = llama3_template.format("<image>\nSummary above image in one word: ")
|
||||
outputs = llm.embed(
|
||||
{
|
||||
"prompt": prompt_image,
|
||||
"multi_modal_data": multi_modal_data,
|
||||
},
|
||||
use_tqdm=False,
|
||||
)
|
||||
print_embeddings(outputs[0].outputs.embedding)
|
||||
|
||||
|
||||
def run_qwen3_vl(seed: int):
|
||||
try:
|
||||
from qwen_vl_utils import smart_resize
|
||||
except ModuleNotFoundError:
|
||||
@@ -61,20 +117,20 @@ def run_qwen3_vl():
|
||||
)
|
||||
default_instruction = "Represent the user's input."
|
||||
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
|
||||
text_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n"
|
||||
image_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}<|im_end|>\n<|im_start|>assistant\n"
|
||||
image_text_prompt = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}{text}<|im_end|>\n<|im_start|>assistant\n"
|
||||
prompt_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{text}<|im_end|>\n<|im_start|>assistant\n"
|
||||
prompt_image = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}<|im_end|>\n<|im_start|>assistant\n"
|
||||
prompt_image_text = f"<|im_start|>system\n{default_instruction}<|im_end|>\n<|im_start|>user\n{image_placeholder}{text}<|im_end|>\n<|im_start|>assistant\n"
|
||||
|
||||
llm = LLM(**asdict(engine_args))
|
||||
llm = LLM(**asdict(engine_args) | {"seed": seed})
|
||||
|
||||
print("Text embedding output:")
|
||||
outputs = llm.embed(text_prompt, use_tqdm=False)
|
||||
outputs = llm.embed(prompt_text, use_tqdm=False)
|
||||
print_embeddings(outputs[0].outputs.embedding)
|
||||
|
||||
print("Image embedding output:")
|
||||
outputs = llm.embed(
|
||||
{
|
||||
"prompt": image_prompt,
|
||||
"prompt": prompt_image,
|
||||
"multi_modal_data": multi_modal_data,
|
||||
},
|
||||
use_tqdm=False,
|
||||
@@ -84,7 +140,162 @@ def run_qwen3_vl():
|
||||
print("Image+Text embedding output:")
|
||||
outputs = llm.embed(
|
||||
{
|
||||
"prompt": image_text_prompt,
|
||||
"prompt": prompt_image_text,
|
||||
"multi_modal_data": multi_modal_data,
|
||||
},
|
||||
use_tqdm=False,
|
||||
)
|
||||
print_embeddings(outputs[0].outputs.embedding)
|
||||
|
||||
|
||||
def run_siglip(seed: int):
|
||||
engine_args = EngineArgs(
|
||||
model="google/siglip-base-patch16-224",
|
||||
runner="pooling",
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
|
||||
llm = LLM(**asdict(engine_args) | {"seed": seed})
|
||||
|
||||
print("Text embedding output:")
|
||||
outputs = llm.embed(text, use_tqdm=False)
|
||||
print_embeddings(outputs[0].outputs.embedding)
|
||||
|
||||
print("Image embedding output:")
|
||||
prompt = "" # For image input, make sure that the prompt text is empty
|
||||
outputs = llm.embed(
|
||||
{
|
||||
"prompt": prompt,
|
||||
"multi_modal_data": multi_modal_data,
|
||||
},
|
||||
use_tqdm=False,
|
||||
)
|
||||
print_embeddings(outputs[0].outputs.embedding)
|
||||
|
||||
|
||||
def run_vlm2vec_phi3v(seed: int):
|
||||
engine_args = EngineArgs(
|
||||
model="TIGER-Lab/VLM2Vec-Full",
|
||||
runner="pooling",
|
||||
max_model_len=4096,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs={"num_crops": 4},
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
|
||||
llm = LLM(**asdict(engine_args) | {"seed": seed})
|
||||
image_token = "<|image_1|>"
|
||||
|
||||
print("Text embedding output:")
|
||||
prompt_text = f"Find me an everyday image that matches the given caption: {text}"
|
||||
outputs = llm.embed(prompt_text, use_tqdm=False)
|
||||
print_embeddings(outputs[0].outputs.embedding)
|
||||
|
||||
print("Image embedding output:")
|
||||
prompt_image = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501
|
||||
outputs = llm.embed(
|
||||
{
|
||||
"prompt": prompt_image,
|
||||
"multi_modal_data": multi_modal_data,
|
||||
},
|
||||
use_tqdm=False,
|
||||
)
|
||||
print_embeddings(outputs[0].outputs.embedding)
|
||||
|
||||
print("Image+Text embedding output:")
|
||||
prompt_image_text = (
|
||||
f"{image_token} Represent the given image with the following question: {text}" # noqa: E501
|
||||
)
|
||||
outputs = llm.embed(
|
||||
{
|
||||
"prompt": prompt_image_text,
|
||||
"multi_modal_data": multi_modal_data,
|
||||
},
|
||||
use_tqdm=False,
|
||||
)
|
||||
print_embeddings(outputs[0].outputs.embedding)
|
||||
|
||||
|
||||
def run_vlm2vec_qwen2vl(seed: int):
|
||||
# vLLM does not support LoRA adapters on multi-modal encoder,
|
||||
# so we merge the weights first
|
||||
from huggingface_hub.constants import HF_HUB_CACHE
|
||||
from peft import PeftConfig, PeftModel
|
||||
from transformers import AutoModelForImageTextToText, AutoProcessor
|
||||
|
||||
from vllm.entrypoints.chat_utils import load_chat_template
|
||||
|
||||
model_id = "TIGER-Lab/VLM2Vec-Qwen2VL-2B"
|
||||
|
||||
base_model = AutoModelForImageTextToText.from_pretrained(model_id)
|
||||
lora_model = PeftModel.from_pretrained(
|
||||
base_model,
|
||||
model_id,
|
||||
config=PeftConfig.from_pretrained(model_id),
|
||||
)
|
||||
model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
|
||||
model._hf_peft_config_loaded = False # Needed to save the merged model
|
||||
|
||||
processor = AutoProcessor.from_pretrained(
|
||||
model_id,
|
||||
# `min_pixels` and `max_pixels` are deprecated for
|
||||
# transformers `preprocessor_config.json`
|
||||
size={"shortest_edge": 3136, "longest_edge": 12845056},
|
||||
)
|
||||
processor.chat_template = load_chat_template(
|
||||
# The original chat template is not correct
|
||||
EMBED_TEMPLATE_DIR / "vlm2vec_qwen2vl.jinja",
|
||||
)
|
||||
|
||||
merged_path = str(
|
||||
Path(HF_HUB_CACHE) / ("models--" + model_id.replace("/", "--") + "-vllm")
|
||||
)
|
||||
print(f"Saving merged model to {merged_path}...")
|
||||
print(
|
||||
"NOTE: This directory is not tracked by `huggingface_hub` "
|
||||
"so you have to delete this manually if you don't want it anymore."
|
||||
)
|
||||
model.save_pretrained(merged_path)
|
||||
processor.save_pretrained(merged_path)
|
||||
print("Done!")
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model=merged_path,
|
||||
runner="pooling",
|
||||
max_model_len=4096,
|
||||
mm_processor_kwargs={
|
||||
"min_pixels": 3136,
|
||||
"max_pixels": 12845056,
|
||||
},
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
|
||||
llm = LLM(**asdict(engine_args) | {"seed": seed})
|
||||
image_token = "<|image_pad|>"
|
||||
|
||||
print("Text embedding output:")
|
||||
prompt_text = f"Find me an everyday image that matches the given caption: {text}"
|
||||
outputs = llm.embed(prompt_text, use_tqdm=False)
|
||||
print_embeddings(outputs[0].outputs.embedding)
|
||||
|
||||
print("Image embedding output:")
|
||||
prompt_image = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501
|
||||
outputs = llm.embed(
|
||||
{
|
||||
"prompt": prompt_image,
|
||||
"multi_modal_data": multi_modal_data,
|
||||
},
|
||||
use_tqdm=False,
|
||||
)
|
||||
print_embeddings(outputs[0].outputs.embedding)
|
||||
|
||||
print("Image+Text embedding output:")
|
||||
prompt_image_text = (
|
||||
f"{image_token} Represent the given image with the following question: {text}" # noqa: E501
|
||||
)
|
||||
outputs = llm.embed(
|
||||
{
|
||||
"prompt": prompt_image_text,
|
||||
"multi_modal_data": multi_modal_data,
|
||||
},
|
||||
use_tqdm=False,
|
||||
@@ -93,7 +304,12 @@ def run_qwen3_vl():
|
||||
|
||||
|
||||
model_example_map = {
|
||||
"clip": run_clip,
|
||||
"e5_v": run_e5_v,
|
||||
"qwen3_vl": run_qwen3_vl,
|
||||
"siglip": run_siglip,
|
||||
"vlm2vec_phi3v": run_vlm2vec_phi3v,
|
||||
"vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
|
||||
}
|
||||
|
||||
|
||||
@@ -103,16 +319,23 @@ def parse_args():
|
||||
)
|
||||
parser.add_argument(
|
||||
"--model",
|
||||
"-m",
|
||||
type=str,
|
||||
default="vlm2vec_phi3v",
|
||||
choices=model_example_map.keys(),
|
||||
required=True,
|
||||
help="The name of the embedding model.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--seed",
|
||||
type=int,
|
||||
default=0,
|
||||
help="Set the seed when initializing `vllm.LLM`.",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def main(args):
|
||||
model_example_map[args.model]()
|
||||
model_example_map[args.model](args.seed)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -17,6 +17,8 @@ from openai.types.chat import ChatCompletionMessageParam
|
||||
from openai.types.create_embedding_response import CreateEmbeddingResponse
|
||||
from PIL import Image
|
||||
|
||||
from vllm.utils.print_utils import print_embeddings
|
||||
|
||||
# Modify OpenAI's API key and API base to use vLLM's API server.
|
||||
openai_api_key = "EMPTY"
|
||||
openai_api_base = "http://localhost:8000/v1"
|
||||
@@ -51,11 +53,6 @@ def create_chat_embeddings(
|
||||
)
|
||||
|
||||
|
||||
def print_embeddings(embeds):
|
||||
embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
|
||||
print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
|
||||
|
||||
|
||||
def run_clip(client: OpenAI, model: str):
|
||||
"""
|
||||
Start the server using:
|
||||
@@ -105,7 +102,7 @@ def run_dse_qwen2_vl(client: OpenAI, model: str):
|
||||
--runner pooling \
|
||||
--trust-remote-code \
|
||||
--max-model-len 8192 \
|
||||
--chat-template examples/template_dse_qwen2_vl.jinja
|
||||
--chat-template examples/pooling/embed/template/dse_qwen2_vl.jinja
|
||||
"""
|
||||
response = create_chat_embeddings(
|
||||
client,
|
||||
@@ -316,7 +313,7 @@ def run_vlm2vec(client: OpenAI, model: str):
|
||||
--runner pooling \
|
||||
--trust-remote-code \
|
||||
--max-model-len 4096 \
|
||||
--chat-template examples/template_vlm2vec_phi3v.jinja
|
||||
--chat-template examples/pooling/embed/template/vlm2vec_phi3v.jinja
|
||||
"""
|
||||
|
||||
response = create_chat_embeddings(
|
||||
|
||||
@@ -1,441 +0,0 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""
|
||||
This example shows how to use vLLM for running offline inference with
|
||||
the correct prompt format on vision language models for multimodal pooling.
|
||||
|
||||
For most models, the prompt format should follow corresponding examples
|
||||
on HuggingFace model repository.
|
||||
"""
|
||||
|
||||
from argparse import Namespace
|
||||
from dataclasses import asdict
|
||||
from pathlib import Path
|
||||
from typing import Literal, NamedTuple, TypeAlias, TypedDict, get_args
|
||||
|
||||
from PIL.Image import Image
|
||||
|
||||
from vllm import LLM, EngineArgs
|
||||
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
ROOT_DIR = Path(__file__).parent.parent.parent
|
||||
EXAMPLES_DIR = ROOT_DIR / "examples"
|
||||
|
||||
|
||||
class TextQuery(TypedDict):
|
||||
modality: Literal["text"]
|
||||
text: str
|
||||
|
||||
|
||||
class ImageQuery(TypedDict):
|
||||
modality: Literal["image"]
|
||||
image: Image
|
||||
|
||||
|
||||
class TextImageQuery(TypedDict):
|
||||
modality: Literal["text+image"]
|
||||
text: str
|
||||
image: Image
|
||||
|
||||
|
||||
class TextImagesQuery(TypedDict):
|
||||
modality: Literal["text+images"]
|
||||
text: str
|
||||
image: ScoreMultiModalParam
|
||||
|
||||
|
||||
QueryModality = Literal["text", "image", "text+image", "text+images"]
|
||||
Query: TypeAlias = TextQuery | ImageQuery | TextImageQuery | TextImagesQuery
|
||||
|
||||
|
||||
class ModelRequestData(NamedTuple):
|
||||
engine_args: EngineArgs
|
||||
prompt: str | None = None
|
||||
image: Image | None = None
|
||||
query: str | None = None
|
||||
documents: ScoreMultiModalParam | None = None
|
||||
|
||||
|
||||
def run_clip(query: Query) -> ModelRequestData:
|
||||
if query["modality"] == "text":
|
||||
prompt = query["text"]
|
||||
image = None
|
||||
elif query["modality"] == "image":
|
||||
prompt = "" # For image input, make sure that the prompt text is empty
|
||||
image = query["image"]
|
||||
else:
|
||||
modality = query["modality"]
|
||||
raise ValueError(f"Unsupported query modality: '{modality}'")
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="openai/clip-vit-base-patch32",
|
||||
runner="pooling",
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
)
|
||||
|
||||
|
||||
def run_e5_v(query: Query) -> ModelRequestData:
|
||||
llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501
|
||||
|
||||
if query["modality"] == "text":
|
||||
text = query["text"]
|
||||
prompt = llama3_template.format(f"{text}\nSummary above sentence in one word: ")
|
||||
image = None
|
||||
elif query["modality"] == "image":
|
||||
prompt = llama3_template.format("<image>\nSummary above image in one word: ")
|
||||
image = query["image"]
|
||||
else:
|
||||
modality = query["modality"]
|
||||
raise ValueError(f"Unsupported query modality: '{modality}'")
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="royokong/e5-v",
|
||||
runner="pooling",
|
||||
max_model_len=4096,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
)
|
||||
|
||||
|
||||
def run_jinavl_reranker(query: Query) -> ModelRequestData:
|
||||
if query["modality"] != "text+images":
|
||||
raise ValueError(f"Unsupported query modality: '{query['modality']}'")
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="jinaai/jina-reranker-m0",
|
||||
runner="pooling",
|
||||
max_model_len=32768,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs={
|
||||
"min_pixels": 3136,
|
||||
"max_pixels": 602112,
|
||||
},
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
query=query["text"],
|
||||
documents=query["image"],
|
||||
)
|
||||
|
||||
|
||||
def run_qwen3_vl(query: Query) -> ModelRequestData:
|
||||
image_placeholder = "<vision_start><|image_pad|><vision_end>"
|
||||
if query["modality"] == "text":
|
||||
prompt = query["text"]
|
||||
image = None
|
||||
elif query["modality"] == "image":
|
||||
prompt = image_placeholder
|
||||
image = query["image"]
|
||||
elif query["modality"] == "text+image":
|
||||
text = query["text"]
|
||||
prompt = f"{image_placeholder}\n{text}"
|
||||
image = query["image"]
|
||||
else:
|
||||
modality = query["modality"]
|
||||
raise ValueError(f"Unsupported query modality: '{modality}'")
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="Qwen/Qwen3-VL-Embedding-2B",
|
||||
runner="pooling",
|
||||
max_model_len=8192,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
)
|
||||
|
||||
|
||||
def run_siglip(query: Query) -> ModelRequestData:
|
||||
if query["modality"] == "text":
|
||||
prompt = query["text"]
|
||||
image = None
|
||||
elif query["modality"] == "image":
|
||||
prompt = "" # For image input, make sure that the prompt text is empty
|
||||
image = query["image"]
|
||||
else:
|
||||
modality = query["modality"]
|
||||
raise ValueError(f"Unsupported query modality: '{modality}'")
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="google/siglip-base-patch16-224",
|
||||
runner="pooling",
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
)
|
||||
|
||||
|
||||
def _get_vlm2vec_prompt_image(query: Query, image_token: str):
|
||||
if query["modality"] == "text":
|
||||
text = query["text"]
|
||||
prompt = f"Find me an everyday image that matches the given caption: {text}"
|
||||
image = None
|
||||
elif query["modality"] == "image":
|
||||
prompt = f"{image_token} Find a day-to-day image that looks similar to the provided image." # noqa: E501
|
||||
image = query["image"]
|
||||
elif query["modality"] == "text+image":
|
||||
text = query["text"]
|
||||
prompt = f"{image_token} Represent the given image with the following question: {text}" # noqa: E501
|
||||
image = query["image"]
|
||||
else:
|
||||
modality = query["modality"]
|
||||
raise ValueError(f"Unsupported query modality: {modality!r}")
|
||||
|
||||
return prompt, image
|
||||
|
||||
|
||||
def run_vlm2vec_phi3v(query: Query) -> ModelRequestData:
|
||||
prompt, image = _get_vlm2vec_prompt_image(query, "<|image_1|>")
|
||||
|
||||
engine_args = EngineArgs(
|
||||
model="TIGER-Lab/VLM2Vec-Full",
|
||||
runner="pooling",
|
||||
max_model_len=4096,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs={"num_crops": 4},
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
)
|
||||
|
||||
return ModelRequestData(
|
||||
engine_args=engine_args,
|
||||
prompt=prompt,
|
||||
image=image,
|
||||
)
|
||||
|
||||
|
||||
def run_vlm2vec_qwen2vl(query: Query) -> ModelRequestData:
    """Build the request data for embedding with VLM2Vec (Qwen2-VL backbone).

    Side effects: downloads the base model and its LoRA adapter, merges the
    adapter into the base weights, and saves the merged checkpoint (plus a
    fixed-up processor/chat template) under the HF hub cache directory. The
    returned ``EngineArgs`` point at that merged local copy.
    """
    # vLLM does not support LoRA adapters on multi-modal encoder,
    # so we merge the weights first
    from huggingface_hub.constants import HF_HUB_CACHE
    from peft import PeftConfig, PeftModel
    from transformers import AutoModelForImageTextToText, AutoProcessor

    from vllm.entrypoints.chat_utils import load_chat_template

    model_id = "TIGER-Lab/VLM2Vec-Qwen2VL-2B"

    # Load the base model, apply the LoRA adapter, then fold the adapter
    # weights back into the base model so vLLM can load it as-is.
    base_model = AutoModelForImageTextToText.from_pretrained(model_id)
    lora_model = PeftModel.from_pretrained(
        base_model,
        model_id,
        config=PeftConfig.from_pretrained(model_id),
    )
    model = lora_model.merge_and_unload().to(dtype=base_model.dtype)
    model._hf_peft_config_loaded = False  # Needed to save the merged model

    processor = AutoProcessor.from_pretrained(
        model_id,
        # `min_pixels` and `max_pixels` are deprecated for
        # transformers `preprocessor_config.json`
        size={"shortest_edge": 3136, "longest_edge": 12845056},
    )
    processor.chat_template = load_chat_template(
        # The original chat template is not correct
        EXAMPLES_DIR / "template_vlm2vec_qwen2vl.jinja",
    )

    # Save next to the HF hub cache, with a "-vllm" suffix so it does not
    # collide with the hub-managed snapshot of the same model.
    merged_path = str(
        Path(HF_HUB_CACHE) / ("models--" + model_id.replace("/", "--") + "-vllm")
    )
    print(f"Saving merged model to {merged_path}...")
    print(
        "NOTE: This directory is not tracked by `huggingface_hub` "
        "so you have to delete this manually if you don't want it anymore."
    )
    model.save_pretrained(merged_path)
    processor.save_pretrained(merged_path)
    print("Done!")

    prompt, image = _get_vlm2vec_prompt_image(query, "<|image_pad|>")

    engine_args = EngineArgs(
        model=merged_path,
        runner="pooling",
        max_model_len=4096,
        mm_processor_kwargs={
            "min_pixels": 3136,
            "max_pixels": 12845056,
        },
        limit_mm_per_prompt={"image": 1},
    )

    return ModelRequestData(
        engine_args=engine_args,
        prompt=prompt,
        image=image,
    )
|
||||
|
||||
|
||||
def get_query(modality: QueryModality):
    """Return a sample query matching the requested input modality."""
    if modality == "text":
        return TextQuery(modality="text", text="A dog sitting in the grass")
    elif modality == "image":
        sample_image = fetch_image(
            "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/eskimo.jpg"  # noqa: E501
        )
        return ImageQuery(modality="image", image=sample_image)
    elif modality == "text+image":
        sample_image = fetch_image(
            "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"  # noqa: E501
        )
        return TextImageQuery(
            modality="text+image",
            text="A cat standing in the snow.",
            image=sample_image,
        )
    elif modality == "text+images":
        return TextImagesQuery(
            modality="text+images",
            text="slm markdown",
            image={
                "content": [
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/handelsblatt-preview.png"
                        },
                    },
                    {
                        "type": "image_url",
                        "image_url": {
                            "url": "https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png"
                        },
                    },
                ]
            },
        )
    raise ValueError(f"Modality {modality} is not supported.")
|
||||
|
||||
|
||||
def run_encode(model: str, modality: QueryModality, seed: int):
    """Embed a sample query of the given modality with the selected model."""
    req_data = model_example_map[model](get_query(modality))

    # Disable other modalities to save memory
    explicit_limits = dict(req_data.engine_args.limit_mm_per_prompt or {})
    req_data.engine_args.limit_mm_per_prompt = {
        "image": 0,
        "video": 0,
        "audio": 0,
    } | explicit_limits

    llm = LLM(**(asdict(req_data.engine_args) | {"seed": seed}))

    mm_data = {} if req_data.image is None else {"image": req_data.image}

    outputs = llm.embed({"prompt": req_data.prompt, "multi_modal_data": mm_data})

    print("-" * 50)
    for output in outputs:
        print(output.outputs.embedding)
        print("-" * 50)
|
||||
|
||||
|
||||
def run_score(model: str, modality: QueryModality, seed: int):
    """Score the sample documents against a sample query with the selected model."""
    req_data = model_example_map[model](get_query(modality))

    llm = LLM(**(asdict(req_data.engine_args) | {"seed": seed}))

    outputs = llm.score(req_data.query, req_data.documents)

    print("-" * 30)
    print([output.outputs.score for output in outputs])
    print("-" * 30)
|
||||
|
||||
|
||||
# Maps each `--model-name` CLI choice to the function that builds its
# engine arguments and sample request data.
model_example_map = {
    "clip": run_clip,
    "e5_v": run_e5_v,
    "jinavl_reranker": run_jinavl_reranker,
    "qwen3_vl": run_qwen3_vl,
    "siglip": run_siglip,
    "vlm2vec_phi3v": run_vlm2vec_phi3v,
    "vlm2vec_qwen2vl": run_vlm2vec_qwen2vl,
}
|
||||
|
||||
|
||||
def parse_args():
    """Parse the command-line arguments for the multimodal pooling demo."""
    parser = FlexibleArgumentParser(
        description=(
            "Demo on using vLLM for offline inference with "
            "vision language models for multimodal pooling tasks."
        )
    )

    # (flags, options) specs, registered in order so the help output is stable.
    arg_specs = [
        (
            ("--model-name", "-m"),
            dict(
                type=str,
                default="vlm2vec_phi3v",
                choices=model_example_map.keys(),
                help="The name of the embedding model.",
            ),
        ),
        (
            ("--task", "-t"),
            dict(
                type=str,
                default="embedding",
                choices=["embedding", "scoring"],
                help="The task type.",
            ),
        ),
        (
            ("--modality",),
            dict(
                type=str,
                default="image",
                choices=get_args(QueryModality),
                help="Modality of the input.",
            ),
        ),
        (
            ("--seed",),
            dict(
                type=int,
                default=0,
                help="Set the seed when initializing `vllm.LLM`.",
            ),
        ),
    ]
    for flags, options in arg_specs:
        parser.add_argument(*flags, **options)

    return parser.parse_args()
|
||||
|
||||
|
||||
def main(args: Namespace):
    """Run the embedding or scoring demo selected via ``args.task``."""
    if args.task == "embedding":
        run_encode(args.model_name, args.modality, args.seed)
        return
    if args.task == "scoring":
        run_score(args.model_name, args.modality, args.seed)
        return
    raise ValueError(f"Unsupported task: {args.task}")
|
||||
|
||||
|
||||
# Script entry point: parse the CLI arguments and run the selected demo.
if __name__ == "__main__":
    args = parse_args()
    main(args)
|
||||
@@ -30,6 +30,7 @@ document = (
|
||||
"as the dog offers its paw in a heartwarming display of companionship and trust."
|
||||
)
|
||||
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
|
||||
video_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
|
||||
documents = [
|
||||
{
|
||||
"type": "text",
|
||||
@@ -43,6 +44,10 @@ documents = [
|
||||
"type": "image_url",
|
||||
"image_url": {"url": encode_image_url(fetch_image(image_url))},
|
||||
},
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {"url": video_url},
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@@ -89,6 +94,15 @@ def main(args):
|
||||
response = requests.post(rerank_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Query: string & Document: video url")
|
||||
prompt = {
|
||||
"model": model,
|
||||
"query": query,
|
||||
"documents": {"content": [documents[3]]},
|
||||
}
|
||||
response = requests.post(rerank_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Query: string & Document: text + image url")
|
||||
prompt = {
|
||||
"model": model,
|
||||
|
||||
@@ -15,20 +15,47 @@ from pathlib import Path
|
||||
from typing import NamedTuple
|
||||
|
||||
from vllm import LLM, EngineArgs
|
||||
from vllm.entrypoints.pooling.score.utils import ScoreMultiModalParam
|
||||
from vllm.multimodal.utils import encode_image_url, fetch_image
|
||||
from vllm.utils.argparse_utils import FlexibleArgumentParser
|
||||
|
||||
TEMPLATE_HOME = Path(__file__).parent / "template"
|
||||
|
||||
|
||||
query = "A woman playing with her dog on a beach at sunset."
|
||||
document = (
|
||||
"A woman shares a joyful moment with her golden retriever on a sun-drenched "
|
||||
"beach at sunset, as the dog offers its paw in a heartwarming display of "
|
||||
"companionship and trust."
|
||||
)
|
||||
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
|
||||
video_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
|
||||
documents = [
|
||||
{
|
||||
"type": "text",
|
||||
"text": document,
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": image_url},
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {"url": encode_image_url(fetch_image(image_url))},
|
||||
},
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {"url": video_url},
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
class RerankModelData(NamedTuple):
|
||||
engine_args: EngineArgs
|
||||
chat_template: str | None = None
|
||||
modality: set[str] = {}
|
||||
|
||||
|
||||
def run_jinavl_reranker(modality: str) -> RerankModelData:
|
||||
assert modality == "image"
|
||||
|
||||
def run_jinavl_reranker() -> RerankModelData:
|
||||
engine_args = EngineArgs(
|
||||
model="jinaai/jina-reranker-m0",
|
||||
runner="pooling",
|
||||
@@ -38,19 +65,15 @@ def run_jinavl_reranker(modality: str) -> RerankModelData:
|
||||
"min_pixels": 3136,
|
||||
"max_pixels": 602112,
|
||||
},
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
)
|
||||
return RerankModelData(
|
||||
engine_args=engine_args,
|
||||
)
|
||||
return RerankModelData(engine_args=engine_args, modality={"image"})
|
||||
|
||||
|
||||
def run_qwen3_vl_reranker(modality: str) -> RerankModelData:
|
||||
def run_qwen3_vl_reranker() -> RerankModelData:
|
||||
engine_args = EngineArgs(
|
||||
model="Qwen/Qwen3-VL-Reranker-2B",
|
||||
runner="pooling",
|
||||
max_model_len=16384,
|
||||
limit_mm_per_prompt={modality: 1},
|
||||
# HuggingFace model configuration overrides required for compatibility
|
||||
hf_overrides={
|
||||
# Manually route to sequence classification architecture
|
||||
@@ -71,10 +94,11 @@ def run_qwen3_vl_reranker(modality: str) -> RerankModelData:
|
||||
return RerankModelData(
|
||||
engine_args=engine_args,
|
||||
chat_template=chat_template,
|
||||
modality={"image", "video"},
|
||||
)
|
||||
|
||||
|
||||
model_example_map: dict[str, Callable[[str], RerankModelData]] = {
|
||||
model_example_map: dict[str, Callable[[], RerankModelData]] = {
|
||||
"jinavl_reranker": run_jinavl_reranker,
|
||||
"qwen3_vl_reranker": run_qwen3_vl_reranker,
|
||||
}
|
||||
@@ -93,78 +117,67 @@ def parse_args():
|
||||
choices=model_example_map.keys(),
|
||||
help="The name of the reranker model.",
|
||||
)
|
||||
parser.add_argument(
|
||||
"--modality",
|
||||
type=str,
|
||||
default="image",
|
||||
choices=["image", "video"],
|
||||
help="Modality of the multimodal input (image or video).",
|
||||
)
|
||||
return parser.parse_args()
|
||||
|
||||
|
||||
def get_multi_modal_input(modality: str) -> tuple[str, ScoreMultiModalParam]:
|
||||
# Sample query for testing the reranker
|
||||
if modality == "image":
|
||||
query = "A woman playing with her dog on a beach at sunset."
|
||||
# Sample multimodal documents to be scored against the query
|
||||
# Each document contains an image URL that will be fetched and processed
|
||||
documents: ScoreMultiModalParam = {
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": (
|
||||
"A woman shares a joyful moment with her golden retriever on a sun-drenched beach at sunset, " # noqa: E501
|
||||
"as the dog offers its paw in a heartwarming display of companionship and trust." # noqa: E501
|
||||
),
|
||||
},
|
||||
{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
|
||||
},
|
||||
},
|
||||
]
|
||||
}
|
||||
elif modality == "video":
|
||||
query = "A girl is drawing pictures on an ipad."
|
||||
# Sample video documents to be scored against the query
|
||||
documents: ScoreMultiModalParam = {
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": "A girl is drawing a guitar on her ipad with Apple Pencil.",
|
||||
},
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {
|
||||
"url": "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
|
||||
},
|
||||
},
|
||||
]
|
||||
}
|
||||
else:
|
||||
raise ValueError(f"Unsupported modality: {modality}")
|
||||
return query, documents
|
||||
|
||||
|
||||
def main(args: Namespace):
|
||||
# Run the selected reranker model
|
||||
modality = args.modality
|
||||
model_request = model_example_map[args.model_name](modality)
|
||||
model_request = model_example_map[args.model_name]()
|
||||
engine_args = model_request.engine_args
|
||||
|
||||
llm = LLM(**asdict(engine_args))
|
||||
|
||||
query, documents = get_multi_modal_input(modality)
|
||||
outputs = llm.score(query, documents, chat_template=model_request.chat_template)
|
||||
|
||||
print("-" * 50)
|
||||
print(f"Model: {engine_args.model}")
|
||||
print(f"Modality: {modality}")
|
||||
print(f"Query: {query}")
|
||||
print("Query: string & Document: string")
|
||||
outputs = llm.score(query, document)
|
||||
print("Relevance scores:", [output.outputs.score for output in outputs])
|
||||
|
||||
print("Query: string & Document: text")
|
||||
outputs = llm.score(
|
||||
query, {"content": [documents[0]]}, chat_template=model_request.chat_template
|
||||
)
|
||||
print("Relevance scores:", [output.outputs.score for output in outputs])
|
||||
|
||||
print("Query: string & Document: image url")
|
||||
outputs = llm.score(
|
||||
query, {"content": [documents[1]]}, chat_template=model_request.chat_template
|
||||
)
|
||||
print("Relevance scores:", [output.outputs.score for output in outputs])
|
||||
|
||||
print("Query: string & Document: image base64")
|
||||
outputs = llm.score(
|
||||
query, {"content": [documents[2]]}, chat_template=model_request.chat_template
|
||||
)
|
||||
print("Relevance scores:", [output.outputs.score for output in outputs])
|
||||
|
||||
if "video" in model_request.modality:
|
||||
print("Query: string & Document: video url")
|
||||
outputs = llm.score(
|
||||
query,
|
||||
{"content": [documents[3]]},
|
||||
chat_template=model_request.chat_template,
|
||||
)
|
||||
print("Relevance scores:", [output.outputs.score for output in outputs])
|
||||
|
||||
print("Query: string & Document: text + image url")
|
||||
outputs = llm.score(
|
||||
query,
|
||||
{"content": [documents[0], documents[1]]},
|
||||
chat_template=model_request.chat_template,
|
||||
)
|
||||
print("Relevance scores:", [output.outputs.score for output in outputs])
|
||||
|
||||
print("Query: string & Document: list")
|
||||
outputs = llm.score(
|
||||
query,
|
||||
[
|
||||
document,
|
||||
{"content": [documents[0]]},
|
||||
{"content": [documents[1]]},
|
||||
{"content": [documents[0], documents[1]]},
|
||||
],
|
||||
chat_template=model_request.chat_template,
|
||||
)
|
||||
print("Relevance scores:", [output.outputs.score for output in outputs])
|
||||
print("-" * 50)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
|
||||
@@ -29,6 +29,7 @@ document = (
|
||||
"as the dog offers its paw in a heartwarming display of companionship and trust."
|
||||
)
|
||||
image_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg"
|
||||
video_url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen3-Omni/demo/draw.mp4"
|
||||
documents = [
|
||||
{
|
||||
"type": "text",
|
||||
@@ -42,6 +43,10 @@ documents = [
|
||||
"type": "image_url",
|
||||
"image_url": {"url": encode_image_url(fetch_image(image_url))},
|
||||
},
|
||||
{
|
||||
"type": "video_url",
|
||||
"video_url": {"url": video_url},
|
||||
},
|
||||
]
|
||||
|
||||
|
||||
@@ -92,6 +97,15 @@ def main(args):
|
||||
response = requests.post(score_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Query: string & Document: video url")
|
||||
prompt = {
|
||||
"model": model,
|
||||
"queries": query,
|
||||
"documents": {"content": [documents[3]]},
|
||||
}
|
||||
response = requests.post(score_url, json=prompt)
|
||||
pprint.pprint(response.json())
|
||||
|
||||
print("Query: string & Document: text + image url")
|
||||
prompt = {
|
||||
"model": model,
|
||||
|
||||
@@ -7,12 +7,15 @@ import pytest
|
||||
import torch
|
||||
|
||||
from tests.models.utils import softmax
|
||||
from vllm import LLM, PoolingParams
|
||||
from vllm import LLM, ClassificationRequestOutput, PoolingParams, PoolingRequestOutput
|
||||
from vllm.distributed import cleanup_dist_env_and_memory
|
||||
from vllm.tasks import PoolingTask
|
||||
|
||||
MODEL_NAME = "jason9693/Qwen2.5-1.5B-apeach"
|
||||
|
||||
prompts = ["The chef prepared a delicious meal."]
|
||||
prompt = "The chef prepared a delicious meal."
|
||||
prompt_token_ids = [785, 29706, 10030, 264, 17923, 15145, 13]
|
||||
num_labels = 2
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
@@ -35,11 +38,48 @@ def llm():
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_str_prompts(llm: LLM):
|
||||
outputs = llm.classify(prompt, use_tqdm=False)
|
||||
assert len(outputs) == 1
|
||||
assert isinstance(outputs[0], ClassificationRequestOutput)
|
||||
assert outputs[0].prompt_token_ids == prompt_token_ids
|
||||
assert len(outputs[0].outputs.probs) == num_labels
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_token_ids_prompts(llm: LLM):
|
||||
outputs = llm.classify([prompt_token_ids], use_tqdm=False)
|
||||
assert len(outputs) == 1
|
||||
assert isinstance(outputs[0], ClassificationRequestOutput)
|
||||
assert outputs[0].prompt_token_ids == prompt_token_ids
|
||||
assert len(outputs[0].outputs.probs) == num_labels
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_list_prompts(llm: LLM):
|
||||
outputs = llm.classify([prompt, prompt_token_ids], use_tqdm=False)
|
||||
assert len(outputs) == 2
|
||||
for i in range(len(outputs)):
|
||||
assert isinstance(outputs[i], ClassificationRequestOutput)
|
||||
assert outputs[i].prompt_token_ids == prompt_token_ids
|
||||
assert len(outputs[i].outputs.probs) == num_labels
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_token_classify(llm: LLM):
|
||||
outputs = llm.encode(prompt, pooling_task="token_classify", use_tqdm=False)
|
||||
assert len(outputs) == 1
|
||||
assert isinstance(outputs[0], PoolingRequestOutput)
|
||||
assert outputs[0].prompt_token_ids == prompt_token_ids
|
||||
assert outputs[0].outputs.data.shape == (len(prompt_token_ids), num_labels)
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_pooling_params(llm: LLM):
|
||||
def get_outputs(use_activation):
|
||||
outputs = llm.classify(
|
||||
prompts,
|
||||
prompt,
|
||||
pooling_params=PoolingParams(use_activation=use_activation),
|
||||
use_tqdm=False,
|
||||
)
|
||||
@@ -61,11 +101,14 @@ def test_pooling_params(llm: LLM):
|
||||
|
||||
|
||||
@pytest.mark.skip_global_cleanup
|
||||
def test_token_classify(llm: LLM):
|
||||
llm.encode(prompts, pooling_task="token_classify", use_tqdm=False)
|
||||
|
||||
|
||||
def test_score_api(llm: LLM):
|
||||
err_msg = "Score API is only enabled for num_labels == 1."
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
llm.score("ping", "pong", use_tqdm=False)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("task", ["embed", "token_embed", "plugin"])
|
||||
def test_unsupported_tasks(llm: LLM, task: PoolingTask):
|
||||
err_msg = f"Unsupported task: '{task}' Supported tasks.+"
|
||||
with pytest.raises(ValueError, match=err_msg):
|
||||
llm.encode(prompt, pooling_task=task, use_tqdm=False)
|
||||
|
||||
@@ -10,12 +10,12 @@ from transformers import AutoProcessor
|
||||
from tests.utils import VLLM_PATH, RemoteOpenAIServer
|
||||
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
|
||||
from vllm.multimodal.media import MediaWithBytes
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.multimodal.utils import encode_image_url, fetch_image
|
||||
|
||||
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
|
||||
MAXIMUM_IMAGES = 2
|
||||
|
||||
vlm2vec_jinja_path = VLLM_PATH / "examples/template_vlm2vec_phi3v.jinja"
|
||||
vlm2vec_jinja_path = VLLM_PATH / "examples/pooling/embed/template/vlm2vec_phi3v.jinja"
|
||||
assert vlm2vec_jinja_path.exists()
|
||||
|
||||
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
|
||||
@@ -26,6 +26,10 @@ TEST_IMAGE_ASSETS = [
|
||||
"RGBA_comp.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png",
|
||||
]
|
||||
|
||||
input_text = "The best thing about vLLM is that it supports many different models"
|
||||
image_url = "https://vllm-public-assets.s3.us-west-2.amazonaws.com/multimodal_asset/cat_snow.jpg"
|
||||
image_base64 = {"url": encode_image_url(fetch_image(image_url))}
|
||||
|
||||
|
||||
@pytest.fixture(scope="module")
|
||||
def server():
|
||||
@@ -48,6 +52,81 @@ def server():
|
||||
yield remote_server
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
def test_chat_text_request(server: RemoteOpenAIServer, model_name: str):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": input_text,
|
||||
},
|
||||
]
|
||||
|
||||
# note: vlm2vec_phi3v.jinja
|
||||
# Embedding models should only embed one message at a time.
|
||||
|
||||
response = requests.post(
|
||||
server.url_for("v1/embeddings"),
|
||||
json={"model": model_name, "messages": messages},
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
output = EmbeddingResponse.model_validate(response.json())
|
||||
assert len(output.data) == 1
|
||||
assert output.model == MODEL_NAME
|
||||
assert len(output.data[0].embedding) == 3072
|
||||
assert output.usage.prompt_tokens == 14
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
def test_chat_image_url_request(server: RemoteOpenAIServer, model_name: str):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Represent the user's input."},
|
||||
{"type": "image_url", "image_url": {"url": image_url}},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
response = requests.post(
|
||||
server.url_for("v1/embeddings"),
|
||||
json={"model": model_name, "messages": messages},
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
output = EmbeddingResponse.model_validate(response.json())
|
||||
assert len(output.data) == 1
|
||||
assert output.model == MODEL_NAME
|
||||
assert len(output.data[0].embedding) == 3072
|
||||
assert output.usage.prompt_tokens == 767
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_name", [MODEL_NAME])
|
||||
def test_chat_image_base64_request(server: RemoteOpenAIServer, model_name: str):
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "text", "text": "Represent the user's input."},
|
||||
{"type": "image_url", "image_url": image_base64},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
response = requests.post(
|
||||
server.url_for("v1/embeddings"),
|
||||
json={"model": model_name, "messages": messages},
|
||||
)
|
||||
response.raise_for_status()
|
||||
|
||||
output = EmbeddingResponse.model_validate(response.json())
|
||||
assert len(output.data) == 1
|
||||
assert output.model == MODEL_NAME
|
||||
assert len(output.data[0].embedding) == 3072
|
||||
assert output.usage.prompt_tokens == 767
|
||||
|
||||
|
||||
def get_hf_prompt_tokens(model_name, content, image_url):
|
||||
processor = AutoProcessor.from_pretrained(
|
||||
model_name, trust_remote_code=True, num_crops=4
|
||||
|
||||
@@ -428,13 +428,13 @@ def test_resolve_content_format_fallbacks(model, expected_format):
|
||||
("template_chatglm.jinja", "string"),
|
||||
("template_chatglm2.jinja", "string"),
|
||||
("template_chatml.jinja", "string"),
|
||||
("template_dse_qwen2_vl.jinja", "openai"),
|
||||
("template_falcon_180b.jinja", "string"),
|
||||
("template_falcon.jinja", "string"),
|
||||
("template_inkbot.jinja", "string"),
|
||||
("template_teleflm.jinja", "string"),
|
||||
("template_vlm2vec_phi3v.jinja", "openai"),
|
||||
("template_vlm2vec_qwen2vl.jinja", "openai"),
|
||||
("pooling/embed/template/dse_qwen2_vl.jinja", "openai"),
|
||||
("pooling/embed/template/vlm2vec_phi3v.jinja", "openai"),
|
||||
("pooling/embed/template/vlm2vec_qwen2vl.jinja", "openai"),
|
||||
("tool_chat_template_granite_20b_fc.jinja", "string"),
|
||||
("tool_chat_template_hermes.jinja", "string"),
|
||||
("tool_chat_template_internlm2_tool.jinja", "string"),
|
||||
|
||||
@@ -40,6 +40,21 @@ class PoolingBasicRequestMixin(OpenAIBaseModel):
|
||||
"if the served model does not use priority scheduling."
|
||||
),
|
||||
)
|
||||
mm_processor_kwargs: dict[str, Any] | None = Field(
|
||||
default=None,
|
||||
description="Additional kwargs to pass to the HF processor.",
|
||||
)
|
||||
cache_salt: str | None = Field(
|
||||
default=None,
|
||||
description=(
|
||||
"If specified, the prefix cache will be salted with the provided "
|
||||
"string to prevent an attacker to guess prompts in multi-user "
|
||||
"environments. The salt should be random, protected from "
|
||||
"access by 3rd parties, and long enough to be "
|
||||
"unpredictable (e.g., 43 characters base64-encoded, corresponding "
|
||||
"to 256 bit)."
|
||||
),
|
||||
)
|
||||
# --8<-- [end:pooling-common-extra-params]
|
||||
|
||||
|
||||
|
||||
@@ -2,7 +2,7 @@
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import time
|
||||
from typing import Any, TypeAlias
|
||||
from typing import TypeAlias
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
@@ -48,12 +48,6 @@ class ClassificationCompletionRequest(
|
||||
class ClassificationChatRequest(
|
||||
PoolingBasicRequestMixin, ChatRequestMixin, ClassifyRequestMixin
|
||||
):
|
||||
# --8<-- [start:chat-classification-extra-params]
|
||||
mm_processor_kwargs: dict[str, Any] | None = Field(
|
||||
default=None,
|
||||
description=("Additional kwargs to pass to the HF processor."),
|
||||
)
|
||||
|
||||
def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
|
||||
encoder_config = model_config.encoder_config or {}
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import time
|
||||
from typing import Any, TypeAlias
|
||||
from typing import TypeAlias
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
@@ -78,11 +78,6 @@ class EmbeddingCompletionRequest(
|
||||
class EmbeddingChatRequest(
|
||||
PoolingBasicRequestMixin, ChatRequestMixin, EmbedRequestMixin
|
||||
):
|
||||
mm_processor_kwargs: dict[str, Any] | None = Field(
|
||||
default=None,
|
||||
description=("Additional kwargs to pass to the HF processor."),
|
||||
)
|
||||
|
||||
def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
|
||||
encoder_config = model_config.encoder_config or {}
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import time
|
||||
from typing import Any, Generic, TypeAlias, TypeVar
|
||||
from typing import Generic, TypeAlias, TypeVar
|
||||
|
||||
from pydantic import Field
|
||||
|
||||
@@ -65,11 +65,6 @@ class PoolingChatRequest(
|
||||
):
|
||||
task: PoolingTask | None = None
|
||||
|
||||
mm_processor_kwargs: dict[str, Any] | None = Field(
|
||||
default=None,
|
||||
description=("Additional kwargs to pass to the HF processor."),
|
||||
)
|
||||
|
||||
def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
|
||||
encoder_config = model_config.encoder_config or {}
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import time
|
||||
from typing import Any, TypeAlias
|
||||
from typing import TypeAlias
|
||||
|
||||
from pydantic import BaseModel, Field
|
||||
|
||||
@@ -23,13 +23,6 @@ from vllm.utils import random_uuid
|
||||
|
||||
|
||||
class ScoreRequestMixin(PoolingBasicRequestMixin, ClassifyRequestMixin):
|
||||
# --8<-- [start:score-extra-params]
|
||||
mm_processor_kwargs: dict[str, Any] | None = Field(
|
||||
default=None,
|
||||
description=("Additional kwargs to pass to the HF processor."),
|
||||
)
|
||||
# --8<-- [end:score-extra-params]
|
||||
|
||||
def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
|
||||
encoder_config = model_config.encoder_config or {}
|
||||
|
||||
@@ -106,13 +99,6 @@ class RerankRequest(PoolingBasicRequestMixin, ClassifyRequestMixin):
|
||||
documents: ScoreInputs
|
||||
top_n: int = Field(default_factory=lambda: 0)
|
||||
|
||||
# --8<-- [start:rerank-extra-params]
|
||||
mm_processor_kwargs: dict[str, Any] | None = Field(
|
||||
default=None,
|
||||
description=("Additional kwargs to pass to the HF processor."),
|
||||
)
|
||||
# --8<-- [end:rerank-extra-params]
|
||||
|
||||
def build_tok_params(self, model_config: ModelConfig) -> TokenizeParams:
|
||||
encoder_config = model_config.encoder_config or {}
|
||||
|
||||
|
||||
7
vllm/utils/print_utils.py
Normal file
7
vllm/utils/print_utils.py
Normal file
@@ -0,0 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
|
||||
def print_embeddings(embeds: list[float]):
    """Print an embedding vector, abbreviating anything longer than 4 values."""
    size = len(embeds)
    if size > 4:
        # Show only the first four entries followed by an ellipsis.
        preview = str(embeds[:4])[:-1] + ", ...]"
    else:
        preview = embeds
    print(f"Embeddings: {preview} (size={size})")
|
||||
Reference in New Issue
Block a user