[Misc] Align Qwen3-VL-embedding image example outputs with HF repo example (#33419)
Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn>
This commit is contained in:
@@ -12,6 +12,8 @@ on HuggingFace model repository.
|
|||||||
import argparse
|
import argparse
|
||||||
from dataclasses import asdict
|
from dataclasses import asdict
|
||||||
|
|
||||||
|
from PIL.Image import Image
|
||||||
|
|
||||||
from vllm import LLM, EngineArgs
|
from vllm import LLM, EngineArgs
|
||||||
from vllm.multimodal.utils import fetch_image
|
from vllm.multimodal.utils import fetch_image
|
||||||
|
|
||||||
@@ -20,17 +22,42 @@ text = "A cat standing in the snow."
|
|||||||
# Fetch the example image once and wrap it in the multimodal-input dict that
# vLLM expects ({"image": PIL.Image}).
# NOTE(review): assumes `image_url` is defined earlier in the file (outside
# this chunk) — confirm against the full source.
multi_modal_data = {"image": fetch_image(image_url)}
||||||
def print_embeddings(embeds: list[float]):
    """Print an embedding vector and its size, truncated to the first four values.

    Vectors with more than four elements are rendered as
    ``[v0, v1, v2, v3, ...]``; shorter vectors are printed in full.
    """
    if len(embeds) > 4:
        # Drop the closing bracket of the 4-element repr and append an ellipsis.
        shown = str(embeds[:4])[:-1] + ", ...]"
    else:
        shown = embeds
    print(f"Embeddings: {shown} (size={len(embeds)})")
|
||||||
|
|
||||||
|
|
||||||
def run_qwen3_vl():
|
def run_qwen3_vl():
|
||||||
|
try:
|
||||||
|
from qwen_vl_utils import smart_resize
|
||||||
|
except ModuleNotFoundError:
|
||||||
|
print(
|
||||||
|
"WARNING: `qwen-vl-utils` not installed, input images will not "
|
||||||
|
"be automatically resized. This can cause different results "
|
||||||
|
"comparing with HF repo's example. "
|
||||||
|
"You can enable this functionality by `pip install qwen-vl-utils`."
|
||||||
|
)
|
||||||
|
smart_resize = None
|
||||||
|
|
||||||
|
if smart_resize is not None:
|
||||||
|
|
||||||
|
def post_process_image(image: Image) -> Image:
|
||||||
|
width, height = image.size
|
||||||
|
resized_height, resized_width = smart_resize(
|
||||||
|
height,
|
||||||
|
width,
|
||||||
|
factor=32,
|
||||||
|
)
|
||||||
|
return image.resize((resized_width, resized_height))
|
||||||
|
|
||||||
|
multi_modal_data["image"] = post_process_image(multi_modal_data["image"])
|
||||||
|
|
||||||
engine_args = EngineArgs(
|
engine_args = EngineArgs(
|
||||||
model="Qwen/Qwen3-VL-Embedding-2B",
|
model="Qwen/Qwen3-VL-Embedding-2B",
|
||||||
runner="pooling",
|
runner="pooling",
|
||||||
max_model_len=8192,
|
max_model_len=8192,
|
||||||
limit_mm_per_prompt={"image": 1},
|
limit_mm_per_prompt={"image": 1},
|
||||||
|
mm_processor_kwargs={"do_resize": False} if smart_resize is not None else None,
|
||||||
)
|
)
|
||||||
default_instruction = "Represent the user's input."
|
default_instruction = "Represent the user's input."
|
||||||
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
|
image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"
|
||||||
|
|||||||
Reference in New Issue
Block a user