diff --git a/examples/pooling/embed/vision_embedding_offline.py b/examples/pooling/embed/vision_embedding_offline.py
index ef272bade..cfce047dc 100644
--- a/examples/pooling/embed/vision_embedding_offline.py
+++ b/examples/pooling/embed/vision_embedding_offline.py
@@ -12,6 +12,8 @@ on HuggingFace model repository.
 import argparse
 from dataclasses import asdict
 
+from PIL.Image import Image
+
 from vllm import LLM, EngineArgs
 from vllm.multimodal.utils import fetch_image
 
@@ -20,17 +22,42 @@ text = "A cat standing in the snow."
 multi_modal_data = {"image": fetch_image(image_url)}
 
 
-def print_embeddings(embeds):
+def print_embeddings(embeds: list[float]):
     embeds_trimmed = (str(embeds[:4])[:-1] + ", ...]") if len(embeds) > 4 else embeds
     print(f"Embeddings: {embeds_trimmed} (size={len(embeds)})")
 
 
 def run_qwen3_vl():
+    try:
+        from qwen_vl_utils import smart_resize
+    except ModuleNotFoundError:
+        print(
+            "WARNING: `qwen-vl-utils` is not installed, so input images will "
+            "not be automatically resized. This can produce results that "
+            "differ from the HF repo's example. You can enable this "
+            "functionality with `pip install qwen-vl-utils`."
+        )
+        smart_resize = None
+
+    if smart_resize is not None:
+
+        def post_process_image(image: Image) -> Image:
+            width, height = image.size
+            resized_height, resized_width = smart_resize(
+                height,
+                width,
+                factor=32,
+            )
+            return image.resize((resized_width, resized_height))
+
+        multi_modal_data["image"] = post_process_image(multi_modal_data["image"])
+
     engine_args = EngineArgs(
         model="Qwen/Qwen3-VL-Embedding-2B",
         runner="pooling",
         max_model_len=8192,
         limit_mm_per_prompt={"image": 1},
+        mm_processor_kwargs={"do_resize": False} if smart_resize is not None else None,
     )
     default_instruction = "Represent the user's input."
     image_placeholder = "<|vision_start|><|image_pad|><|vision_end|>"