diff --git a/tests/lora/test_qwenvl.py b/tests/lora/test_qwenvl.py
index 741e1acee..5f8fc26c1 100644
--- a/tests/lora/test_qwenvl.py
+++ b/tests/lora/test_qwenvl.py
@@ -2,6 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from dataclasses import dataclass

+from packaging.version import Version
+from transformers import __version__ as TRANSFORMERS_VERSION
+
 import vllm
 from vllm.assets.image import ImageAsset
 from vllm.lora.request import LoRARequest
@@ -18,15 +21,25 @@ class TestConfig:
     enable_tower_connector_lora: bool = False
     max_model_len: int = 8192
     gpu_memory_utilization: float = 0.85
-    mm_processor_kwargs: dict[str, int] | None = None
+    mm_processor_kwargs: dict[str, object] | None = None
     mm_processor_cache_gb: float = 4

     def __post_init__(self):
         if self.mm_processor_kwargs is None:
-            self.mm_processor_kwargs = {
-                "min_pixels": 28 * 28,
-                "max_pixels": 1280 * 28 * 28,
-            }
+            # There is a bug in transformers v4 where size is ignored by
+            # `Qwen2VLProcessor.__call__`
+            if Version(TRANSFORMERS_VERSION) < Version("5.2.0"):
+                self.mm_processor_kwargs = {
+                    "min_pixels": 28 * 28,
+                    "max_pixels": 1280 * 28 * 28,
+                }
+            else:
+                self.mm_processor_kwargs = {
+                    "size": {
+                        "shortest_edge": 28 * 28,
+                        "longest_edge": 1280 * 28 * 28,
+                    }
+                }


 class Qwen2VLTester:
diff --git a/tests/models/multimodal/processing/test_gemma3.py b/tests/models/multimodal/processing/test_gemma3.py
index 884702cab..2b4c21369 100644
--- a/tests/models/multimodal/processing/test_gemma3.py
+++ b/tests/models/multimodal/processing/test_gemma3.py
@@ -150,8 +150,11 @@ class TestGemma3nAudioTensorLogic:


 @pytest.mark.parametrize("model_id", [GEMMA3_MODEL_ID])
+@pytest.mark.parametrize("mm_processor_kwargs", [{}])
 def test_get_image_size_with_most_features(
-    image_assets: ImageTestAssets, model_id: str
+    image_assets: ImageTestAssets,
+    model_id: str,
+    mm_processor_kwargs: dict[str, object],
 ):
     ctx = build_model_context(
         model_id,
@@ -160,15 +163,14 @@
     )

     processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
-    hf_processor_mm_kwargs: dict[str, object] = {}
-    hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
+    hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs)

     max_image_size = processor.info.get_image_size_with_most_features()
     max_tokens = processor.info.get_num_image_tokens(
         image_width=max_image_size.width,
         image_height=max_image_size.height,
         processor=hf_processor,
-        mm_kwargs=hf_processor_mm_kwargs,
+        mm_kwargs=mm_processor_kwargs,
     )

     prompt = ""
@@ -179,7 +181,7 @@
     processed_inputs = processor(
         prompt,
         mm_items=processor.info.parse_mm_data(mm_data),
-        hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+        hf_processor_mm_kwargs=mm_processor_kwargs,
     )
     mm_kwargs_data = processed_inputs["mm_kwargs"].get_data()
     num_patches_tensor = mm_kwargs_data["num_patches"]
diff --git a/tests/models/multimodal/processing/test_qwen2_vl.py b/tests/models/multimodal/processing/test_qwen2_vl.py
index fb28d0c74..ad5e82945 100644
--- a/tests/models/multimodal/processing/test_qwen2_vl.py
+++ b/tests/models/multimodal/processing/test_qwen2_vl.py
@@ -2,6 +2,8 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project

 import pytest
+from packaging.version import Version
+from transformers import __version__ as TRANSFORMERS_VERSION

 from vllm.multimodal import MULTIMODAL_REGISTRY

@@ -15,6 +17,16 @@ from ...utils import build_model_context
     [
         ({}, 1426, (5704, 1176)),
         ({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)),
+        (
+            {
+                "size": {
+                    "shortest_edge": 64**2,
+                    "longest_edge": 512**2,
+                },
+            },
+            330,
+            (1320, 1176),
+        ),
     ],
 )
 @pytest.mark.parametrize("num_imgs", [1, 2])
@@ -29,6 +41,12 @@ def test_processor_override(
     kwargs_on_init: bool,
 ):
     """Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
+    if (
+        Version(TRANSFORMERS_VERSION) < Version("5.2.0")
+        and "size" in mm_processor_kwargs
+    ):
+        pytest.skip("`size` ignored by `Qwen2VLProcessor.__call__`")
+
     ctx = build_model_context(
         model_id,
         mm_processor_kwargs=mm_processor_kwargs if kwargs_on_init else None,
@@ -60,21 +78,34 @@


 @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
-@pytest.mark.parametrize("max_pixels", [1280 * 28 * 28, 1283 * 28 * 28])
+@pytest.mark.parametrize(
+    "mm_processor_kwargs",
+    [
+        {"min_pixels": 28 * 28, "max_pixels": 1280 * 28 * 28},
+        {"min_pixels": 28 * 28, "max_pixels": 1283 * 28 * 28},
+        {"size": {"shortest_edge": 28 * 28, "longest_edge": 1280 * 28 * 28}},
+        {"size": {"shortest_edge": 28 * 28, "longest_edge": 1283 * 28 * 28}},
+    ],
+)
 def test_get_image_size_with_most_features(
     image_assets: ImageTestAssets,
     model_id: str,
-    max_pixels: int,
+    mm_processor_kwargs: dict[str, object],
 ):
+    if (
+        Version(TRANSFORMERS_VERSION) < Version("5.2.0")
+        and "size" in mm_processor_kwargs
+    ):
+        pytest.skip("`size` ignored by `Qwen2VLProcessor.__call__`")
+
     ctx = build_model_context(
         model_id,
-        mm_processor_kwargs={"max_pixels": max_pixels},
+        mm_processor_kwargs=mm_processor_kwargs,
         limit_mm_per_prompt={"image": 1},
     )
     processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
-    hf_processor_mm_kwargs: dict[str, object] = {}
-    hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
+    hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs)

     merge_size = processor.info.get_hf_config().vision_config.spatial_merge_size

     max_image_size = processor.info.get_image_size_with_most_features()
@@ -82,7 +113,7 @@
         image_width=max_image_size.width,
         image_height=max_image_size.height,
         image_processor=hf_processor.image_processor,
-        mm_kwargs=hf_processor_mm_kwargs,
+        mm_kwargs=mm_processor_kwargs,
     )

     prompt = "<|vision_start|><|image_pad|><|vision_end|>"
@@ -91,7 +122,7 @@
     processed_inputs = processor(
         prompt,
         mm_items=processor.info.parse_mm_data(mm_data),
-        hf_processor_mm_kwargs=hf_processor_mm_kwargs,
+        hf_processor_mm_kwargs=mm_processor_kwargs,
     )
     grid_thw = processed_inputs["mm_kwargs"].get_data()["image_grid_thw"].tolist()
     t, h, w = grid_thw[0]
diff --git a/vllm/model_executor/models/ernie45_vl.py b/vllm/model_executor/models/ernie45_vl.py
index 1df4adfac..edf4c2c8d 100644
--- a/vllm/model_executor/models/ernie45_vl.py
+++ b/vllm/model_executor/models/ernie45_vl.py
@@ -829,16 +829,31 @@ class Ernie4_5_VLProcessingInfo(BaseProcessingInfo):
         spatial_conv_size = hf_config.spatial_conv_size
         temporal_conv_size = hf_config.temporal_conv_size

+        if self.ctx.model_config.trust_remote_code:
+            # Defined in HF Hub repo
+            min_pixels_key = "min_pixels"
+            max_pixels_key = "max_pixels"
+        else:
+            # Defined in Transformers library (requires v5.0 or above)
+            min_pixels_key = "shortest_edge"
+            max_pixels_key = "longest_edge"
+
         mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
-        size = mm_kwargs.get("size", image_processor.size)
+        size = image_processor.size
+        if override_size := mm_kwargs.get("size"):
+            size = size | override_size
+        if (override_min_pixels := mm_kwargs.get("min_pixels")) is not None:
+            size = size | {min_pixels_key: override_min_pixels}
+        if (override_max_pixels := mm_kwargs.get("max_pixels")) is not None:
+            size = size | {max_pixels_key: override_max_pixels}

         if do_resize:
             resized_height, resized_width = smart_resize(
                 height=image_height,
                 width=image_width,
                 factor=patch_size * spatial_conv_size,
-                min_pixels=size["min_pixels"],
-                max_pixels=size["max_pixels"],
+                min_pixels=size[min_pixels_key],
+                max_pixels=size[max_pixels_key],
             )
             preprocessed_size = ImageSize(width=resized_width, height=resized_height)
         else:
diff --git a/vllm/model_executor/models/hunyuan_vision.py b/vllm/model_executor/models/hunyuan_vision.py
index 3f2d0e7dd..b6fda25dd 100644
--- a/vllm/model_executor/models/hunyuan_vision.py
+++ b/vllm/model_executor/models/hunyuan_vision.py
@@ -636,7 +636,13 @@ class HunYuanVLProcessingInfo(BaseProcessingInfo):
         spatial_merge_size = vision_config.spatial_merge_size

         mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
-        size = mm_kwargs.get("size", image_processor.size)
+        size = image_processor.size
+        if override_size := mm_kwargs.get("size"):
+            size = size | override_size
+        if (override_min_pixels := mm_kwargs.get("min_pixels")) is not None:
+            size = size | {"shortest_edge": override_min_pixels}
+        if (override_max_pixels := mm_kwargs.get("max_pixels")) is not None:
+            size = size | {"longest_edge": override_max_pixels}

         if do_resize:
             resized_height, resized_width = smart_resize(
diff --git a/vllm/model_executor/models/keye.py b/vllm/model_executor/models/keye.py
index 2cb7dc425..4c43e413f 100644
--- a/vllm/model_executor/models/keye.py
+++ b/vllm/model_executor/models/keye.py
@@ -1021,7 +1021,13 @@ class KeyeProcessingInfo(BaseProcessingInfo):
         temporal_patch_size = 1

         mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
-        size = mm_kwargs.get("size", image_processor.size)
+        size = image_processor.size
+        if override_size := mm_kwargs.get("size"):
+            size = size | override_size
+        if (override_min_pixels := mm_kwargs.get("min_pixels")) is not None:
+            size = size | {"min_pixels": override_min_pixels}
+        if (override_max_pixels := mm_kwargs.get("max_pixels")) is not None:
+            size = size | {"max_pixels": override_max_pixels}

         if do_resize:
             resized_height, resized_width = smart_resize(
diff --git a/vllm/model_executor/models/paddleocr_vl.py b/vllm/model_executor/models/paddleocr_vl.py
index 6c9304101..35132e724 100644
--- a/vllm/model_executor/models/paddleocr_vl.py
+++ b/vllm/model_executor/models/paddleocr_vl.py
@@ -155,15 +155,30 @@ class PaddleOCRVLProcessingInfo(BaseProcessingInfo):
         patch_size = vision_config.patch_size
         merge_size = vision_config.spatial_merge_size

+        if self.ctx.model_config.trust_remote_code:
+            # Defined in HF Hub repo
+            min_pixels_key = "min_pixels"
+            max_pixels_key = "max_pixels"
+        else:
+            # Defined in Transformers library (requires v5.0 or above)
+            min_pixels_key = "shortest_edge"
+            max_pixels_key = "longest_edge"
+
         mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
-        size = mm_kwargs.get("size", image_processor.size)
+        size = image_processor.size
+        if override_size := mm_kwargs.get("size"):
+            size = size | override_size
+        if (override_min_pixels := mm_kwargs.get("min_pixels")) is not None:
+            size = size | {min_pixels_key: override_min_pixels}
+        if (override_max_pixels := mm_kwargs.get("max_pixels")) is not None:
+            size = size | {max_pixels_key: override_max_pixels}

         resized_height, resized_width = smart_resize(
             height=image_height,
             width=image_width,
             factor=patch_size * merge_size,
-            min_pixels=size["min_pixels"],
-            max_pixels=size["max_pixels"],
+            min_pixels=size[min_pixels_key],
+            max_pixels=size[max_pixels_key],
         )
         preprocessed_size = ImageSize(width=resized_width, height=resized_height)

diff --git a/vllm/model_executor/models/qwen2_vl.py b/vllm/model_executor/models/qwen2_vl.py
index eed559bcb..c4c71faf3 100644
--- a/vllm/model_executor/models/qwen2_vl.py
+++ b/vllm/model_executor/models/qwen2_vl.py
@@ -843,7 +843,13 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
         temporal_patch_size = vision_config.temporal_patch_size

         mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
-        size = mm_kwargs.get("size", image_processor.size)
+        size = image_processor.size
+        if override_size := mm_kwargs.get("size"):
+            size = size | override_size
+        if (override_min_pixels := mm_kwargs.get("min_pixels")) is not None:
+            size = size | {"shortest_edge": override_min_pixels}
+        if (override_max_pixels := mm_kwargs.get("max_pixels")) is not None:
+            size = size | {"longest_edge": override_max_pixels}

         if do_resize:
             resized_height, resized_width = smart_resize(
@@ -930,7 +936,14 @@ class Qwen2VLProcessingInfo(BaseProcessingInfo):
         image_processor = self.get_image_processor()

         mm_kwargs = self.ctx.get_merged_mm_kwargs({})
-        size = mm_kwargs.get("size", image_processor.size)
+        size = image_processor.size
+        if override_size := mm_kwargs.get("size"):
+            size = size | override_size
+        if (override_min_pixels := mm_kwargs.get("min_pixels")) is not None:
+            size = size | {"shortest_edge": override_min_pixels}
+        if (override_max_pixels := mm_kwargs.get("max_pixels")) is not None:
+            size = size | {"longest_edge": override_max_pixels}
+
         max_pixels = size["longest_edge"]

         unit = patch_size * merge_size
diff --git a/vllm/model_executor/models/qwen3_vl.py b/vllm/model_executor/models/qwen3_vl.py
index 1a017e561..304553ed3 100644
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -647,7 +647,13 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
         temporal_patch_size = vision_config.temporal_patch_size

         mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
-        size = mm_kwargs.get("size", image_processor.size)
+        size = image_processor.size
+        if override_size := mm_kwargs.get("size"):
+            size = size | override_size
+        if (override_min_pixels := mm_kwargs.get("min_pixels")) is not None:
+            size = size | {"shortest_edge": override_min_pixels}
+        if (override_max_pixels := mm_kwargs.get("max_pixels")) is not None:
+            size = size | {"longest_edge": override_max_pixels}

         if do_resize:
             if is_video: