[Bugfix] Standardize getting number of image patches/tokens (#34358)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-02-13 12:47:01 +08:00
parent 6afa587d31
commit 372b2e762a
29 changed files with 319 additions and 331 deletions
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -642,13 +642,9 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
        image_height: int,
        num_frames: int = 2,
        do_resize: bool = True,
-        image_processor: Qwen2VLImageProcessorFast | Qwen3VLVideoProcessor | None,
+        image_processor: Qwen2VLImageProcessorFast | Qwen3VLVideoProcessor,
+        mm_kwargs: Mapping[str, object],
    ) -> tuple[ImageSize, int]:
-        if image_processor is None and num_frames > 1:
-            image_processor = self.get_video_processor()
-        elif image_processor is None:
-            image_processor = self.get_image_processor()
-
        is_video = isinstance(image_processor, Qwen3VLVideoProcessor)

        hf_config = self.get_hf_config()
@@ -657,6 +653,9 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
        merge_size = vision_config.spatial_merge_size
        temporal_patch_size = vision_config.temporal_patch_size

+        mm_kwargs = self.ctx.get_merged_mm_kwargs(mm_kwargs)
+        size = mm_kwargs.get("size", image_processor.size)
+
        if do_resize:
            if is_video:
                smart_resize = video_smart_resize
@@ -667,12 +666,13 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
            else:
                smart_resize = image_smart_resize
                extra_kwargs = {}
+
            resized_height, resized_width = smart_resize(
                height=image_height,
                width=image_width,
                factor=patch_size * merge_size,
-                min_pixels=image_processor.size["shortest_edge"],
-                max_pixels=image_processor.size["longest_edge"],
+                min_pixels=size["shortest_edge"],
+                max_pixels=size["longest_edge"],
                **extra_kwargs,
            )
            preprocessed_size = ImageSize(width=resized_width, height=resized_height)
@@ -720,7 +720,8 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
            image_width=target_width,
            image_height=target_height,
            num_frames=2,
-            image_processor=None,
+            image_processor=video_processor,
+            mm_kwargs={},
        )
        return num_video_soft_tokens

@@ -846,6 +847,7 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):
            image_height=target_video_height,
            num_frames=target_num_frames,
            image_processor=video_processor,
+            mm_kwargs={},
        )
        # NOTE: we need to do this check here since Qwen3-VL resizes video
        # frames depending on how many frames there are.