[StepVL] support close img patch (#32923)

Signed-off-by: luotingdan <luotingdan@stepfun.com> Signed-off-by: ltd0924 <32387785+ltd0924@users.noreply.github.com> Co-authored-by: luotingdan <luotingdan@stepfun.com>
2026-01-26 12:56:39 +08:00
parent 566cdb6cfb
commit 105d104576
1 changed files with 10 additions and 5 deletions
--- a/vllm/model_executor/models/step3_vl.py
+++ b/vllm/model_executor/models/step3_vl.py
@@ -142,8 +142,11 @@ class Step3VisionProcessor:


 class ImagePatcher:
+    def __init__(self, enable_patch: bool = True) -> None:
+        self.enable_patch = enable_patch
+
    def determine_window_size(self, long: int, short: int) -> int:
-        if long <= 728:
+        if long < 728:
            return short if long / short > 1.5 else 0
        return min(short, 504) if long / short > 4 else 504

@@ -241,7 +244,7 @@ class ImagePatcher:
        window_size = self.determine_window_size(
            max(img_height, img_width), min(img_height, img_width)
        )
-        if window_size == 0:
+        if window_size == 0 or not self.enable_patch:
            return 0, 0
        else:
            img_width, img_height = self.get_image_size_for_crop(
@@ -277,7 +280,7 @@ class ImagePatcher:
            max(new_img_height, new_img_width), min(new_img_height, new_img_width)
        )

-        if window_size == 0:
+        if window_size == 0 or not self.enable_patch:
            return img, [], None
        else:
            new_img_width, new_img_height = self.get_image_size_for_crop(
@@ -327,7 +330,6 @@ class Step3VLProcessor:

        self.config = config
        self.tokenizer = tokenizer
-
        self.image_size = 728
        self.patch_size = 504
        self.image_preprocessor = Step3VisionProcessor(
@@ -340,7 +342,10 @@ class Step3VLProcessor:
        self.image_feature_placeholder = self.image_token * self.num_image_feature_size
        self.patch_feature_placeholder = self.image_token * self.num_patch_feature_size

-        self.patcher = ImagePatcher()
+        # Respect vision config switch to enable/disable patch extraction.
+        # For video understanding, it's preferable to disable patch.
+        enable_patch = getattr(self.config.vision_config, "enable_patch", True)
+        self.patcher = ImagePatcher(enable_patch=enable_patch)

    @property
    def image_token_id(self) -> int: