[Misc] Move processors to transformers_utils (#35953)

commit d62856b928 (parent bd2659a566)
Author: Cyrus Leung
Date: 2026-03-09 11:31:39 +08:00
Committed-by: GitHub
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>

13 changed files with 507 additions and 595 deletions
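Note: the consumer-visible effect of this move is a new import location; the processor logic itself is relocated, not rewritten. A minimal construction sketch, assuming the module path and the forwarded `image_size` kwarg shown in the diff below (the `AutoConfig`/`AutoTokenizer` wiring is illustrative, not part of this commit):

    from transformers import AutoConfig, AutoTokenizer

    from vllm.transformers_utils.processors.qwen_vl import QwenVLProcessor

    # Qwen-VL ships custom config/tokenizer code, hence trust_remote_code=True.
    config = AutoConfig.from_pretrained("Qwen/Qwen-VL", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained("Qwen/Qwen-VL", trust_remote_code=True)

    # get_hf_processor() below derives image_size from config.visual and forwards
    # it, so the relocated processor presumably accepts it as a keyword argument.
    processor = QwenVLProcessor(
        config=config,
        tokenizer=tokenizer,
        image_size=config.visual["image_size"],
    )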


@@ -14,11 +14,7 @@ from typing import Annotated, Literal, TypeAlias
 import regex as re
 import torch
 from torch import nn
-from torchvision import transforms
-from torchvision.transforms import InterpolationMode
-from transformers import BatchFeature, PretrainedConfig, PreTrainedTokenizer, TensorType
-from transformers.image_utils import ImageInput
-from transformers.tokenization_utils_base import TextInput
+from transformers import BatchFeature

 from vllm.config import VllmConfig
 from vllm.config.multimodal import BaseDummyOptions
@@ -48,6 +44,7 @@ from vllm.multimodal.processing import (
     PromptUpdateDetails,
 )
 from vllm.sequence import IntermediateTensors
+from vllm.transformers_utils.processors.qwen_vl import QwenVLProcessor
 from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .interfaces import (
@@ -434,96 +431,16 @@ class QwenVLModel(QWenModel):
 )


-class QwenVLProcessor:
-    """
-    This model doesn't define its own HF processor,
-    so we implement our own one here.
-
-    We call the wrapped tokenizer to automatically insert image pad tokens:
-    https://huggingface.co/Qwen/Qwen-VL/blob/main/tokenization_qwen.py#L245
-
-    The image processor is defined here:
-    https://huggingface.co/Qwen/Qwen-VL/blob/main/visual.py#L354
-    """
-
-    def __init__(
-        self,
-        config: PretrainedConfig,
-        tokenizer: PreTrainedTokenizer,
-    ) -> None:
-        super().__init__()
-
-        self.config = config
-        self.tokenizer = tokenizer
-
+class QwenVLProcessingInfo(BaseProcessingInfo):
+    def get_hf_processor(self, **kwargs: object) -> QwenVLProcessor:
+        config = self.get_hf_config()
         vision_config = config.visual
         image_size = vision_config["image_size"]

-        self.image_transform = transforms.Compose(
-            [
-                transforms.Resize(
-                    (image_size, image_size),
-                    interpolation=InterpolationMode.BICUBIC,
-                ),
-                transforms.ToTensor(),
-                transforms.Normalize(
-                    mean=(0.48145466, 0.4578275, 0.40821073),
-                    std=(0.26862954, 0.26130258, 0.27577711),
-                ),
-            ]
-        )
-
-    @property
-    def image_start_tag(self) -> str:
-        return self.tokenizer.image_start_tag  # type: ignore
-
-    @property
-    def image_end_tag(self) -> str:
-        return self.tokenizer.image_end_tag  # type: ignore
-
-    @property
-    def image_pad_tag(self) -> str:
-        return self.tokenizer.image_pad_tag  # type: ignore
-
-    def __call__(
-        self,
-        text: TextInput | list[TextInput] | None = None,
-        images: ImageInput | list[ImageInput] | None = None,
-        return_tensors: str | TensorType | None = None,
-    ) -> BatchFeature:
-        if text is None:
-            text = []
-        if not isinstance(text, list):
-            text = [text]
-        if images is None:
-            images = []
-        if not isinstance(images, list):
-            images = [images]
-
-        text_inputs = self.tokenizer(text)
-
-        if len(images) == 0:
-            image_inputs = {}
-        else:
-            pixel_values = [self.image_transform(image) for image in images]
-            image_inputs = {"pixel_values": torch.stack(pixel_values)}
-
-        return BatchFeature(
-            {
-                **text_inputs,
-                **image_inputs,
-            },
-            tensor_type=return_tensors,
-        )
-
-
-class QwenVLProcessingInfo(BaseProcessingInfo):
-    def get_hf_processor(self, **kwargs: object) -> QwenVLProcessor:
         return self.ctx.init_processor(
             QwenVLProcessor,
             config=self.get_hf_config(),
             tokenizer=self.get_tokenizer(),
-            **kwargs,
+            **{**kwargs, "image_size": image_size},
         )

     def get_supported_mm_limits(self) -> Mapping[str, int | None]:
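The removed `__call__` above still documents the calling convention, since the commit relocates it rather than rewriting it: the wrapped tokenizer inserts the image pad tokens into `input_ids`, and the torchvision transform produces a stacked `pixel_values` tensor. A rough usage sketch, reusing the `processor` built in the note near the top of this page (the prompt format and image path are illustrative):

    from PIL import Image

    prompt = "Picture 1: <img>example.jpg</img>\nWhat is shown in the image?"
    image = Image.open("example.jpg").convert("RGB")

    out = processor(text=prompt, images=image, return_tensors="pt")
    input_ids = out["input_ids"]        # pad tokens inserted between the image tags
    pixel_values = out["pixel_values"]  # shape (1, 3, image_size, image_size)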