[Refactor] Define MM data parser in processing info instead of processor itself (#33260)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2026-01-29 13:55:17 +08:00
parent 07ea184f00
commit 51550179fc
34 changed files with 399 additions and 347 deletions
--- a/vllm/model_executor/models/qwen3_vl.py
+++ b/vllm/model_executor/models/qwen3_vl.py
@@ -81,7 +81,7 @@ from vllm.multimodal.inputs import (
    PlaceholderRange,
    VideoItem,
 )
-from vllm.multimodal.parse import ImageSize, MultiModalDataItems, MultiModalDataParser
+from vllm.multimodal.parse import ImageSize, MultiModalDataItems
 from vllm.multimodal.processing import (
    BaseDummyInputsBuilder,
    BaseMultiModalProcessor,
@@ -624,6 +624,13 @@ class Qwen3VLProcessingInfo(Qwen2VLProcessingInfo):
    def get_video_processor(self, **kwargs: object) -> Qwen3VLVideoProcessor:
        return self.get_hf_processor(**kwargs).video_processor

+    def get_data_parser(self):
+        return Qwen2VLMultiModalDataParser(
+            self.get_hf_config().vision_config.spatial_merge_size,
+            video_needs_metadata=True,
+            expected_hidden_size=self._get_expected_hidden_size(),
+        )
+
    def _get_vision_info(
        self,
        *,
@@ -901,12 +908,6 @@ class Qwen3VLDummyInputsBuilder(BaseDummyInputsBuilder[Qwen3VLProcessingInfo]):


 class Qwen3VLMultiModalProcessor(BaseMultiModalProcessor[Qwen3VLProcessingInfo]):
-    def _get_data_parser(self) -> MultiModalDataParser:
-        return Qwen2VLMultiModalDataParser(
-            self.info.get_hf_config().vision_config.spatial_merge_size,
-            video_needs_metadata=True,
-        )
-
    def _call_hf_processor(
        self,
        prompt: str,