[VLM] Merged multi-modal processor for InternVL-based models (#12553)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk> Signed-off-by: Isotr0py <2037008807@qq.com> Co-authored-by: Isotr0py <2037008807@qq.com>
2025-02-04 16:44:52 +08:00
parent 96b23621c1
commit d1ca7df84d
34 changed files with 1469 additions and 1021 deletions
--- a/vllm/model_executor/models/qwen.py
+++ b/vllm/model_executor/models/qwen.py
@@ -779,7 +779,11 @@ class QWenVLProcessingInfo(BaseProcessingInfo):
    def get_supported_mm_limits(self) -> Mapping[str, Optional[int]]:
        return {"image": None}

-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
        return {"image": self.get_num_image_tokens()}

    def get_num_image_tokens(self) -> int:
@@ -799,13 +803,13 @@ class QWenVLDummyInputsBuilder(BaseDummyInputsBuilder[QWenVLProcessingInfo]):

        vision_config = hf_config.visual

-        max_image_size = vision_config["image_size"]
+        target_width = target_height = vision_config["image_size"]
        num_images = mm_counts.get("image", 0)

        mm_data = {
            "image":
-            self._get_dummy_images(width=max_image_size,
-                                   height=max_image_size,
+            self._get_dummy_images(width=target_width,
+                                   height=target_height,
                                   num_images=num_images)
        }