[VLM] Merged multi-modal processor for InternVL-based models (#12553)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-02-04 16:44:52 +08:00
parent 96b23621c1
commit d1ca7df84d
34 changed files with 1469 additions and 1021 deletions
--- a/vllm/model_executor/models/deepseek_vl2.py
+++ b/vllm/model_executor/models/deepseek_vl2.py
@@ -165,7 +165,11 @@ class DeepseekVL2ProcessingInfo(BaseProcessingInfo):
                                image_width=x[1], image_height=x[0]))
        return ImageSize(width=width, height=height)

-    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(
+        self,
+        seq_len: int,
+        mm_counts: Mapping[str, int],
+    ) -> Mapping[str, int]:
        max_image_size = self.get_image_size_with_most_features()
        max_image_tokens = self.get_num_image_tokens(
            image_height=max_image_size.height,