[VLM] Merged multi-modal processors for LLaVA-NeXT-Video and LLaVA-OneVision (#11717)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-04 19:40:53 +08:00
parent 300acb8347
commit eed11ebee9
31 changed files with 1104 additions and 973 deletions
--- a/vllm/model_executor/models/blip2.py
+++ b/vllm/model_executor/models/blip2.py
@@ -405,7 +405,7 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor):
        hf_config = self.ctx.get_hf_config(Blip2Config)
        return hf_config.num_query_tokens

-    def get_mm_max_tokens_per_item(self) -> Mapping[str, int]:
+    def get_mm_max_tokens_per_item(self, seq_len: int) -> Mapping[str, int]:
        return {"image": self._get_num_image_tokens()}

    def _get_hf_processor(self) -> Blip2Processor:
@@ -457,8 +457,9 @@ class Blip2MultiModalProcessor(BaseMultiModalProcessor):

        return result

-    def _get_dummy_mm_inputs(
+    def _get_dummy_processor_inputs(
        self,
+        seq_len: int,
        mm_counts: Mapping[str, int],
    ) -> ProcessorInputs:
        hf_config = self.ctx.get_hf_config(Blip2Config)