[VLM] Enable tokenized inputs for merged multi-modal processor (#11900)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-10 11:24:00 +08:00
parent c3cf54dda4
commit b844b99ad3
12 changed files with 207 additions and 77 deletions
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -481,11 +481,11 @@ class Phi3VMultiModalProcessor(BaseMultiModalProcessor[Phi3VProcessingInfo]):

    def apply(
        self,
-        prompt_text: str,
+        prompt: Union[str, list[int]],
        mm_data: MultiModalDataDict,
        hf_processor_mm_kwargs: Mapping[str, object],
    ) -> MultiModalInputsV2:
-        result = super().apply(prompt_text, mm_data, hf_processor_mm_kwargs)
+        result = super().apply(prompt, mm_data, hf_processor_mm_kwargs)

        # Only <|image|> tokens should be considered as placeholders,
        # so we ignore the trailing bos_token_id