[VLM] Merged multi-modal processor for LLaVA-NeXT (#11682)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
2025-01-03 00:39:27 +08:00
parent b6087a6bee
commit 8c38ee7007
14 changed files with 609 additions and 555 deletions
--- a/vllm/model_executor/models/utils.py
+++ b/vllm/model_executor/models/utils.py
@@ -373,7 +373,7 @@ def embed_multimodal(
    input_ids: torch.Tensor,
    multimodal_token_id: int,
    get_text_embeds: Callable[[torch.Tensor], torch.Tensor],
-    multimodal_embeds: Union[torch.Tensor, List[torch.Tensor]],
+    multimodal_embeds: NestedTensors,
 ) -> torch.Tensor:
    """
    Embed token IDs and multimodal inputs and combine their embeddings.