[V1][Molmo] Fix get_multimodal_embeddings() in molmo.py (#14161)

2025-03-04 07:43:59 -08:00
parent c8525f06fc
commit b3cf368d79
22 changed files with 249 additions and 150 deletions
--- a/vllm/model_executor/models/llava_next_video.py
+++ b/vllm/model_executor/models/llava_next_video.py
@@ -420,7 +420,9 @@ class LlavaNextVideoForConditionalGeneration(nn.Module, SupportsMultiModal,
            raise ValueError(
                f"Unsupported type of video input {type(video_pixels)}")

-    def get_multimodal_embeddings(self, **kwargs) -> Optional[NestedTensors]:
+    def get_multimodal_embeddings(
+        self, **kwargs
+    ) -> Union[list[torch.Tensor], torch.Tensor, tuple[torch.Tensor, ...]]:
        video_input = self._parse_and_validate_video_input(**kwargs)
        if video_input is None:
            return None