[Model][VLM] Add multi-video support for LLaVA-Onevision (#8905)

Co-authored-by: litianjian <litianjian@bytedance.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Authored by litianjian on 2024-10-29 02:04:10 +08:00; committed by GitHub
commit 5f8d8075f9, parent 8b0e4f2ad7
5 changed files with 123 additions and 162 deletions

vllm/multimodal/video.py

@@ -56,15 +56,14 @@ class VideoPlugin(ImagePlugin):
     ) -> MultiModalInputs:
         model_config = ctx.model_config
 
-        # single video input as np.ndarray
-        if isinstance(data, np.ndarray):
+        if isinstance(data, np.ndarray) or is_list_of(data, np.ndarray):
             video_processor = self._get_hf_video_processor(
                 model_config,
                 mm_processor_kwargs,
             )
             if video_processor is None:
                 raise RuntimeError("No HuggingFace processor is available "
-                                   "to process the image object")
+                                   "to process the video object")
             try:
                 # NOTE: Similar to image; it may be a good idea to filter and
                 # pass mm_processor_kwargs here too, but for now we don't to
@@ -72,13 +71,10 @@ class VideoPlugin(ImagePlugin):
                 # signatures of the processor don't align
                 batch_data = video_processor(data, return_tensors="pt").data
             except Exception:
-                logger.error("Failed to process image (%s)", data)
+                logger.error("Failed to process video (%s)", data)
                 raise
 
             return MultiModalInputs(batch_data)
-        elif is_list_of(data, np.ndarray):
-            raise NotImplementedError(
-                "Multi video for a prompt is not supported yet")
 
         raise TypeError(f"Invalid video type: {type(data)}")
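
For context, here is a minimal usage sketch of what this change enables: the "video" entry of multi_modal_data can now be a list of np.ndarray clips instead of a single array. The model checkpoint, chat template, frame shapes, and the limit_mm_per_prompt setting below are illustrative assumptions, not part of this commit.

import numpy as np
from vllm import LLM, SamplingParams

# Assumed checkpoint; any LLaVA-Onevision model served by vLLM should work.
llm = LLM(
    model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
    # Allow up to two videos per prompt (the per-modality default is one).
    limit_mm_per_prompt={"video": 2},
)

# Two dummy clips of 8 RGB frames each; shapes are illustrative.
clip_a = np.zeros((8, 384, 384, 3), dtype=np.uint8)
clip_b = np.zeros((8, 384, 384, 3), dtype=np.uint8)

outputs = llm.generate(
    {
        # Approximate chat template; one <video> placeholder per clip.
        "prompt": "<|im_start|>user <video><video>\n"
                  "What happens across these two clips?<|im_end|>"
                  "<|im_start|>assistant\n",
        # The list form is what the updated VideoPlugin now accepts.
        "multi_modal_data": {"video": [clip_a, clip_b]},
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)

Inside the plugin, the single-array and list cases now flow through the same HuggingFace video processor call, which is why the separate NotImplementedError branch for multi-video prompts could be dropped.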