[Model][VLM] Add multi-video support for LLaVA-Onevision (#8905)

Co-authored-by: litianjian <litianjian@bytedance.com>
Co-authored-by: DarkLight1337 <tlleungac@connect.ust.hk>
Authored by litianjian on 2024-10-29 02:04:10 +08:00; committed by GitHub
commit 5f8d8075f9, parent 8b0e4f2ad7
5 changed files with 123 additions and 162 deletions

vllm/multimodal/video.py

@@ -56,15 +56,14 @@ class VideoPlugin(ImagePlugin):
     ) -> MultiModalInputs:
         model_config = ctx.model_config
 
-        # single video input as np.ndarray
-        if isinstance(data, np.ndarray):
+        if isinstance(data, np.ndarray) or is_list_of(data, np.ndarray):
             video_processor = self._get_hf_video_processor(
                 model_config,
                 mm_processor_kwargs,
             )
             if video_processor is None:
                 raise RuntimeError("No HuggingFace processor is available "
-                                   "to process the image object")
+                                   "to process the video object")
             try:
                 # NOTE: Similar to image; it may be a good idea to filter and
                 # pass mm_processor_kwargs here too, but for now we don't to
@@ -72,13 +71,10 @@ class VideoPlugin(ImagePlugin):
                 # signatures of the processor don't align
                 batch_data = video_processor(data, return_tensors="pt").data
             except Exception:
-                logger.error("Failed to process image (%s)", data)
+                logger.error("Failed to process video (%s)", data)
                 raise
 
             return MultiModalInputs(batch_data)
-        elif is_list_of(data, np.ndarray):
-            raise NotImplementedError(
-                "Multi video for a prompt is not supported yet")
 
         raise TypeError(f"Invalid video type: {type(data)}")
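
For context, here is a minimal usage sketch of what this change enables: the "video" entry of multi_modal_data can now be a list of np.ndarray clips instead of a single array. The model checkpoint, chat template, frame shapes, and the limit_mm_per_prompt setting below are illustrative assumptions, not part of this commit.

import numpy as np
from vllm import LLM, SamplingParams

# Assumed checkpoint; any LLaVA-Onevision model served by vLLM should work.
llm = LLM(
    model="llava-hf/llava-onevision-qwen2-0.5b-ov-hf",
    # Allow up to two videos per prompt (the per-modality default is one).
    limit_mm_per_prompt={"video": 2},
)

# Two dummy clips of 8 RGB frames each; shapes are illustrative.
clip_a = np.zeros((8, 384, 384, 3), dtype=np.uint8)
clip_b = np.zeros((8, 384, 384, 3), dtype=np.uint8)

outputs = llm.generate(
    {
        # Approximate chat template; one <video> placeholder per clip.
        "prompt": "<|im_start|>user <video><video>\n"
                  "What happens across these two clips?<|im_end|>"
                  "<|im_start|>assistant\n",
        # The list form is what the updated VideoPlugin now accepts.
        "multi_modal_data": {"video": [clip_a, clip_b]},
    },
    SamplingParams(max_tokens=64),
)
print(outputs[0].outputs[0].text)

Inside the plugin, the single-array and list cases now flow through the same HuggingFace video processor call, which is why the separate NotImplementedError branch for multi-video prompts could be dropped.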