Files
vllm/tests/multimodal/media/test_video.py
2026-01-15 11:52:12 +00:00

238 lines
8.9 KiB
Python

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
import numpy as np
import numpy.typing as npt
import pytest
from PIL import Image
from vllm.assets.base import get_vllm_public_assets
from vllm.assets.video import video_to_ndarrays, video_to_pil_images_list
from vllm.multimodal.media import ImageMediaIO, VideoMediaIO
from vllm.multimodal.video import VIDEO_LOADER_REGISTRY, VideoLoader
from ..utils import cosine_similarity, create_video_from_image, normalize_image
# Tag every test collected from this module with the ``cpu_test`` marker.
pytestmark = pytest.mark.cpu_test

# The "assets" folder that sits next to this file's parent directory.
ASSETS_DIR = Path(__file__).parent.parent.joinpath("assets")
# Abort at import time if the assets folder is missing.
assert ASSETS_DIR.exists()
@VIDEO_LOADER_REGISTRY.register("assert_10_frames_1_fps")
class Assert10Frames1FPSVideoLoader(VideoLoader):
    """Fake loader that asserts it was called with num_frames=10 and fps=1.0.

    Registered under the ``"assert_10_frames_1_fps"`` backend name so tests can
    select it and observe which arguments reach ``load_bytes``.
    """

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        fps: float = -1.0,
        **kwargs,
    ) -> npt.NDArray:
        """Validate the sampling arguments and return ``FAKE_OUTPUT_2``.

        Args:
            data: Raw video bytes; not inspected.
            num_frames: Must equal 10.
            fps: Must equal 1.0.
            **kwargs: Accepted and ignored.

        Returns:
            The module-level ``FAKE_OUTPUT_2`` array (frames only, without a
            metadata dict).

        Raises:
            AssertionError: With message "bad num_frames" or "bad fps" when
                the corresponding argument has an unexpected value.
        """
        expected_num_frames = 10
        expected_fps = 1.0
        assert num_frames == expected_num_frames, "bad num_frames"
        assert fps == expected_fps, "bad fps"
        return FAKE_OUTPUT_2
def test_video_media_io_kwargs(monkeypatch: pytest.MonkeyPatch):
    """Exercise ``VideoMediaIO`` kwargs against the asserting fake backend.

    The ``assert_10_frames_1_fps`` backend is selected through the
    ``VLLM_VIDEO_LOADER_BACKEND`` environment variable; loading must succeed
    when ``num_frames=10`` and ``fps=1.0`` are supplied and fail with the
    loader's assertion message otherwise.
    """
    payload = b"test"
    with monkeypatch.context() as patched_env:
        patched_env.setenv("VLLM_VIDEO_LOADER_BACKEND", "assert_10_frames_1_fps")
        image_io = ImageMediaIO()

        # Matching arguments pass, with or without an extra unrelated kwarg.
        VideoMediaIO(image_io, num_frames=10, fps=1.0).load_bytes(payload)
        VideoMediaIO(
            image_io, num_frames=10, fps=1.0, not_used="not_used"
        ).load_bytes(payload)

        # Missing or wrong arguments must trip the loader's assertions.
        failing_cases = (
            ({}, "bad num_frames"),
            ({"num_frames": 9, "fps": 1.0}, "bad num_frames"),
            ({"num_frames": 10, "fps": 2.0}, "bad fps"),
        )
        for loader_kwargs, expected_message in failing_cases:
            with pytest.raises(AssertionError, match=expected_message):
                VideoMediaIO(image_io, **loader_kwargs).load_bytes(payload)
def _assert_frames_match_reference(frames, reference: npt.NDArray) -> None:
    """Assert that every frame in *frames* closely matches *reference*.

    Args:
        frames: Iterable of decoded frames; each one is converted with
            ``np.array`` and passed through ``normalize_image``.
        reference: The expected image, already passed through
            ``normalize_image``.

    Raises:
        AssertionError: If 0.1% or more of a frame's similarity values are
            NaN, or if the NaN-ignoring mean similarity is not above 0.99.
    """
    for frame in frames:
        sim = cosine_similarity(normalize_image(np.array(frame)), reference)
        assert np.sum(np.isnan(sim)) / sim.size < 0.001
        assert np.nanmean(sim) > 0.99


@pytest.mark.parametrize("is_color", [True, False])
@pytest.mark.parametrize("fourcc, ext", [("mp4v", "mp4"), ("XVID", "avi")])
def test_opencv_video_io_colorspace(tmp_path, is_color: bool, fourcc: str, ext: str):
    """
    Test all functions that use OpenCV for video I/O return RGB format.
    Both RGB and grayscale videos are tested.

    A two-frame video is written from a single still image, then read back
    through ``video_to_ndarrays``, ``video_to_pil_images_list`` and
    ``VideoMediaIO.load_file``; every decoded frame is compared against the
    source image.
    """
    image_path = get_vllm_public_assets(
        filename="stop_sign.jpg", s3_prefix="vision_model_images"
    )
    image = Image.open(image_path)
    if not is_color:
        # The video is built from a grayscale copy saved under tmp_path.
        image_path = f"{tmp_path}/test_grayscale_image.png"
        image = image.convert("L")
        image.save(image_path)
        # Convert to gray RGB for comparison
        image = image.convert("RGB")

    video_path = f"{tmp_path}/test_RGB_video.{ext}"
    create_video_from_image(
        image_path,
        video_path,
        num_frames=2,
        is_color=is_color,
        fourcc=fourcc,
    )

    # The reference is identical for every frame, so normalize it only once.
    reference = normalize_image(np.array(image))

    # Each reader is invoked and checked in turn, in the original order.
    _assert_frames_match_reference(video_to_ndarrays(video_path), reference)
    _assert_frames_match_reference(video_to_pil_images_list(video_path), reference)
    io_frames, _ = VideoMediaIO(ImageMediaIO()).load_file(Path(video_path))
    _assert_frames_match_reference(io_frames, reference)
# Length of the first axis of each fake video array.
NUM_FRAMES = 10
# Shape shared by both fake arrays; the first axis is NUM_FRAMES.
_FAKE_VIDEO_SHAPE = (NUM_FRAMES, 1280, 720, 3)
# Two separately generated random arrays, so a test can tell which fake
# loader produced a given result.
FAKE_OUTPUT_1 = np.random.rand(*_FAKE_VIDEO_SHAPE)
FAKE_OUTPUT_2 = np.random.rand(*_FAKE_VIDEO_SHAPE)
@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_1")
class TestVideoBackendOverride1(VideoLoader):
    """Test loader that returns FAKE_OUTPUT_1 to verify backend selection."""

    # The class name starts with "Test", which matches pytest's default class
    # discovery pattern. This is a fake loader, not a test case, so opt out of
    # collection explicitly.
    __test__ = False

    @classmethod
    def load_bytes(
        cls, data: bytes, num_frames: int = -1, **kwargs
    ) -> tuple[npt.NDArray, dict]:
        """Ignore the inputs and return ``FAKE_OUTPUT_1`` with identifying metadata.

        Args:
            data: Raw video bytes; not inspected.
            num_frames: Accepted and ignored.
            **kwargs: Accepted and ignored.

        Returns:
            ``(FAKE_OUTPUT_1, metadata)`` where ``metadata["video_backend"]``
            is this loader's registry name.
        """
        return FAKE_OUTPUT_1, {"video_backend": "test_video_backend_override_1"}
@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_2")
class TestVideoBackendOverride2(VideoLoader):
    """Test loader that returns FAKE_OUTPUT_2 to verify backend selection."""

    # The class name starts with "Test", which matches pytest's default class
    # discovery pattern. This is a fake loader, not a test case, so opt out of
    # collection explicitly.
    __test__ = False

    @classmethod
    def load_bytes(
        cls, data: bytes, num_frames: int = -1, **kwargs
    ) -> tuple[npt.NDArray, dict]:
        """Ignore the inputs and return ``FAKE_OUTPUT_2`` with identifying metadata.

        Args:
            data: Raw video bytes; not inspected.
            num_frames: Accepted and ignored.
            **kwargs: Accepted and ignored.

        Returns:
            ``(FAKE_OUTPUT_2, metadata)`` where ``metadata["video_backend"]``
            is this loader's registry name.
        """
        return FAKE_OUTPUT_2, {"video_backend": "test_video_backend_override_2"}
def test_video_media_io_backend_kwarg_override(monkeypatch: pytest.MonkeyPatch):
    """
    Check that the ``video_backend`` kwarg takes precedence over the
    VLLM_VIDEO_LOADER_BACKEND environment variable.

    This lets users pick a different video backend per request via
    --media-io-kwargs without touching the global env var, which is useful
    when plugins set a default backend but a specific request needs a
    different one.
    """
    env_backend = "test_video_backend_override_1"
    kwarg_backend = "test_video_backend_override_2"

    with monkeypatch.context() as patched_env:
        # The environment variable selects the first fake backend.
        patched_env.setenv("VLLM_VIDEO_LOADER_BACKEND", env_backend)
        image_io = ImageMediaIO()

        # No video_backend kwarg: the env var backend is used.
        env_frames, env_metadata = VideoMediaIO(
            image_io, num_frames=10
        ).load_bytes(b"test")
        np.testing.assert_array_equal(env_frames, FAKE_OUTPUT_1)
        assert env_metadata["video_backend"] == env_backend

        # Explicit video_backend kwarg: it wins over the env var.
        kwarg_frames, kwarg_metadata = VideoMediaIO(
            image_io, num_frames=10, video_backend=kwarg_backend
        ).load_bytes(b"test")
        np.testing.assert_array_equal(kwarg_frames, FAKE_OUTPUT_2)
        assert kwarg_metadata["video_backend"] == kwarg_backend
def test_video_media_io_backend_kwarg_not_passed_to_loader(
    monkeypatch: pytest.MonkeyPatch,
):
    """
    Check that ``video_backend`` is consumed by VideoMediaIO and is NOT
    forwarded to the underlying video loader's load_bytes method, while
    other kwargs still reach the loader.
    """

    @VIDEO_LOADER_REGISTRY.register("test_reject_video_backend_kwarg")
    class RejectVideoBackendKwargLoader(VideoLoader):
        """Fake loader that raises when it receives a ``video_backend`` kwarg."""

        @classmethod
        def load_bytes(
            cls, data: bytes, num_frames: int = -1, **kwargs
        ) -> tuple[npt.NDArray, dict]:
            # Report the kwargs that arrived so the test can inspect them.
            if "video_backend" not in kwargs:
                return FAKE_OUTPUT_1, {"received_kwargs": list(kwargs)}
            # video_backend must never show up here.
            raise AssertionError(
                "video_backend should be consumed by VideoMediaIO, "
                "not passed to loader"
            )

    backend_name = "test_reject_video_backend_kwarg"
    with monkeypatch.context() as patched_env:
        patched_env.setenv("VLLM_VIDEO_LOADER_BACKEND", backend_name)
        image_io = ImageMediaIO()

        # video_backend is supplied here but must not be handed to the loader.
        video_io = VideoMediaIO(
            image_io,
            num_frames=10,
            video_backend=backend_name,
            other_kwarg="should_pass_through",
        )

        # The loader raises AssertionError if video_backend leaks through.
        loaded_frames, loader_metadata = video_io.load_bytes(b"test")
        np.testing.assert_array_equal(loaded_frames, FAKE_OUTPUT_1)

        # Unrelated kwargs must still be forwarded.
        assert "other_kwarg" in loader_metadata["received_kwargs"]
def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch):
    """
    Check that VideoMediaIO falls back to the VLLM_VIDEO_LOADER_BACKEND
    env var when ``video_backend`` is None or not provided at all.
    """
    env_backend = "test_video_backend_override_2"

    with monkeypatch.context() as patched_env:
        patched_env.setenv("VLLM_VIDEO_LOADER_BACKEND", env_backend)
        image_io = ImageMediaIO()

        # First an explicit None, then the kwarg omitted entirely; both must
        # resolve to the backend named by the environment variable.
        for backend_kwargs in ({"video_backend": None}, {}):
            video_io = VideoMediaIO(image_io, num_frames=10, **backend_kwargs)
            loaded_frames, loader_metadata = video_io.load_bytes(b"test")
            np.testing.assert_array_equal(loaded_frames, FAKE_OUTPUT_2)
            assert loader_metadata["video_backend"] == env_backend