# SPDX-License-Identifier: Apache-2.0 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project from pathlib import Path import numpy as np import numpy.typing as npt import pytest from PIL import Image from vllm.assets.base import get_vllm_public_assets from vllm.assets.video import video_to_ndarrays, video_to_pil_images_list from vllm.multimodal.media import ImageMediaIO, VideoMediaIO from vllm.multimodal.video import VIDEO_LOADER_REGISTRY, VideoLoader from ..utils import cosine_similarity, create_video_from_image, normalize_image pytestmark = pytest.mark.cpu_test ASSETS_DIR = Path(__file__).parent.parent / "assets" assert ASSETS_DIR.exists() @VIDEO_LOADER_REGISTRY.register("assert_10_frames_1_fps") class Assert10Frames1FPSVideoLoader(VideoLoader): @classmethod def load_bytes( cls, data: bytes, num_frames: int = -1, fps: float = -1.0, **kwargs ) -> npt.NDArray: assert num_frames == 10, "bad num_frames" assert fps == 1.0, "bad fps" return FAKE_OUTPUT_2 def test_video_media_io_kwargs(monkeypatch: pytest.MonkeyPatch): with monkeypatch.context() as m: m.setenv("VLLM_VIDEO_LOADER_BACKEND", "assert_10_frames_1_fps") imageio = ImageMediaIO() # Verify that different args pass/fail assertions as expected. videoio = VideoMediaIO(imageio, **{"num_frames": 10, "fps": 1.0}) _ = videoio.load_bytes(b"test") videoio = VideoMediaIO( imageio, **{"num_frames": 10, "fps": 1.0, "not_used": "not_used"} ) _ = videoio.load_bytes(b"test") with pytest.raises(AssertionError, match="bad num_frames"): videoio = VideoMediaIO(imageio, **{}) _ = videoio.load_bytes(b"test") with pytest.raises(AssertionError, match="bad num_frames"): videoio = VideoMediaIO(imageio, **{"num_frames": 9, "fps": 1.0}) _ = videoio.load_bytes(b"test") with pytest.raises(AssertionError, match="bad fps"): videoio = VideoMediaIO(imageio, **{"num_frames": 10, "fps": 2.0}) _ = videoio.load_bytes(b"test") @pytest.mark.parametrize("is_color", [True, False]) @pytest.mark.parametrize("fourcc, ext", [("mp4v", "mp4"), ("XVID", "avi")]) def test_opencv_video_io_colorspace(tmp_path, is_color: bool, fourcc: str, ext: str): """ Test all functions that use OpenCV for video I/O return RGB format. Both RGB and grayscale videos are tested. """ image_path = get_vllm_public_assets( filename="stop_sign.jpg", s3_prefix="vision_model_images" ) image = Image.open(image_path) if not is_color: image_path = f"{tmp_path}/test_grayscale_image.png" image = image.convert("L") image.save(image_path) # Convert to gray RGB for comparison image = image.convert("RGB") video_path = f"{tmp_path}/test_RGB_video.{ext}" create_video_from_image( image_path, video_path, num_frames=2, is_color=is_color, fourcc=fourcc, ) frames = video_to_ndarrays(video_path) for frame in frames: sim = cosine_similarity( normalize_image(np.array(frame)), normalize_image(np.array(image)) ) assert np.sum(np.isnan(sim)) / sim.size < 0.001 assert np.nanmean(sim) > 0.99 pil_frames = video_to_pil_images_list(video_path) for frame in pil_frames: sim = cosine_similarity( normalize_image(np.array(frame)), normalize_image(np.array(image)) ) assert np.sum(np.isnan(sim)) / sim.size < 0.001 assert np.nanmean(sim) > 0.99 io_frames, _ = VideoMediaIO(ImageMediaIO()).load_file(Path(video_path)) for frame in io_frames: sim = cosine_similarity( normalize_image(np.array(frame)), normalize_image(np.array(image)) ) assert np.sum(np.isnan(sim)) / sim.size < 0.001 assert np.nanmean(sim) > 0.99 NUM_FRAMES = 10 FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3) FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3) @VIDEO_LOADER_REGISTRY.register("test_video_backend_override_1") class TestVideoBackendOverride1(VideoLoader): """Test loader that returns FAKE_OUTPUT_1 to verify backend selection.""" @classmethod def load_bytes( cls, data: bytes, num_frames: int = -1, **kwargs ) -> tuple[npt.NDArray, dict]: return FAKE_OUTPUT_1, {"video_backend": "test_video_backend_override_1"} @VIDEO_LOADER_REGISTRY.register("test_video_backend_override_2") class TestVideoBackendOverride2(VideoLoader): """Test loader that returns FAKE_OUTPUT_2 to verify backend selection.""" @classmethod def load_bytes( cls, data: bytes, num_frames: int = -1, **kwargs ) -> tuple[npt.NDArray, dict]: return FAKE_OUTPUT_2, {"video_backend": "test_video_backend_override_2"} def test_video_media_io_backend_kwarg_override(monkeypatch: pytest.MonkeyPatch): """ Test that video_backend kwarg can override the VLLM_VIDEO_LOADER_BACKEND environment variable. This allows users to dynamically select a different video backend via --media-io-kwargs without changing the global env var, which is useful when plugins set a default backend but a specific request needs a different one. """ with monkeypatch.context() as m: # Set the env var to one backend m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_video_backend_override_1") imageio = ImageMediaIO() # Without video_backend kwarg, should use env var backend videoio_default = VideoMediaIO(imageio, num_frames=10) frames_default, metadata_default = videoio_default.load_bytes(b"test") np.testing.assert_array_equal(frames_default, FAKE_OUTPUT_1) assert metadata_default["video_backend"] == "test_video_backend_override_1" # With video_backend kwarg, should override env var videoio_override = VideoMediaIO( imageio, num_frames=10, video_backend="test_video_backend_override_2" ) frames_override, metadata_override = videoio_override.load_bytes(b"test") np.testing.assert_array_equal(frames_override, FAKE_OUTPUT_2) assert metadata_override["video_backend"] == "test_video_backend_override_2" def test_video_media_io_backend_kwarg_not_passed_to_loader( monkeypatch: pytest.MonkeyPatch, ): """ Test that video_backend kwarg is consumed by VideoMediaIO and NOT passed through to the underlying video loader's load_bytes method. This ensures the kwarg is properly popped from kwargs before forwarding. """ @VIDEO_LOADER_REGISTRY.register("test_reject_video_backend_kwarg") class RejectVideoBackendKwargLoader(VideoLoader): """Test loader that fails if video_backend is passed through.""" @classmethod def load_bytes( cls, data: bytes, num_frames: int = -1, **kwargs ) -> tuple[npt.NDArray, dict]: # This should never receive video_backend in kwargs if "video_backend" in kwargs: raise AssertionError( "video_backend should be consumed by VideoMediaIO, " "not passed to loader" ) return FAKE_OUTPUT_1, {"received_kwargs": list(kwargs.keys())} with monkeypatch.context() as m: m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_reject_video_backend_kwarg") imageio = ImageMediaIO() # Even when video_backend is provided, it should NOT be passed to loader videoio = VideoMediaIO( imageio, num_frames=10, video_backend="test_reject_video_backend_kwarg", other_kwarg="should_pass_through", ) # This should NOT raise AssertionError frames, metadata = videoio.load_bytes(b"test") np.testing.assert_array_equal(frames, FAKE_OUTPUT_1) # Verify other kwargs are still passed through assert "other_kwarg" in metadata["received_kwargs"] def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch): """ Test that when video_backend kwarg is None or not provided, VideoMediaIO falls back to VLLM_VIDEO_LOADER_BACKEND env var. """ with monkeypatch.context() as m: m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_video_backend_override_2") imageio = ImageMediaIO() # Explicit None should fall back to env var videoio_none = VideoMediaIO(imageio, num_frames=10, video_backend=None) frames_none, metadata_none = videoio_none.load_bytes(b"test") np.testing.assert_array_equal(frames_none, FAKE_OUTPUT_2) assert metadata_none["video_backend"] == "test_video_backend_override_2" # Not providing video_backend should also fall back to env var videoio_missing = VideoMediaIO(imageio, num_frames=10) frames_missing, metadata_missing = videoio_missing.load_bytes(b"test") np.testing.assert_array_equal(frames_missing, FAKE_OUTPUT_2) assert metadata_missing["video_backend"] == "test_video_backend_override_2"