TEST_VIDEO_URLS = [
    "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4",
    "https://filesamples.com/samples/video/avi/sample_640x360.avi",
]


@pytest.mark.asyncio
@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
async def test_fetch_video_http(video_url: str, num_frames: int):
    """Check that sync and async video fetching yield identical results.

    Every URL in ``TEST_VIDEO_URLS`` is combined with every parametrized
    ``num_frames`` value; the video is fetched once via ``fetch_video`` and
    once via ``fetch_video_async`` and the two results are compared with
    ``np.array_equal``.
    """
    media_connector = MediaConnector()
    # Both code paths must receive exactly the same frame-count request.
    fetch_kwargs = {"num_frames": num_frames}

    frames_from_sync = media_connector.fetch_video(video_url, **fetch_kwargs)
    frames_from_async = await media_connector.fetch_video_async(
        video_url, **fetch_kwargs)

    assert np.array_equal(frames_from_sync, frames_from_async)


# Used for the next two tests related to `merge_and_sort_multimodal_metadata`.
class TestCase(NamedTuple): mm_positions: "MultiModalPlaceholderDict" diff --git a/vllm/multimodal/video.py b/vllm/multimodal/video.py index 6d875a1c6..72e9b65d7 100644 --- a/vllm/multimodal/video.py +++ b/vllm/multimodal/video.py @@ -81,7 +81,8 @@ class OpenCVVideoBackend(VideoLoader): total_frames_num = int(cap.get(cv2.CAP_PROP_FRAME_COUNT)) full_read = num_frames == -1 or total_frames_num < num_frames if full_read: - frame_idx = list(range(0, total_frames_num)) + num_frames = total_frames_num + frame_idx = list(range(0, num_frames)) else: uniform_sampled_frames = np.linspace(0, total_frames_num - 1, @@ -104,7 +105,8 @@ class OpenCVVideoBackend(VideoLoader): frames[i] = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) i += 1 # we expect all frames loaded - assert i == num_frames + assert i == num_frames, (f"Expected reading {num_frames} frames, " + f"but only loaded {i} frames from video.") return frames