# Patch summary (preamble text before the first "diff --git" line is skipped
# by patch tools).
#
# 1. tests/multimodal/media/test_video.py
#    - imports `io` and `pybase64`;
#    - adds `test_load_base64_jpeg_returns_metadata`, which builds three 8x8
#      JPEG frames, joins their base64 encodings with ",", calls
#      `VideoMediaIO.load_base64("video/jpeg", data)` and checks the returned
#      metadata dict.
#
# 2. vllm/multimodal/media/video.py (class VideoMediaIO)
#    - the branch that used to return `np.stack(...), {}` now returns the
#      stacked frames together with a metadata dict holding:
#      total_num_frames, fps, duration, video_backend ("jpeg_sequence"),
#      frames_indices (0..total-1) and do_sample_frames (False);
#    - fps is read from `self.kwargs.get("fps", 1)`; duration is
#      `total / fps` when fps > 0, otherwise 0.0.
#    - presumably this is the "video/jpeg" branch of `load_base64` (that is
#      what the new test exercises); the enclosing `def` is outside the hunk.
#
# NOTE(review): `float(self.kwargs.get("fps", 1))` raises TypeError when the
#   key is present with value None -- confirm callers never pass fps=None.
# NOTE(review): the new test documents the frame shape as
#   (num_frames, H, W, 3) but only asserts `frames.shape[0]`.
# NOTE(review): line breaks and indentation were restored from a collapsed
#   copy; the line counts match every hunk header, but verify context-line
#   whitespace against the target files before applying.
diff --git a/tests/multimodal/media/test_video.py b/tests/multimodal/media/test_video.py
index 9c04d991a..a1223ebc0 100644
--- a/tests/multimodal/media/test_video.py
+++ b/tests/multimodal/media/test_video.py
@@ -1,9 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import io
 from pathlib import Path
 
 import numpy as np
 import numpy.typing as npt
+import pybase64
 import pytest
 from PIL import Image
 
@@ -235,3 +237,53 @@ def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch
     frames_missing, metadata_missing = videoio_missing.load_bytes(b"test")
     np.testing.assert_array_equal(frames_missing, FAKE_OUTPUT_2)
     assert metadata_missing["video_backend"] == "test_video_backend_override_2"
+
+
+def test_load_base64_jpeg_returns_metadata():
+    """Regression test: load_base64 with video/jpeg must return metadata.
+
+    Previously, base64 JPEG frame sequences returned an empty dict for
+    metadata, which broke downstream consumers that rely on fields like
+    total_num_frames and fps. See PR #37301.
+    """
+
+    num_test_frames = 3
+    frame_width, frame_height = 8, 8
+
+    # Build a few tiny JPEG frames and base64-encode them
+    b64_frames = []
+    for i in range(num_test_frames):
+        img = Image.new("RGB", (frame_width, frame_height), color=(i * 80, 0, 0))
+        buf = io.BytesIO()
+        img.save(buf, format="JPEG")
+        b64_frames.append(pybase64.b64encode(buf.getvalue()).decode("ascii"))
+
+    data = ",".join(b64_frames)
+
+    imageio = ImageMediaIO()
+    videoio = VideoMediaIO(imageio, num_frames=num_test_frames)
+    frames, metadata = videoio.load_base64("video/jpeg", data)
+
+    # Frames array shape: (num_frames, H, W, 3)
+    assert frames.shape[0] == num_test_frames
+
+    # All required metadata keys must be present
+    required_keys = {
+        "total_num_frames",
+        "fps",
+        "duration",
+        "video_backend",
+        "frames_indices",
+        "do_sample_frames",
+    }
+    assert required_keys.issubset(metadata.keys()), (
+        f"Missing metadata keys: {required_keys - metadata.keys()}"
+    )
+
+    assert metadata["total_num_frames"] == num_test_frames
+    assert metadata["video_backend"] == "jpeg_sequence"
+    assert metadata["frames_indices"] == list(range(num_test_frames))
+    assert metadata["do_sample_frames"] is False
+    # Default fps=1 → duration == num_frames
+    assert metadata["fps"] == 1.0
+    assert metadata["duration"] == float(num_test_frames)
diff --git a/vllm/multimodal/media/video.py b/vllm/multimodal/media/video.py
index 9784a1560..2790d714d 100644
--- a/vllm/multimodal/media/video.py
+++ b/vllm/multimodal/media/video.py
@@ -80,9 +80,21 @@ class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]):
                 "image/jpeg",
             )
 
-            return np.stack(
+            frames = np.stack(
                 [np.asarray(load_frame(frame_data)) for frame_data in data.split(",")]
-            ), {}
+            )
+            total = int(frames.shape[0])
+            fps = float(self.kwargs.get("fps", 1))
+            duration = total / fps if fps > 0 else 0.0
+            metadata = {
+                "total_num_frames": total,
+                "fps": fps,
+                "duration": duration,
+                "video_backend": "jpeg_sequence",
+                "frames_indices": list(range(total)),
+                "do_sample_frames": False,
+            }
+            return frames, metadata
 
         return self.load_bytes(pybase64.b64decode(data))
 