[Bugfix] Fix base64 JPEG video frames returning empty metadata (#37301)

Signed-off-by: Yufeng He <40085740+universeplayer@users.noreply.github.com> Signed-off-by: Yufeng He <40085740+he-yufeng@users.noreply.github.com> Signed-off-by: Isotr0py <mozf@mail2.sysu.edu.cn> Co-authored-by: Yufeng He <40085740+universeplayer@users.noreply.github.com> Co-authored-by: Isotr0py <mozf@mail2.sysu.edu.cn>
2026-03-18 21:40:03 +08:00
parent 98b09ddc27
commit 918b7890a1
2 changed files with 66 additions and 2 deletions
--- a/tests/multimodal/media/test_video.py
+++ b/tests/multimodal/media/test_video.py
@@ -1,9 +1,11 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import io
 from pathlib import Path

 import numpy as np
 import numpy.typing as npt
+import pybase64
 import pytest
 from PIL import Image

@@ -235,3 +237,53 @@ def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch
        frames_missing, metadata_missing = videoio_missing.load_bytes(b"test")
        np.testing.assert_array_equal(frames_missing, FAKE_OUTPUT_2)
        assert metadata_missing["video_backend"] == "test_video_backend_override_2"
+
+
+def test_load_base64_jpeg_returns_metadata():
+    """Regression test: load_base64 with video/jpeg must return metadata.
+
+    Previously, base64 JPEG frame sequences returned an empty dict for
+    metadata, which broke downstream consumers that rely on fields like
+    total_num_frames and fps. See PR #37301.
+    """
+
+    num_test_frames = 3
+    frame_width, frame_height = 8, 8
+
+    # Build a few tiny JPEG frames and base64-encode them
+    b64_frames = []
+    for i in range(num_test_frames):
+        img = Image.new("RGB", (frame_width, frame_height), color=(i * 80, 0, 0))
+        buf = io.BytesIO()
+        img.save(buf, format="JPEG")
+        b64_frames.append(pybase64.b64encode(buf.getvalue()).decode("ascii"))
+
+    data = ",".join(b64_frames)
+
+    imageio = ImageMediaIO()
+    videoio = VideoMediaIO(imageio, num_frames=num_test_frames)
+    frames, metadata = videoio.load_base64("video/jpeg", data)
+
+    # Frames array shape: (num_frames, H, W, 3)
+    assert frames.shape[0] == num_test_frames
+
+    # All required metadata keys must be present
+    required_keys = {
+        "total_num_frames",
+        "fps",
+        "duration",
+        "video_backend",
+        "frames_indices",
+        "do_sample_frames",
+    }
+    assert required_keys.issubset(metadata.keys()), (
+        f"Missing metadata keys: {required_keys - metadata.keys()}"
+    )
+
+    assert metadata["total_num_frames"] == num_test_frames
+    assert metadata["video_backend"] == "jpeg_sequence"
+    assert metadata["frames_indices"] == list(range(num_test_frames))
+    assert metadata["do_sample_frames"] is False
+    # Default fps=1 → duration == num_frames
+    assert metadata["fps"] == 1.0
+    assert metadata["duration"] == float(num_test_frames)
--- a/vllm/multimodal/media/video.py
+++ b/vllm/multimodal/media/video.py
@@ -80,9 +80,21 @@ class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]):
                "image/jpeg",
            )

-            return np.stack(
+            frames = np.stack(
                [np.asarray(load_frame(frame_data)) for frame_data in data.split(",")]
-            ), {}
+            )
+            total = int(frames.shape[0])
+            fps = float(self.kwargs.get("fps", 1))
+            duration = total / fps if fps > 0 else 0.0
+            metadata = {
+                "total_num_frames": total,
+                "fps": fps,
+                "duration": duration,
+                "video_backend": "jpeg_sequence",
+                "frames_indices": list(range(total)),
+                "do_sample_frames": False,
+            }
+            return frames, metadata

        return self.load_bytes(pybase64.b64decode(data))