[3/N] Group together media-related code (#32406)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-01-15 19:52:12 +08:00
committed by GitHub
parent 8853a50af2
commit 28459785ff
25 changed files with 833 additions and 725 deletions

View File

@@ -63,7 +63,7 @@ from vllm.distributed import (
)
from vllm.logger import init_logger
from vllm.logprobs import Logprob
from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.media import MediaWithBytes
from vllm.multimodal.utils import fetch_image
from vllm.outputs import RequestOutput
from vllm.sampling_params import BeamSearchParams

View File

@@ -14,8 +14,7 @@ import pytest
import torch
from vllm.entrypoints.renderer import CompletionRenderer
from vllm.multimodal.audio import AudioEmbeddingMediaIO
from vllm.multimodal.image import ImageEmbeddingMediaIO
from vllm.multimodal.media import AudioEmbeddingMediaIO, ImageEmbeddingMediaIO
def _encode_tensor(tensor: torch.Tensor) -> bytes:

View File

@@ -8,7 +8,7 @@ import pytest
import pytest_asyncio
from transformers import AutoProcessor
from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.media import MediaWithBytes
from vllm.multimodal.utils import encode_image_url, fetch_image
from vllm.platforms import current_platform

View File

@@ -9,7 +9,7 @@ from transformers import AutoProcessor
from tests.utils import VLLM_PATH, RemoteOpenAIServer
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.media import MediaWithBytes
from vllm.multimodal.utils import fetch_image
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"

View File

View File

@@ -0,0 +1,73 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
from pathlib import Path
from unittest.mock import patch
import numpy as np
import pytest
from vllm.multimodal.media import AudioMediaIO
pytestmark = pytest.mark.cpu_test
ASSETS_DIR = Path(__file__).parent.parent / "assets"
assert ASSETS_DIR.exists()
@pytest.fixture
def dummy_audio():
    """Short, deterministic mono waveform used as fake audio samples."""
    return np.asarray([0.0, 0.1, 0.2, 0.3, 0.4], dtype=float)
@pytest.fixture
def dummy_audio_bytes():
    # Arbitrary payload: the decoding calls are mocked in every test that
    # consumes this fixture, so the bytes are never actually parsed as audio.
    return b"FAKEAUDIOBYTES"
def test_audio_media_io_load_bytes(dummy_audio_bytes):
    """load_bytes should decode via librosa and return (samples, sample_rate)."""
    audio_io = AudioMediaIO()
    # Mock librosa.load so no real audio decoding happens; it normally
    # returns a (waveform, sample_rate) tuple.
    with patch("librosa.load") as mock_load:
        mock_load.return_value = (np.array([0.1, 0.2]), 16000)
        out = audio_io.load_bytes(dummy_audio_bytes)
        mock_load.assert_called_once()
        assert isinstance(out[0], np.ndarray)
        assert out[1] == 16000
def test_audio_media_io_load_base64(dummy_audio_bytes):
    """load_base64 should decode the payload and delegate to load_bytes."""
    audio_io = AudioMediaIO()
    encoded = base64.b64encode(dummy_audio_bytes).decode("utf-8")
    # Patch load_bytes itself: this test only checks the base64 -> bytes
    # delegation, not the audio decoding.
    with patch.object(AudioMediaIO, "load_bytes") as mock_load_bytes:
        mock_load_bytes.return_value = (np.array([0.1, 0.2]), 16000)
        out = audio_io.load_base64("audio/wav", encoded)
        mock_load_bytes.assert_called_once()
        assert isinstance(out[0], np.ndarray)
        assert out[1] == 16000
def test_audio_media_io_load_file():
    """load_file should call librosa.load(path, sr=None) on the given path."""
    audio_io = AudioMediaIO()
    # The path does not need to exist because librosa.load is mocked.
    path = Path("/fake/path.wav")
    with patch("librosa.load") as mock_load:
        mock_load.return_value = (np.array([0.1, 0.2]), 16000)
        out = audio_io.load_file(path)
        # sr=None preserves the file's native sample rate.
        mock_load.assert_called_once_with(path, sr=None)
        assert isinstance(out[0], np.ndarray)
        assert out[1] == 16000
def test_audio_media_io_encode_base64(dummy_audio):
    """encode_base64 should write audio bytes and return their base64 text."""
    audio_io = AudioMediaIO()
    # Media is a (samples, sample_rate) tuple.
    media = (dummy_audio, 16000)
    with patch("soundfile.write") as mock_write:

        def write_to_buffer(buffer, *_args, **_kwargs):
            # Simulate soundfile serializing encoded audio into the buffer.
            buffer.write(b"dummy_wav_data")

        mock_write.side_effect = write_to_buffer
        out = audio_io.encode_base64(media)
        # Round-trip: whatever was written must come back after b64 decode.
        decoded = base64.b64decode(out)
        assert decoded == b"dummy_wav_data"
        mock_write.assert_called_once()

View File

@@ -0,0 +1,45 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pickle
from pathlib import Path
import pytest
from PIL import Image
from vllm.multimodal.media import MediaWithBytes
pytestmark = pytest.mark.cpu_test
ASSETS_DIR = Path(__file__).parent.parent / "assets"
assert ASSETS_DIR.exists()
def test_media_with_bytes_pickle_roundtrip():
    """Regression test for pickle/unpickle of MediaWithBytes.

    Verifies that MediaWithBytes can be pickled and unpickled without
    RecursionError. See: https://github.com/vllm-project/vllm/issues/30818
    """
    original_image = Image.open(ASSETS_DIR / "image1.png").convert("RGB")
    original_bytes = b"test_bytes_data"
    wrapper = MediaWithBytes(media=original_image, original_bytes=original_bytes)

    # Verify attribute delegation works before pickling.
    # NOTE(review): width/height/mode live on the wrapped image, so these
    # asserts presumably exercise attribute forwarding on the wrapper.
    assert wrapper.width == original_image.width
    assert wrapper.height == original_image.height
    assert wrapper.mode == original_image.mode

    # Pickle and unpickle (this would cause RecursionError before the fix)
    pickled = pickle.dumps(wrapper)
    unpickled = pickle.loads(pickled)

    # Verify the unpickled object works correctly
    assert unpickled.original_bytes == original_bytes
    assert unpickled.media.width == original_image.width
    assert unpickled.media.height == original_image.height

    # Verify attribute delegation works after unpickling
    assert unpickled.width == original_image.width
    assert unpickled.height == original_image.height
    assert unpickled.mode == original_image.mode

View File

@@ -0,0 +1,133 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
import numpy as np
import pytest
from PIL import Image
from vllm.multimodal.media import ImageMediaIO
pytestmark = pytest.mark.cpu_test
ASSETS_DIR = Path(__file__).parent.parent / "assets"
assert ASSETS_DIR.exists()
def test_image_media_io_rgba_custom_background(tmp_path):
    """Test RGBA to RGB conversion with custom background colors."""

    def rgb_at(converted, row, col):
        # Return one pixel's (R, G, B) triple as plain ints for comparison.
        arr = np.array(converted)
        return tuple(int(v) for v in arr[row][col][:3])

    # Build a fully-opaque red RGBA image, then make the top-left 5x5
    # quadrant fully transparent.
    rgba_image = Image.new("RGBA", (10, 10), (255, 0, 0, 255))
    for x in range(5):
        for y in range(5):
            rgba_image.putpixel((x, y), (0, 0, 0, 0))
    test_image_path = tmp_path / "test_rgba.png"
    rgba_image.save(test_image_path)

    # Default background is white (backward compatibility).
    loaded = ImageMediaIO().load_file(test_image_path)
    assert rgb_at(loaded, 0, 0) == (255, 255, 255)  # transparent -> white
    assert rgb_at(loaded, 5, 5) == (255, 0, 0)  # opaque pixels stay red

    # Custom black background via kwargs.
    loaded = ImageMediaIO(rgba_background_color=(0, 0, 0)).load_file(test_image_path)
    assert rgb_at(loaded, 0, 0) == (0, 0, 0)
    assert rgb_at(loaded, 5, 5) == (255, 0, 0)

    # Custom blue background supplied as a list instead of a tuple.
    loaded = ImageMediaIO(rgba_background_color=[0, 0, 255]).load_file(test_image_path)
    assert rgb_at(loaded, 0, 0) == (0, 0, 255)

    # load_bytes honors the background color as well.
    image_data = test_image_path.read_bytes()
    loaded = ImageMediaIO(rgba_background_color=(0, 255, 0)).load_bytes(image_data)
    assert rgb_at(loaded, 0, 0) == (0, 255, 0)
def test_image_media_io_rgba_background_color_validation():
    """Test that invalid rgba_background_color values are properly rejected."""
    invalid_values = [
        # Wrong type entirely.
        "255,255,255",
        255,
        # Wrong number of elements.
        (255, 255),
        (255, 255, 255, 255),
        # Non-integer components.
        (255.0, 255.0, 255.0),
        (255, "255", 255),
        # Components outside the 0-255 range.
        (256, 255, 255),
        (255, -1, 255),
    ]
    for bad_value in invalid_values:
        with pytest.raises(
            ValueError, match="rgba_background_color must be a list or tuple"
        ):
            ImageMediaIO(rgba_background_color=bad_value)

    # Valid specifications must be accepted without raising.
    for good_value in [(0, 0, 0), [255, 255, 255], (128, 128, 128)]:
        ImageMediaIO(rgba_background_color=good_value)

View File

@@ -0,0 +1,237 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from pathlib import Path
import numpy as np
import numpy.typing as npt
import pytest
from PIL import Image
from vllm.assets.base import get_vllm_public_assets
from vllm.assets.video import video_to_ndarrays, video_to_pil_images_list
from vllm.multimodal.media import ImageMediaIO, VideoMediaIO
from vllm.multimodal.video import VIDEO_LOADER_REGISTRY, VideoLoader
from ..utils import cosine_similarity, create_video_from_image, normalize_image
pytestmark = pytest.mark.cpu_test
ASSETS_DIR = Path(__file__).parent.parent / "assets"
assert ASSETS_DIR.exists()
@VIDEO_LOADER_REGISTRY.register("assert_10_frames_1_fps")
class Assert10Frames1FPSVideoLoader(VideoLoader):
    # Test-only loader: instead of decoding, it asserts that VideoMediaIO
    # forwarded the expected num_frames/fps kwargs.
    @classmethod
    def load_bytes(
        cls, data: bytes, num_frames: int = -1, fps: float = -1.0, **kwargs
    ) -> npt.NDArray:
        assert num_frames == 10, "bad num_frames"
        assert fps == 1.0, "bad fps"
        return FAKE_OUTPUT_2
def test_video_media_io_kwargs(monkeypatch: pytest.MonkeyPatch):
    """VideoMediaIO should forward num_frames/fps kwargs to the active loader."""
    with monkeypatch.context() as m:
        # Route loading through the asserting test loader registered above.
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "assert_10_frames_1_fps")
        imageio = ImageMediaIO()

        # Verify that different args pass/fail assertions as expected.
        videoio = VideoMediaIO(imageio, **{"num_frames": 10, "fps": 1.0})
        _ = videoio.load_bytes(b"test")

        # Extra, unrecognized kwargs are tolerated by the loader's **kwargs.
        videoio = VideoMediaIO(
            imageio, **{"num_frames": 10, "fps": 1.0, "not_used": "not_used"}
        )
        _ = videoio.load_bytes(b"test")

        # Missing kwargs fall back to the loader defaults and fail its asserts.
        with pytest.raises(AssertionError, match="bad num_frames"):
            videoio = VideoMediaIO(imageio, **{})
            _ = videoio.load_bytes(b"test")

        with pytest.raises(AssertionError, match="bad num_frames"):
            videoio = VideoMediaIO(imageio, **{"num_frames": 9, "fps": 1.0})
            _ = videoio.load_bytes(b"test")

        with pytest.raises(AssertionError, match="bad fps"):
            videoio = VideoMediaIO(imageio, **{"num_frames": 10, "fps": 2.0})
            _ = videoio.load_bytes(b"test")
@pytest.mark.parametrize("is_color", [True, False])
@pytest.mark.parametrize("fourcc, ext", [("mp4v", "mp4"), ("XVID", "avi")])
def test_opencv_video_io_colorspace(tmp_path, is_color: bool, fourcc: str, ext: str):
    """
    Test all functions that use OpenCV for video I/O return RGB format.
    Both RGB and grayscale videos are tested.
    """

    def assert_frames_match_reference(frames, reference):
        # Every decoded frame must be nearly identical to the source image.
        ref = normalize_image(np.array(reference))
        for frame in frames:
            sim = cosine_similarity(normalize_image(np.array(frame)), ref)
            assert np.sum(np.isnan(sim)) / sim.size < 0.001
            assert np.nanmean(sim) > 0.99

    image_path = get_vllm_public_assets(
        filename="stop_sign.jpg", s3_prefix="vision_model_images"
    )
    image = Image.open(image_path)
    if not is_color:
        # Re-encode the asset as grayscale, then compare decoded frames
        # against its gray-RGB expansion.
        image_path = f"{tmp_path}/test_grayscale_image.png"
        image = image.convert("L")
        image.save(image_path)
        image = image.convert("RGB")

    video_path = f"{tmp_path}/test_RGB_video.{ext}"
    create_video_from_image(
        image_path,
        video_path,
        num_frames=2,
        is_color=is_color,
        fourcc=fourcc,
    )

    # All three OpenCV-backed decode paths must yield RGB frames.
    assert_frames_match_reference(video_to_ndarrays(video_path), image)
    assert_frames_match_reference(video_to_pil_images_list(video_path), image)
    io_frames, _ = VideoMediaIO(ImageMediaIO()).load_file(Path(video_path))
    assert_frames_match_reference(io_frames, image)
# Shared fixtures for the backend-selection tests below: two distinct fake
# frame stacks so a test can tell which registered loader produced the output.
NUM_FRAMES = 10
FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_1")
class TestVideoBackendOverride1(VideoLoader):
    """Test loader that returns FAKE_OUTPUT_1 to verify backend selection."""

    @classmethod
    def load_bytes(
        cls, data: bytes, num_frames: int = -1, **kwargs
    ) -> tuple[npt.NDArray, dict]:
        # Metadata carries the backend name so tests can assert which
        # loader actually ran.
        return FAKE_OUTPUT_1, {"video_backend": "test_video_backend_override_1"}
@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_2")
class TestVideoBackendOverride2(VideoLoader):
    """Test loader that returns FAKE_OUTPUT_2 to verify backend selection."""

    @classmethod
    def load_bytes(
        cls, data: bytes, num_frames: int = -1, **kwargs
    ) -> tuple[npt.NDArray, dict]:
        # Metadata carries the backend name so tests can assert which
        # loader actually ran.
        return FAKE_OUTPUT_2, {"video_backend": "test_video_backend_override_2"}
def test_video_media_io_backend_kwarg_override(monkeypatch: pytest.MonkeyPatch):
    """
    Test that video_backend kwarg can override the VLLM_VIDEO_LOADER_BACKEND
    environment variable.

    This allows users to dynamically select a different video backend
    via --media-io-kwargs without changing the global env var, which is
    useful when plugins set a default backend but a specific request
    needs a different one.
    """
    with monkeypatch.context() as m:
        # Set the env var to one backend
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_video_backend_override_1")
        imageio = ImageMediaIO()

        # Without video_backend kwarg, should use env var backend
        videoio_default = VideoMediaIO(imageio, num_frames=10)
        frames_default, metadata_default = videoio_default.load_bytes(b"test")
        # FAKE_OUTPUT_1 identifies the loader registered as override_1.
        np.testing.assert_array_equal(frames_default, FAKE_OUTPUT_1)
        assert metadata_default["video_backend"] == "test_video_backend_override_1"

        # With video_backend kwarg, should override env var
        videoio_override = VideoMediaIO(
            imageio, num_frames=10, video_backend="test_video_backend_override_2"
        )
        frames_override, metadata_override = videoio_override.load_bytes(b"test")
        np.testing.assert_array_equal(frames_override, FAKE_OUTPUT_2)
        assert metadata_override["video_backend"] == "test_video_backend_override_2"
def test_video_media_io_backend_kwarg_not_passed_to_loader(
    monkeypatch: pytest.MonkeyPatch,
):
    """
    Test that video_backend kwarg is consumed by VideoMediaIO and NOT passed
    through to the underlying video loader's load_bytes method.

    This ensures the kwarg is properly popped from kwargs before forwarding.
    """

    @VIDEO_LOADER_REGISTRY.register("test_reject_video_backend_kwarg")
    class RejectVideoBackendKwargLoader(VideoLoader):
        """Test loader that fails if video_backend is passed through."""

        @classmethod
        def load_bytes(
            cls, data: bytes, num_frames: int = -1, **kwargs
        ) -> tuple[npt.NDArray, dict]:
            # This should never receive video_backend in kwargs
            if "video_backend" in kwargs:
                raise AssertionError(
                    "video_backend should be consumed by VideoMediaIO, "
                    "not passed to loader"
                )
            # Echo the received kwargs so the test can check pass-through.
            return FAKE_OUTPUT_1, {"received_kwargs": list(kwargs.keys())}

    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_reject_video_backend_kwarg")
        imageio = ImageMediaIO()

        # Even when video_backend is provided, it should NOT be passed to loader
        videoio = VideoMediaIO(
            imageio,
            num_frames=10,
            video_backend="test_reject_video_backend_kwarg",
            other_kwarg="should_pass_through",
        )
        # This should NOT raise AssertionError
        frames, metadata = videoio.load_bytes(b"test")
        np.testing.assert_array_equal(frames, FAKE_OUTPUT_1)
        # Verify other kwargs are still passed through
        assert "other_kwarg" in metadata["received_kwargs"]
def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch):
    """
    Test that when video_backend kwarg is None or not provided,
    VideoMediaIO falls back to VLLM_VIDEO_LOADER_BACKEND env var.
    """
    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_video_backend_override_2")
        imageio = ImageMediaIO()

        # Explicit None should fall back to env var
        videoio_none = VideoMediaIO(imageio, num_frames=10, video_backend=None)
        frames_none, metadata_none = videoio_none.load_bytes(b"test")
        # FAKE_OUTPUT_2 identifies the loader registered as override_2.
        np.testing.assert_array_equal(frames_none, FAKE_OUTPUT_2)
        assert metadata_none["video_backend"] == "test_video_backend_override_2"

        # Not providing video_backend should also fall back to env var
        videoio_missing = VideoMediaIO(imageio, num_frames=10)
        frames_missing, metadata_missing = videoio_missing.load_bytes(b"test")
        np.testing.assert_array_equal(frames_missing, FAKE_OUTPUT_2)
        assert metadata_missing["video_backend"] == "test_video_backend_override_2"

View File

@@ -1,8 +1,6 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
# test_audio.py
import base64
from pathlib import Path
from unittest.mock import patch
import numpy as np
@@ -12,7 +10,6 @@ import torch
from vllm.multimodal.audio import (
MONO_AUDIO_SPEC,
PASSTHROUGH_AUDIO_SPEC,
AudioMediaIO,
AudioResampler,
AudioSpec,
ChannelReduction,
@@ -92,59 +89,6 @@ def test_audio_resampler_no_target_sr(dummy_audio):
resampler.resample(dummy_audio, orig_sr=44100)
@pytest.fixture
def dummy_audio_bytes():
return b"FAKEAUDIOBYTES"
def test_audio_media_io_load_bytes(dummy_audio_bytes):
audio_io = AudioMediaIO()
with patch("vllm.multimodal.audio.librosa.load") as mock_load:
mock_load.return_value = (np.array([0.1, 0.2]), 16000)
out = audio_io.load_bytes(dummy_audio_bytes)
mock_load.assert_called_once()
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
def test_audio_media_io_load_base64(dummy_audio_bytes):
audio_io = AudioMediaIO()
encoded = base64.b64encode(dummy_audio_bytes).decode("utf-8")
with patch.object(AudioMediaIO, "load_bytes") as mock_load_bytes:
mock_load_bytes.return_value = (np.array([0.1, 0.2]), 16000)
out = audio_io.load_base64("audio/wav", encoded)
mock_load_bytes.assert_called_once()
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
def test_audio_media_io_load_file():
audio_io = AudioMediaIO()
path = Path("/fake/path.wav")
with patch("vllm.multimodal.audio.librosa.load") as mock_load:
mock_load.return_value = (np.array([0.1, 0.2]), 16000)
out = audio_io.load_file(path)
mock_load.assert_called_once_with(path, sr=None)
assert isinstance(out[0], np.ndarray)
assert out[1] == 16000
def test_audio_media_io_encode_base64(dummy_audio):
audio_io = AudioMediaIO()
media = (dummy_audio, 16000)
with patch("vllm.multimodal.audio.soundfile.write") as mock_write:
def write_to_buffer(buffer, *_args, **_kwargs):
buffer.write(b"dummy_wav_data")
mock_write.side_effect = write_to_buffer
out = audio_io.encode_base64(media)
decoded = base64.b64decode(out)
assert decoded == b"dummy_wav_data"
mock_write.assert_called_once()
# ============================================================
# Tests for normalize_audio function
# ============================================================

View File

@@ -1,14 +1,12 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pickle
from pathlib import Path
import numpy as np
import pytest
from PIL import Image, ImageChops
from vllm.multimodal.base import MediaWithBytes
from vllm.multimodal.image import ImageMediaIO, convert_image_mode
from vllm.multimodal.image import convert_image_mode
pytestmark = pytest.mark.cpu_test
@@ -40,153 +38,3 @@ def test_rgba_to_rgb():
assert converted_image_numpy[i][j][0] == 255
assert converted_image_numpy[i][j][1] == 255
assert converted_image_numpy[i][j][2] == 255
def test_rgba_to_rgb_custom_background(tmp_path):
"""Test RGBA to RGB conversion with custom background colors."""
# Create a simple RGBA image with transparent and opaque pixels
rgba_image = Image.new("RGBA", (10, 10), (255, 0, 0, 255)) # Red with full opacity
# Make top-left quadrant transparent
for i in range(5):
for j in range(5):
rgba_image.putpixel((i, j), (0, 0, 0, 0)) # Fully transparent
# Save the test image to tmp_path
test_image_path = tmp_path / "test_rgba.png"
rgba_image.save(test_image_path)
# Test 1: Default white background (backward compatibility)
image_io_default = ImageMediaIO()
converted_default = image_io_default.load_file(test_image_path)
default_numpy = np.array(converted_default)
# Check transparent pixels are white
assert default_numpy[0][0][0] == 255 # R
assert default_numpy[0][0][1] == 255 # G
assert default_numpy[0][0][2] == 255 # B
# Check opaque pixels remain red
assert default_numpy[5][5][0] == 255 # R
assert default_numpy[5][5][1] == 0 # G
assert default_numpy[5][5][2] == 0 # B
# Test 2: Custom black background via kwargs
image_io_black = ImageMediaIO(rgba_background_color=(0, 0, 0))
converted_black = image_io_black.load_file(test_image_path)
black_numpy = np.array(converted_black)
# Check transparent pixels are black
assert black_numpy[0][0][0] == 0 # R
assert black_numpy[0][0][1] == 0 # G
assert black_numpy[0][0][2] == 0 # B
# Check opaque pixels remain red
assert black_numpy[5][5][0] == 255 # R
assert black_numpy[5][5][1] == 0 # G
assert black_numpy[5][5][2] == 0 # B
# Test 3: Custom blue background via kwargs (as list)
image_io_blue = ImageMediaIO(rgba_background_color=[0, 0, 255])
converted_blue = image_io_blue.load_file(test_image_path)
blue_numpy = np.array(converted_blue)
# Check transparent pixels are blue
assert blue_numpy[0][0][0] == 0 # R
assert blue_numpy[0][0][1] == 0 # G
assert blue_numpy[0][0][2] == 255 # B
# Test 4: Test with load_bytes method
with open(test_image_path, "rb") as f:
image_data = f.read()
image_io_green = ImageMediaIO(rgba_background_color=(0, 255, 0))
converted_green = image_io_green.load_bytes(image_data)
green_numpy = np.array(converted_green)
# Check transparent pixels are green
assert green_numpy[0][0][0] == 0 # R
assert green_numpy[0][0][1] == 255 # G
assert green_numpy[0][0][2] == 0 # B
def test_rgba_background_color_validation():
"""Test that invalid rgba_background_color values are properly rejected."""
# Test invalid types
with pytest.raises(
ValueError, match="rgba_background_color must be a list or tuple"
):
ImageMediaIO(rgba_background_color="255,255,255")
with pytest.raises(
ValueError, match="rgba_background_color must be a list or tuple"
):
ImageMediaIO(rgba_background_color=255)
# Test wrong number of elements
with pytest.raises(
ValueError, match="rgba_background_color must be a list or tuple"
):
ImageMediaIO(rgba_background_color=(255, 255))
with pytest.raises(
ValueError, match="rgba_background_color must be a list or tuple"
):
ImageMediaIO(rgba_background_color=(255, 255, 255, 255))
# Test non-integer values
with pytest.raises(
ValueError, match="rgba_background_color must be a list or tuple"
):
ImageMediaIO(rgba_background_color=(255.0, 255.0, 255.0))
with pytest.raises(
ValueError, match="rgba_background_color must be a list or tuple"
):
ImageMediaIO(rgba_background_color=(255, "255", 255))
# Test out of range values
with pytest.raises(
ValueError, match="rgba_background_color must be a list or tuple"
):
ImageMediaIO(rgba_background_color=(256, 255, 255))
with pytest.raises(
ValueError, match="rgba_background_color must be a list or tuple"
):
ImageMediaIO(rgba_background_color=(255, -1, 255))
# Test that valid values work
ImageMediaIO(rgba_background_color=(0, 0, 0)) # Should not raise
ImageMediaIO(rgba_background_color=[255, 255, 255]) # Should not raise
ImageMediaIO(rgba_background_color=(128, 128, 128)) # Should not raise
def test_media_with_bytes_pickle_roundtrip():
"""Regression test for pickle/unpickle of MediaWithBytes.
Verifies that MediaWithBytes can be pickled and unpickled without
RecursionError. See: https://github.com/vllm-project/vllm/issues/30818
"""
original_image = Image.open(ASSETS_DIR / "image1.png").convert("RGB")
original_bytes = b"test_bytes_data"
wrapper = MediaWithBytes(media=original_image, original_bytes=original_bytes)
# Verify attribute delegation works before pickling
assert wrapper.width == original_image.width
assert wrapper.height == original_image.height
assert wrapper.mode == original_image.mode
# Pickle and unpickle (this would cause RecursionError before the fix)
pickled = pickle.dumps(wrapper)
unpickled = pickle.loads(pickled)
# Verify the unpickled object works correctly
assert unpickled.original_bytes == original_bytes
assert unpickled.media.width == original_image.width
assert unpickled.media.height == original_image.height
# Verify attribute delegation works after unpickling
assert unpickled.width == original_image.width
assert unpickled.height == original_image.height
assert unpickled.mode == original_image.mode

View File

@@ -1,24 +1,19 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import tempfile
from pathlib import Path
import numpy as np
import numpy.typing as npt
import pytest
from PIL import Image
from vllm.assets.base import get_vllm_public_assets
from vllm.assets.video import video_to_ndarrays, video_to_pil_images_list
from vllm.multimodal.image import ImageMediaIO
from vllm.multimodal.video import VIDEO_LOADER_REGISTRY, VideoLoader, VideoMediaIO
from .utils import cosine_similarity, create_video_from_image, normalize_image
from vllm.multimodal.video import VIDEO_LOADER_REGISTRY, VideoLoader
pytestmark = pytest.mark.cpu_test
ASSETS_DIR = Path(__file__).parent / "assets"
assert ASSETS_DIR.exists()
NUM_FRAMES = 10
FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
@@ -53,96 +48,6 @@ def test_video_loader_type_doesnt_exist():
VIDEO_LOADER_REGISTRY.load("non_existing_video_loader")
@VIDEO_LOADER_REGISTRY.register("assert_10_frames_1_fps")
class Assert10Frames1FPSVideoLoader(VideoLoader):
@classmethod
def load_bytes(
cls, data: bytes, num_frames: int = -1, fps: float = -1.0, **kwargs
) -> npt.NDArray:
assert num_frames == 10, "bad num_frames"
assert fps == 1.0, "bad fps"
return FAKE_OUTPUT_2
def test_video_media_io_kwargs(monkeypatch: pytest.MonkeyPatch):
with monkeypatch.context() as m:
m.setenv("VLLM_VIDEO_LOADER_BACKEND", "assert_10_frames_1_fps")
imageio = ImageMediaIO()
# Verify that different args pass/fail assertions as expected.
videoio = VideoMediaIO(imageio, **{"num_frames": 10, "fps": 1.0})
_ = videoio.load_bytes(b"test")
videoio = VideoMediaIO(
imageio, **{"num_frames": 10, "fps": 1.0, "not_used": "not_used"}
)
_ = videoio.load_bytes(b"test")
with pytest.raises(AssertionError, match="bad num_frames"):
videoio = VideoMediaIO(imageio, **{})
_ = videoio.load_bytes(b"test")
with pytest.raises(AssertionError, match="bad num_frames"):
videoio = VideoMediaIO(imageio, **{"num_frames": 9, "fps": 1.0})
_ = videoio.load_bytes(b"test")
with pytest.raises(AssertionError, match="bad fps"):
videoio = VideoMediaIO(imageio, **{"num_frames": 10, "fps": 2.0})
_ = videoio.load_bytes(b"test")
@pytest.mark.parametrize("is_color", [True, False])
@pytest.mark.parametrize("fourcc, ext", [("mp4v", "mp4"), ("XVID", "avi")])
def test_opencv_video_io_colorspace(is_color: bool, fourcc: str, ext: str):
"""
Test all functions that use OpenCV for video I/O return RGB format.
Both RGB and grayscale videos are tested.
"""
image_path = get_vllm_public_assets(
filename="stop_sign.jpg", s3_prefix="vision_model_images"
)
image = Image.open(image_path)
with tempfile.TemporaryDirectory() as tmpdir:
if not is_color:
image_path = f"{tmpdir}/test_grayscale_image.png"
image = image.convert("L")
image.save(image_path)
# Convert to gray RGB for comparison
image = image.convert("RGB")
video_path = f"{tmpdir}/test_RGB_video.{ext}"
create_video_from_image(
image_path,
video_path,
num_frames=2,
is_color=is_color,
fourcc=fourcc,
)
frames = video_to_ndarrays(video_path)
for frame in frames:
sim = cosine_similarity(
normalize_image(np.array(frame)), normalize_image(np.array(image))
)
assert np.sum(np.isnan(sim)) / sim.size < 0.001
assert np.nanmean(sim) > 0.99
pil_frames = video_to_pil_images_list(video_path)
for frame in pil_frames:
sim = cosine_similarity(
normalize_image(np.array(frame)), normalize_image(np.array(image))
)
assert np.sum(np.isnan(sim)) / sim.size < 0.001
assert np.nanmean(sim) > 0.99
io_frames, _ = VideoMediaIO(ImageMediaIO()).load_file(Path(video_path))
for frame in io_frames:
sim = cosine_similarity(
normalize_image(np.array(frame)), normalize_image(np.array(image))
)
assert np.sum(np.isnan(sim)) / sim.size < 0.001
assert np.nanmean(sim) > 0.99
def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
"""
Regression test for handling videos with broken frames.
@@ -179,128 +84,6 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
)
@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_1")
class TestVideoBackendOverride1(VideoLoader):
"""Test loader that returns FAKE_OUTPUT_1 to verify backend selection."""
@classmethod
def load_bytes(
cls, data: bytes, num_frames: int = -1, **kwargs
) -> tuple[npt.NDArray, dict]:
return FAKE_OUTPUT_1, {"video_backend": "test_video_backend_override_1"}
@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_2")
class TestVideoBackendOverride2(VideoLoader):
"""Test loader that returns FAKE_OUTPUT_2 to verify backend selection."""
@classmethod
def load_bytes(
cls, data: bytes, num_frames: int = -1, **kwargs
) -> tuple[npt.NDArray, dict]:
return FAKE_OUTPUT_2, {"video_backend": "test_video_backend_override_2"}
def test_video_media_io_backend_kwarg_override(monkeypatch: pytest.MonkeyPatch):
"""
Test that video_backend kwarg can override the VLLM_VIDEO_LOADER_BACKEND
environment variable.
This allows users to dynamically select a different video backend
via --media-io-kwargs without changing the global env var, which is
useful when plugins set a default backend but a specific request
needs a different one.
"""
with monkeypatch.context() as m:
# Set the env var to one backend
m.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_video_backend_override_1")
imageio = ImageMediaIO()
# Without video_backend kwarg, should use env var backend
videoio_default = VideoMediaIO(imageio, num_frames=10)
frames_default, metadata_default = videoio_default.load_bytes(b"test")
np.testing.assert_array_equal(frames_default, FAKE_OUTPUT_1)
assert metadata_default["video_backend"] == "test_video_backend_override_1"
# With video_backend kwarg, should override env var
videoio_override = VideoMediaIO(
imageio, num_frames=10, video_backend="test_video_backend_override_2"
)
frames_override, metadata_override = videoio_override.load_bytes(b"test")
np.testing.assert_array_equal(frames_override, FAKE_OUTPUT_2)
assert metadata_override["video_backend"] == "test_video_backend_override_2"
def test_video_media_io_backend_kwarg_not_passed_to_loader(
    monkeypatch: pytest.MonkeyPatch,
):
    """
    Verify that the ``video_backend`` kwarg is consumed by VideoMediaIO and
    never forwarded to the underlying loader's ``load_bytes``.

    This guards the "pop from kwargs before forwarding" behavior.
    """

    @VIDEO_LOADER_REGISTRY.register("test_reject_video_backend_kwarg")
    class RejectVideoBackendKwargLoader(VideoLoader):
        """Test loader that fails if video_backend is passed through."""

        @classmethod
        def load_bytes(
            cls, data: bytes, num_frames: int = -1, **kwargs
        ) -> tuple[npt.NDArray, dict]:
            # This should never receive video_backend in kwargs
            if "video_backend" in kwargs:
                raise AssertionError(
                    "video_backend should be consumed by VideoMediaIO, "
                    "not passed to loader"
                )
            return FAKE_OUTPUT_1, {"received_kwargs": list(kwargs.keys())}

    with monkeypatch.context() as ctx:
        ctx.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_reject_video_backend_kwarg")
        image_io = ImageMediaIO()

        # video_backend must be swallowed by VideoMediaIO; every other
        # custom kwarg must still reach the loader untouched.
        video_io = VideoMediaIO(
            image_io,
            num_frames=10,
            video_backend="test_reject_video_backend_kwarg",
            other_kwarg="should_pass_through",
        )

        # Must not trip the loader's AssertionError.
        frames, meta = video_io.load_bytes(b"test")
        np.testing.assert_array_equal(frames, FAKE_OUTPUT_1)
        assert "other_kwarg" in meta["received_kwargs"]
def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch):
    """
    Verify that VideoMediaIO falls back to the VLLM_VIDEO_LOADER_BACKEND env
    var whenever the ``video_backend`` kwarg is absent or explicitly ``None``.
    """
    with monkeypatch.context() as ctx:
        ctx.setenv("VLLM_VIDEO_LOADER_BACKEND", "test_video_backend_override_2")
        image_io = ImageMediaIO()

        # Both "explicit None" and "not provided" must use the env var.
        for extra_kwargs in ({"video_backend": None}, {}):
            video_io = VideoMediaIO(image_io, num_frames=10, **extra_kwargs)
            frames, meta = video_io.load_bytes(b"test")
            np.testing.assert_array_equal(frames, FAKE_OUTPUT_2)
            assert meta["video_backend"] == "test_video_backend_override_2"
# ============================================================================
# Frame Recovery Tests
# ============================================================================

View File

@@ -27,7 +27,7 @@ ALLOWED_FILES = {
"vllm/distributed/device_communicators/shm_broadcast.py",
"vllm/distributed/device_communicators/shm_object_storage.py",
"vllm/utils/hashing.py",
"tests/multimodal/test_image.py",
"tests/multimodal/media/test_base.py",
"tests/tokenizers_/test_hf.py",
"tests/utils_/test_hashing.py",
"benchmarks/kernels/graph_machete_bench.py",

View File

@@ -1,32 +1,20 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
from dataclasses import dataclass
from enum import Enum
from io import BytesIO
from pathlib import Path
from typing import Literal
import numpy as np
import numpy.typing as npt
import pybase64
import torch
from vllm.utils.import_utils import PlaceholderModule
from vllm.utils.serial_utils import tensor2base64
from .base import MediaIO
try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
try:
import soundfile
except ImportError:
soundfile = PlaceholderModule("soundfile") # type: ignore[assignment]
try:
import scipy.signal as scipy_signal
@@ -220,68 +208,3 @@ class AudioResampler:
f"Invalid resampling method: {self.method}. "
"Supported methods are 'librosa' and 'scipy'."
)
class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
    """Load and encode raw audio as ``(samples, sample_rate)`` pairs."""

    def __init__(self, **kwargs) -> None:
        super().__init__()
        # Custom arguments from --media-io-kwargs for this modality; kept so
        # they can be passed to the underlying media loaders (e.g. custom
        # implementations) for flexible control.
        self.kwargs = kwargs

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
        """Decode an in-memory audio file at its native sample rate."""
        return librosa.load(BytesIO(data), sr=None)

    def load_base64(
        self,
        media_type: str,
        data: str,
    ) -> tuple[npt.NDArray, float]:
        """Decode a base64-encoded audio file."""
        raw = base64.b64decode(data)
        return self.load_bytes(raw)

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
        """Decode an audio file on disk at its native sample rate."""
        return librosa.load(filepath, sr=None)

    def encode_base64(
        self,
        media: tuple[npt.NDArray, int],
        *,
        audio_format: str = "WAV",
    ) -> str:
        """Serialize ``(audio, sample_rate)`` into a base64 string."""
        samples, sample_rate = media
        with BytesIO() as out:
            soundfile.write(out, samples, sample_rate, format=audio_format)
            encoded = out.getvalue()
        return base64.b64encode(encoded).decode("utf-8")
class AudioEmbeddingMediaIO(MediaIO[torch.Tensor]):
    """Load precomputed audio embeddings stored as serialized tensors."""

    def __init__(self) -> None:
        super().__init__()

    @staticmethod
    def _load_dense_tensor(source) -> torch.Tensor:
        """Safely deserialize a tensor and densify it."""
        # Enable sparse tensor integrity checks to prevent out-of-bounds
        # writes from maliciously crafted tensors
        with torch.sparse.check_sparse_tensor_invariants():
            loaded = torch.load(source, weights_only=True)
        return loaded.to_dense()

    def load_bytes(self, data: bytes) -> torch.Tensor:
        return self._load_dense_tensor(BytesIO(data))

    def load_base64(self, media_type: str, data: str) -> torch.Tensor:
        return self.load_bytes(pybase64.b64decode(data, validate=True))

    def load_file(self, filepath: Path) -> torch.Tensor:
        return self._load_dense_tensor(filepath)

    def encode_base64(self, media: torch.Tensor) -> str:
        return tensor2base64(media)

View File

@@ -12,7 +12,7 @@ from PIL import Image
from vllm.logger import init_logger
from .base import MediaWithBytes
from .media import MediaWithBytes
logger = init_logger(__name__)

View File

@@ -1,19 +1,8 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from io import BytesIO
from pathlib import Path
import pybase64
import torch
from PIL import Image
from vllm.logger import init_logger
from .base import MediaIO, MediaWithBytes
logger = init_logger(__file__)
def rescale_image_size(
image: Image.Image, size_factor: float, transpose: int = -1
@@ -45,111 +34,3 @@ def convert_image_mode(image: Image.Image, to_mode: str):
return rgba_to_rgb(image)
else:
return image.convert(to_mode)
class ImageMediaIO(MediaIO[Image.Image]):
    """Load and encode PIL images, converting them to a target mode."""

    def __init__(self, image_mode: str = "RGB", **kwargs) -> None:
        """
        Args:
            image_mode: PIL mode every loaded image is converted to.
            **kwargs: Custom arguments from --media-io-kwargs for this
                modality. They can be passed to the underlying media loaders
                (e.g. custom implementations) for flexible control. The
                special key ``rgba_background_color`` (list or tuple of 3
                ints in [0, 255]; default white) selects the background used
                when flattening RGBA images to RGB.

        Raises:
            ValueError: If ``rgba_background_color`` is malformed.
        """
        super().__init__()
        self.image_mode = image_mode
        self.kwargs = kwargs
        # Extract RGBA background color from kwargs if provided
        # Default to white background for backward compatibility
        rgba_bg = kwargs.get("rgba_background_color", (255, 255, 255))
        # Convert list to tuple for consistency
        if isinstance(rgba_bg, list):
            rgba_bg = tuple(rgba_bg)
        # Validate rgba_background_color format
        if not (
            isinstance(rgba_bg, tuple)
            and len(rgba_bg) == 3
            and all(isinstance(c, int) and 0 <= c <= 255 for c in rgba_bg)
        ):
            raise ValueError(
                "rgba_background_color must be a list or tuple of 3 integers "
                "in the range [0, 255]."
            )
        self.rgba_background_color = rgba_bg

    def _convert_image_mode(
        self, image: Image.Image | MediaWithBytes[Image.Image]
    ) -> Image.Image:
        """Convert image mode with custom background color."""
        if isinstance(image, MediaWithBytes):
            image = image.media
        if image.mode == self.image_mode:
            return image
        elif image.mode == "RGBA" and self.image_mode == "RGB":
            return rgba_to_rgb(image, self.rgba_background_color)
        else:
            return convert_image_mode(image, self.image_mode)

    def load_bytes(self, data: bytes) -> MediaWithBytes[Image.Image]:
        """Decode raw image bytes, keeping the original bytes alongside."""
        image = Image.open(BytesIO(data))
        return MediaWithBytes(self._convert_image_mode(image), data)

    def load_base64(self, media_type: str, data: str) -> MediaWithBytes[Image.Image]:
        """Decode a base64-encoded image (``media_type`` is unused here)."""
        return self.load_bytes(pybase64.b64decode(data, validate=True))

    def load_file(self, filepath: Path) -> MediaWithBytes[Image.Image]:
        """Decode an image file, keeping the original bytes alongside."""
        with open(filepath, "rb") as f:
            data = f.read()
        image = Image.open(BytesIO(data))
        return MediaWithBytes(self._convert_image_mode(image), data)

    def encode_base64(
        self,
        media: Image.Image,
        *,
        image_format: str | None = None,
    ) -> str:
        """Serialize an image to a base64 string in ``image_format``."""
        if image_format is None:
            # BUGFIX: the keyword is `image_format`, not `format` — the
            # warning must name the real parameter, otherwise following its
            # advice raises TypeError instead of silencing the warning.
            logger.warning_once(
                "The default format of `ImageMediaIO.encode_base64` will be changed "
                'from "JPEG" to "PNG" in v0.15 to avoid lossy compression. '
                "To continue using the old default, "
                'pass `image_format="JPEG"` explicitly to silence this warning.'
            )
            image_format = "JPEG"
        image = media
        with BytesIO() as buffer:
            image = self._convert_image_mode(image)
            image.save(buffer, image_format)
            data = buffer.getvalue()
        return pybase64.b64encode(data).decode("utf-8")
class ImageEmbeddingMediaIO(MediaIO[torch.Tensor]):
    """Load precomputed image embeddings stored as serialized tensors."""

    def __init__(self) -> None:
        super().__init__()

    @staticmethod
    def _load_dense_tensor(source) -> torch.Tensor:
        """Safely deserialize a tensor and densify it."""
        # Enable sparse tensor integrity checks to prevent out-of-bounds
        # writes from maliciously crafted tensors
        with torch.sparse.check_sparse_tensor_invariants():
            loaded = torch.load(source, weights_only=True)
        return loaded.to_dense()

    def load_bytes(self, data: bytes) -> torch.Tensor:
        return self._load_dense_tensor(BytesIO(data))

    def load_base64(self, media_type: str, data: str) -> torch.Tensor:
        return self.load_bytes(pybase64.b64decode(data, validate=True))

    def load_file(self, filepath: Path) -> torch.Tensor:
        return self._load_dense_tensor(filepath)

    def encode_base64(self, media: torch.Tensor) -> str:
        return pybase64.b64encode(media.numpy()).decode("utf-8")

View File

@@ -32,7 +32,7 @@ if TYPE_CHECKING:
from PIL.Image import Image
from transformers.feature_extraction_utils import BatchFeature
from .base import MediaWithBytes
from .media import MediaWithBytes
else:
torch = LazyLoader("torch", globals(), "torch")

View File

@@ -0,0 +1,16 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from .audio import AudioEmbeddingMediaIO, AudioMediaIO
from .base import MediaIO, MediaWithBytes
from .image import ImageEmbeddingMediaIO, ImageMediaIO
from .video import VideoMediaIO
__all__ = [
"MediaIO",
"MediaWithBytes",
"AudioEmbeddingMediaIO",
"AudioMediaIO",
"ImageEmbeddingMediaIO",
"ImageMediaIO",
"VideoMediaIO",
]

View File

@@ -0,0 +1,89 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
from io import BytesIO
from pathlib import Path
import numpy.typing as npt
import pybase64
import torch
from vllm.utils.import_utils import PlaceholderModule
from vllm.utils.serial_utils import tensor2base64
from .base import MediaIO
try:
import librosa
except ImportError:
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
try:
import soundfile
except ImportError:
soundfile = PlaceholderModule("soundfile") # type: ignore[assignment]
class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
    """Load and encode raw audio as ``(samples, sample_rate)`` pairs."""

    def __init__(self, **kwargs) -> None:
        super().__init__()
        # Custom arguments from --media-io-kwargs for this modality; kept so
        # they can be passed to the underlying media loaders (e.g. custom
        # implementations) for flexible control.
        self.kwargs = kwargs

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
        """Decode an in-memory audio file at its native sample rate."""
        return librosa.load(BytesIO(data), sr=None)

    def load_base64(
        self,
        media_type: str,
        data: str,
    ) -> tuple[npt.NDArray, float]:
        """Decode a base64-encoded audio file."""
        raw = base64.b64decode(data)
        return self.load_bytes(raw)

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
        """Decode an audio file on disk at its native sample rate."""
        return librosa.load(filepath, sr=None)

    def encode_base64(
        self,
        media: tuple[npt.NDArray, int],
        *,
        audio_format: str = "WAV",
    ) -> str:
        """Serialize ``(audio, sample_rate)`` into a base64 string."""
        samples, sample_rate = media
        with BytesIO() as out:
            soundfile.write(out, samples, sample_rate, format=audio_format)
            encoded = out.getvalue()
        return base64.b64encode(encoded).decode("utf-8")
class AudioEmbeddingMediaIO(MediaIO[torch.Tensor]):
    """Load precomputed audio embeddings stored as serialized tensors."""

    def __init__(self) -> None:
        super().__init__()

    @staticmethod
    def _load_dense_tensor(source) -> torch.Tensor:
        """Safely deserialize a tensor and densify it."""
        # Enable sparse tensor integrity checks to prevent out-of-bounds
        # writes from maliciously crafted tensors
        with torch.sparse.check_sparse_tensor_invariants():
            loaded = torch.load(source, weights_only=True)
        return loaded.to_dense()

    def load_bytes(self, data: bytes) -> torch.Tensor:
        return self._load_dense_tensor(BytesIO(data))

    def load_base64(self, media_type: str, data: str) -> torch.Tensor:
        return self.load_bytes(pybase64.b64decode(data, validate=True))

    def load_file(self, filepath: Path) -> torch.Tensor:
        return self._load_dense_tensor(filepath)

    def encode_base64(self, media: torch.Tensor) -> str:
        return tensor2base64(media)

View File

@@ -4,7 +4,7 @@
from abc import ABC, abstractmethod
from dataclasses import dataclass
from pathlib import Path
from typing import Generic, TypeVar
from typing import Any, Generic, TypeVar
import numpy as np
@@ -32,13 +32,14 @@ class MediaWithBytes(Generic[_T]):
"""Allow np.array(obj) to return np.array(obj.media)."""
return np.array(self.media, *args, **kwargs)
def __getstate__(self):
return self.__dict__.copy()
def __setstate__(self, state: dict[str, Any]):
self.__dict__.update(state)
def __getattr__(self, name: str):
"""Delegate attribute access to the underlying media object."""
# Guard against recursion during unpickling when media isn't set yet.
# pickle creates objects without calling __init__, so self.media may
# not exist when __getattr__ is called for methods like __setstate__.
if "media" not in self.__dict__:
raise AttributeError(name)
return getattr(self.media, name)

View File

@@ -0,0 +1,124 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from io import BytesIO
from pathlib import Path
import pybase64
import torch
from PIL import Image
from vllm.logger import init_logger
from ..image import convert_image_mode, rgba_to_rgb
from .base import MediaIO, MediaWithBytes
logger = init_logger(__file__)
class ImageMediaIO(MediaIO[Image.Image]):
    """Load and encode PIL images, converting them to a target mode."""

    def __init__(self, image_mode: str = "RGB", **kwargs) -> None:
        """
        Args:
            image_mode: PIL mode every loaded image is converted to.
            **kwargs: Custom arguments from --media-io-kwargs for this
                modality. They can be passed to the underlying media loaders
                (e.g. custom implementations) for flexible control. The
                special key ``rgba_background_color`` (list or tuple of 3
                ints in [0, 255]; default white) selects the background used
                when flattening RGBA images to RGB.

        Raises:
            ValueError: If ``rgba_background_color`` is malformed.
        """
        super().__init__()
        self.image_mode = image_mode
        self.kwargs = kwargs
        # Extract RGBA background color from kwargs if provided
        # Default to white background for backward compatibility
        rgba_bg = kwargs.get("rgba_background_color", (255, 255, 255))
        # Convert list to tuple for consistency
        if isinstance(rgba_bg, list):
            rgba_bg = tuple(rgba_bg)
        # Validate rgba_background_color format
        if not (
            isinstance(rgba_bg, tuple)
            and len(rgba_bg) == 3
            and all(isinstance(c, int) and 0 <= c <= 255 for c in rgba_bg)
        ):
            raise ValueError(
                "rgba_background_color must be a list or tuple of 3 integers "
                "in the range [0, 255]."
            )
        self.rgba_background_color = rgba_bg

    def _convert_image_mode(
        self, image: Image.Image | MediaWithBytes[Image.Image]
    ) -> Image.Image:
        """Convert image mode with custom background color."""
        if isinstance(image, MediaWithBytes):
            image = image.media
        if image.mode == self.image_mode:
            return image
        elif image.mode == "RGBA" and self.image_mode == "RGB":
            return rgba_to_rgb(image, self.rgba_background_color)
        else:
            return convert_image_mode(image, self.image_mode)

    def load_bytes(self, data: bytes) -> MediaWithBytes[Image.Image]:
        """Decode raw image bytes, keeping the original bytes alongside."""
        image = Image.open(BytesIO(data))
        return MediaWithBytes(self._convert_image_mode(image), data)

    def load_base64(self, media_type: str, data: str) -> MediaWithBytes[Image.Image]:
        """Decode a base64-encoded image (``media_type`` is unused here)."""
        return self.load_bytes(pybase64.b64decode(data, validate=True))

    def load_file(self, filepath: Path) -> MediaWithBytes[Image.Image]:
        """Decode an image file, keeping the original bytes alongside."""
        with open(filepath, "rb") as f:
            data = f.read()
        image = Image.open(BytesIO(data))
        return MediaWithBytes(self._convert_image_mode(image), data)

    def encode_base64(
        self,
        media: Image.Image,
        *,
        image_format: str | None = None,
    ) -> str:
        """Serialize an image to a base64 string in ``image_format``."""
        if image_format is None:
            # BUGFIX: the keyword is `image_format`, not `format` — the
            # warning must name the real parameter, otherwise following its
            # advice raises TypeError instead of silencing the warning.
            logger.warning_once(
                "The default format of `ImageMediaIO.encode_base64` will be changed "
                'from "JPEG" to "PNG" in v0.15 to avoid lossy compression. '
                "To continue using the old default, "
                'pass `image_format="JPEG"` explicitly to silence this warning.'
            )
            image_format = "JPEG"
        image = media
        with BytesIO() as buffer:
            image = self._convert_image_mode(image)
            image.save(buffer, image_format)
            data = buffer.getvalue()
        return pybase64.b64encode(data).decode("utf-8")


class ImageEmbeddingMediaIO(MediaIO[torch.Tensor]):
    """Load precomputed image embeddings stored as serialized tensors."""

    def __init__(self) -> None:
        super().__init__()

    def load_bytes(self, data: bytes) -> torch.Tensor:
        buffer = BytesIO(data)
        # Enable sparse tensor integrity checks to prevent out-of-bounds
        # writes from maliciously crafted tensors
        with torch.sparse.check_sparse_tensor_invariants():
            tensor = torch.load(buffer, weights_only=True)
        return tensor.to_dense()

    def load_base64(self, media_type: str, data: str) -> torch.Tensor:
        return self.load_bytes(pybase64.b64decode(data, validate=True))

    def load_file(self, filepath: Path) -> torch.Tensor:
        # Enable sparse tensor integrity checks to prevent out-of-bounds
        # writes from maliciously crafted tensors
        with torch.sparse.check_sparse_tensor_invariants():
            tensor = torch.load(filepath, weights_only=True)
        return tensor.to_dense()

    def encode_base64(self, media: torch.Tensor) -> str:
        return pybase64.b64encode(media.numpy()).decode("utf-8")

View File

@@ -0,0 +1,89 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
from functools import partial
from pathlib import Path
from typing import Any
import numpy as np
import numpy.typing as npt
from PIL import Image
from vllm import envs
from ..video import VIDEO_LOADER_REGISTRY
from .base import MediaIO
from .image import ImageMediaIO
class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]):
    """Load and encode videos as ``(frames, metadata)`` pairs."""

    def __init__(
        self,
        image_io: ImageMediaIO,
        num_frames: int = 32,
        **kwargs,
    ) -> None:
        """
        Args:
            image_io: Used to decode/encode individual JPEG frames.
            num_frames: Number of frames requested from the loader.
            **kwargs: Custom arguments from --media-io-kwargs for this
                modality. They can be passed to the underlying media loaders
                (e.g. custom implementations) for flexible control. The
                special key ``video_backend`` selects the loader backend for
                this instance, overriding the global
                VLLM_VIDEO_LOADER_BACKEND env var, e.g.:
                --media-io-kwargs '{"video": {"video_backend": "torchcodec"}}'
        """
        super().__init__()
        self.image_io = image_io
        self.num_frames = num_frames
        # Consume `video_backend` here so it is never forwarded to the
        # loader; fall back to the env var when it is absent or None.
        backend = kwargs.pop("video_backend", None) or envs.VLLM_VIDEO_LOADER_BACKEND
        self.kwargs = kwargs
        self.video_loader = VIDEO_LOADER_REGISTRY.load(backend)

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode an in-memory video file via the selected backend."""
        return self.video_loader.load_bytes(
            data, num_frames=self.num_frames, **self.kwargs
        )

    def load_base64(
        self, media_type: str, data: str
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode base64 video data.

        ``"video/jpeg"`` is treated as a comma-joined list of base64 JPEG
        frames; anything else is decoded as a whole video file.
        """
        if media_type.lower() != "video/jpeg":
            return self.load_bytes(base64.b64decode(data))
        decode_frame = partial(
            self.image_io.load_base64,
            "image/jpeg",
        )
        frames = [np.asarray(decode_frame(chunk)) for chunk in data.split(",")]
        return np.stack(frames), {}

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode a video file on disk."""
        return self.load_bytes(filepath.read_bytes())

    def encode_base64(
        self,
        media: npt.NDArray,
        *,
        video_format: str = "JPEG",
    ) -> str:
        """Encode frames as a comma-joined list of base64 JPEG images."""
        if video_format != "JPEG":
            raise NotImplementedError("Only JPEG format is supported for now.")
        encode_frame = partial(
            self.image_io.encode_base64,
            image_format=video_format,
        )
        return ",".join(encode_frame(Image.fromarray(frame)) for frame in media)

View File

@@ -23,7 +23,6 @@ from vllm.utils.collection_utils import is_list_of
from vllm.utils.import_utils import LazyLoader
from .audio import AudioResampler, AudioSpec, normalize_audio
from .base import MediaWithBytes
from .inputs import (
AudioItem,
HfAudioItem,
@@ -36,6 +35,7 @@ from .inputs import (
MultiModalKwargsItems,
VideoItem,
)
from .media import MediaWithBytes
_T = TypeVar("_T")
_I = TypeVar("_I")

View File

@@ -22,10 +22,14 @@ from vllm.connections import HTTPConnection, global_http_connection
from vllm.logger import init_logger
from vllm.utils.registry import ExtensionManager
from .audio import AudioEmbeddingMediaIO, AudioMediaIO
from .base import MediaIO
from .image import ImageEmbeddingMediaIO, ImageMediaIO
from .video import VideoMediaIO
from .media import (
AudioEmbeddingMediaIO,
AudioMediaIO,
ImageEmbeddingMediaIO,
ImageMediaIO,
MediaIO,
VideoMediaIO,
)
if TYPE_CHECKING:
from .inputs import (

View File

@@ -1,27 +1,19 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import base64
import math
from abc import abstractmethod
from functools import partial
from io import BytesIO
from pathlib import Path
from typing import TYPE_CHECKING, Any, cast
import numpy as np
import numpy.typing as npt
from PIL import Image
if TYPE_CHECKING:
import cv2
from vllm import envs
from vllm.logger import init_logger
from vllm.utils.registry import ExtensionManager
from .base import MediaIO
from .image import ImageMediaIO
logger = init_logger(__name__)
@@ -755,76 +747,3 @@ class Molmo2VideoBackend(VideoLoader):
**kwargs,
)
return out
class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]):
    """Load and encode videos as ``(frames, metadata)`` pairs."""

    def __init__(
        self,
        image_io: ImageMediaIO,
        num_frames: int = 32,
        **kwargs,
    ) -> None:
        """
        Args:
            image_io: Used to decode/encode individual JPEG frames.
            num_frames: Number of frames requested from the loader.
            **kwargs: Custom arguments from --media-io-kwargs for this
                modality. They can be passed to the underlying media loaders
                (e.g. custom implementations) for flexible control. The
                special key ``video_backend`` selects the loader backend for
                this instance, overriding the global
                VLLM_VIDEO_LOADER_BACKEND env var, e.g.:
                --media-io-kwargs '{"video": {"video_backend": "torchcodec"}}'
        """
        super().__init__()
        self.image_io = image_io
        self.num_frames = num_frames
        # Consume `video_backend` here so it is never forwarded to the
        # loader; fall back to the env var when it is absent or None.
        backend = kwargs.pop("video_backend", None) or envs.VLLM_VIDEO_LOADER_BACKEND
        self.kwargs = kwargs
        self.video_loader = VIDEO_LOADER_REGISTRY.load(backend)

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode an in-memory video file via the selected backend."""
        return self.video_loader.load_bytes(
            data, num_frames=self.num_frames, **self.kwargs
        )

    def load_base64(
        self, media_type: str, data: str
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode base64 video data.

        ``"video/jpeg"`` is treated as a comma-joined list of base64 JPEG
        frames; anything else is decoded as a whole video file.
        """
        if media_type.lower() != "video/jpeg":
            return self.load_bytes(base64.b64decode(data))
        decode_frame = partial(
            self.image_io.load_base64,
            "image/jpeg",
        )
        frames = [np.asarray(decode_frame(chunk)) for chunk in data.split(",")]
        return np.stack(frames), {}

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode a video file on disk."""
        return self.load_bytes(filepath.read_bytes())

    def encode_base64(
        self,
        media: npt.NDArray,
        *,
        video_format: str = "JPEG",
    ) -> str:
        """Encode frames as a comma-joined list of base64 JPEG images."""
        if video_format != "JPEG":
            raise NotImplementedError("Only JPEG format is supported for now.")
        encode_frame = partial(
            self.image_io.encode_base64,
            image_format=video_format,
        )
        return ",".join(encode_frame(Image.fromarray(frame)) for frame in media)