[3/N] Group together media-related code (#32406)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
@@ -63,7 +63,7 @@ from vllm.distributed import (
|
||||
)
|
||||
from vllm.logger import init_logger
|
||||
from vllm.logprobs import Logprob
|
||||
from vllm.multimodal.base import MediaWithBytes
|
||||
from vllm.multimodal.media import MediaWithBytes
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import BeamSearchParams
|
||||
|
||||
@@ -14,8 +14,7 @@ import pytest
|
||||
import torch
|
||||
|
||||
from vllm.entrypoints.renderer import CompletionRenderer
|
||||
from vllm.multimodal.audio import AudioEmbeddingMediaIO
|
||||
from vllm.multimodal.image import ImageEmbeddingMediaIO
|
||||
from vllm.multimodal.media import AudioEmbeddingMediaIO, ImageEmbeddingMediaIO
|
||||
|
||||
|
||||
def _encode_tensor(tensor: torch.Tensor) -> bytes:
|
||||
|
||||
@@ -8,7 +8,7 @@ import pytest
|
||||
import pytest_asyncio
|
||||
from transformers import AutoProcessor
|
||||
|
||||
from vllm.multimodal.base import MediaWithBytes
|
||||
from vllm.multimodal.media import MediaWithBytes
|
||||
from vllm.multimodal.utils import encode_image_url, fetch_image
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
|
||||
@@ -9,7 +9,7 @@ from transformers import AutoProcessor
|
||||
|
||||
from tests.utils import VLLM_PATH, RemoteOpenAIServer
|
||||
from vllm.entrypoints.pooling.embed.protocol import EmbeddingResponse
|
||||
from vllm.multimodal.base import MediaWithBytes
|
||||
from vllm.multimodal.media import MediaWithBytes
|
||||
from vllm.multimodal.utils import fetch_image
|
||||
|
||||
MODEL_NAME = "TIGER-Lab/VLM2Vec-Full"
|
||||
|
||||
0
tests/multimodal/media/__init__.py
Normal file
0
tests/multimodal/media/__init__.py
Normal file
73
tests/multimodal/media/test_audio.py
Normal file
73
tests/multimodal/media/test_audio.py
Normal file
@@ -0,0 +1,73 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import base64
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal.media import AudioMediaIO
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
ASSETS_DIR = Path(__file__).parent.parent / "assets"
|
||||
assert ASSETS_DIR.exists()
|
||||
|
||||
|
||||
@pytest.fixture
def dummy_audio():
    """Provide a five-sample float waveform for the audio tests."""
    samples = [0.0, 0.1, 0.2, 0.3, 0.4]
    return np.array(samples, dtype=float)
|
||||
|
||||
|
||||
@pytest.fixture
def dummy_audio_bytes():
    """Provide an opaque byte payload used as stand-in audio data."""
    payload = b"FAKEAUDIOBYTES"
    return payload
|
||||
|
||||
|
||||
def test_audio_media_io_load_bytes(dummy_audio_bytes):
    """Check that load_bytes calls librosa.load once and returns its result."""
    reader = AudioMediaIO()
    stub_result = (np.array([0.1, 0.2]), 16000)
    with patch("librosa.load", return_value=stub_result) as librosa_load:
        result = reader.load_bytes(dummy_audio_bytes)
        librosa_load.assert_called_once()
    assert isinstance(result[0], np.ndarray)
    assert result[1] == 16000
|
||||
|
||||
|
||||
def test_audio_media_io_load_base64(dummy_audio_bytes):
    """Check that load_base64 calls load_bytes once and returns its result."""
    reader = AudioMediaIO()
    payload_b64 = base64.b64encode(dummy_audio_bytes).decode("utf-8")
    stub_result = (np.array([0.1, 0.2]), 16000)
    with patch.object(
        AudioMediaIO, "load_bytes", return_value=stub_result
    ) as load_bytes_mock:
        result = reader.load_base64("audio/wav", payload_b64)
        load_bytes_mock.assert_called_once()
    assert isinstance(result[0], np.ndarray)
    assert result[1] == 16000
|
||||
|
||||
|
||||
def test_audio_media_io_load_file():
    """Check that load_file passes the path to librosa.load with sr=None."""
    reader = AudioMediaIO()
    wav_path = Path("/fake/path.wav")
    stub_result = (np.array([0.1, 0.2]), 16000)
    with patch("librosa.load", return_value=stub_result) as librosa_load:
        result = reader.load_file(wav_path)
        librosa_load.assert_called_once_with(wav_path, sr=None)
    assert isinstance(result[0], np.ndarray)
    assert result[1] == 16000
|
||||
|
||||
|
||||
def test_audio_media_io_encode_base64(dummy_audio):
    """Check that encode_base64 returns base64 of what soundfile.write emits."""

    def write_to_buffer(buffer, *_args, **_kwargs):
        # Stand-in for soundfile.write: put a fixed payload into the
        # first positional argument.
        buffer.write(b"dummy_wav_data")

    encoder = AudioMediaIO()
    waveform_and_rate = (dummy_audio, 16000)
    with patch("soundfile.write", side_effect=write_to_buffer) as write_mock:
        encoded = encoder.encode_base64(waveform_and_rate)
        assert base64.b64decode(encoded) == b"dummy_wav_data"
        write_mock.assert_called_once()
|
||||
45
tests/multimodal/media/test_base.py
Normal file
45
tests/multimodal/media/test_base.py
Normal file
@@ -0,0 +1,45 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.multimodal.media import MediaWithBytes
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
ASSETS_DIR = Path(__file__).parent.parent / "assets"
|
||||
assert ASSETS_DIR.exists()
|
||||
|
||||
|
||||
def test_media_with_bytes_pickle_roundtrip():
    """Regression test: MediaWithBytes must survive a pickle round trip.

    Pickling used to end in RecursionError.
    See: https://github.com/vllm-project/vllm/issues/30818
    """
    raw_bytes = b"test_bytes_data"
    image = Image.open(ASSETS_DIR / "image1.png").convert("RGB")
    wrapped = MediaWithBytes(media=image, original_bytes=raw_bytes)

    delegated_attrs = ("width", "height", "mode")

    # Attribute access on the wrapper is forwarded to the wrapped image.
    for attr in delegated_attrs:
        assert getattr(wrapped, attr) == getattr(image, attr)

    # The round trip itself is the regression being guarded.
    restored = pickle.loads(pickle.dumps(wrapped))

    # Payload and wrapped media come back intact.
    assert restored.original_bytes == raw_bytes
    for attr in ("width", "height"):
        assert getattr(restored.media, attr) == getattr(image, attr)

    # Delegation keeps working on the restored wrapper.
    for attr in delegated_attrs:
        assert getattr(restored, attr) == getattr(image, attr)
|
||||
133
tests/multimodal/media/test_image.py
Normal file
133
tests/multimodal/media/test_image.py
Normal file
@@ -0,0 +1,133 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.multimodal.media import ImageMediaIO
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
ASSETS_DIR = Path(__file__).parent.parent / "assets"
|
||||
assert ASSETS_DIR.exists()
|
||||
|
||||
|
||||
def test_image_media_io_rgba_custom_background(tmp_path):
    """Test RGBA images loaded by ImageMediaIO with various background colors."""

    def rgb_at(pixels, row, col):
        # First three channel values of one pixel as plain ints.
        return tuple(int(channel) for channel in pixels[row][col][:3])

    # 10x10 opaque red canvas whose top-left 5x5 corner is fully transparent.
    canvas = Image.new("RGBA", (10, 10), (255, 0, 0, 255))
    for x in range(5):
        for y in range(5):
            canvas.putpixel((x, y), (0, 0, 0, 0))

    png_path = tmp_path / "test_rgba.png"
    canvas.save(png_path)

    red = (255, 0, 0)

    # Case 1: no background given -> transparent area is white.
    default_pixels = np.array(ImageMediaIO().load_file(png_path))
    assert rgb_at(default_pixels, 0, 0) == (255, 255, 255)
    assert rgb_at(default_pixels, 5, 5) == red

    # Case 2: black background passed as a tuple.
    black_io = ImageMediaIO(rgba_background_color=(0, 0, 0))
    black_pixels = np.array(black_io.load_file(png_path))
    assert rgb_at(black_pixels, 0, 0) == (0, 0, 0)
    assert rgb_at(black_pixels, 5, 5) == red

    # Case 3: blue background passed as a list.
    blue_io = ImageMediaIO(rgba_background_color=[0, 0, 255])
    blue_pixels = np.array(blue_io.load_file(png_path))
    assert rgb_at(blue_pixels, 0, 0) == (0, 0, 255)

    # Case 4: green background, going through load_bytes instead of load_file.
    green_io = ImageMediaIO(rgba_background_color=(0, 255, 0))
    green_pixels = np.array(green_io.load_bytes(png_path.read_bytes()))
    assert rgb_at(green_pixels, 0, 0) == (0, 255, 0)
|
||||
|
||||
|
||||
def test_image_media_io_rgba_background_color_validation():
    """Test that ImageMediaIO rejects malformed rgba_background_color values."""
    expected_message = "rgba_background_color must be a list or tuple"

    rejected_values = [
        # Wrong type
        "255,255,255",
        255,
        # Wrong number of elements
        (255, 255),
        (255, 255, 255, 255),
        # Non-integer elements
        (255.0, 255.0, 255.0),
        (255, "255", 255),
        # Out-of-range elements
        (256, 255, 255),
        (255, -1, 255),
    ]
    for bad_value in rejected_values:
        with pytest.raises(ValueError, match=expected_message):
            ImageMediaIO(rgba_background_color=bad_value)

    # Well-formed values must construct without raising.
    for good_value in ((0, 0, 0), [255, 255, 255], (128, 128, 128)):
        ImageMediaIO(rgba_background_color=good_value)
|
||||
237
tests/multimodal/media/test_video.py
Normal file
237
tests/multimodal/media/test_video.py
Normal file
@@ -0,0 +1,237 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.assets.base import get_vllm_public_assets
|
||||
from vllm.assets.video import video_to_ndarrays, video_to_pil_images_list
|
||||
from vllm.multimodal.media import ImageMediaIO, VideoMediaIO
|
||||
from vllm.multimodal.video import VIDEO_LOADER_REGISTRY, VideoLoader
|
||||
|
||||
from ..utils import cosine_similarity, create_video_from_image, normalize_image
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
ASSETS_DIR = Path(__file__).parent.parent / "assets"
|
||||
assert ASSETS_DIR.exists()
|
||||
|
||||
|
||||
@VIDEO_LOADER_REGISTRY.register("assert_10_frames_1_fps")
class Assert10Frames1FPSVideoLoader(VideoLoader):
    """Stub loader that asserts it receives num_frames=10 and fps=1.0."""

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        fps: float = -1.0,
        **kwargs,
    ) -> npt.NDArray:
        """Validate the forwarded arguments, then return FAKE_OUTPUT_2."""
        assert num_frames == 10, "bad num_frames"
        assert fps == 1.0, "bad fps"
        return FAKE_OUTPUT_2
|
||||
|
||||
|
||||
def test_video_media_io_kwargs(monkeypatch: pytest.MonkeyPatch):
    """Check that VideoMediaIO kwargs reach the configured loader's load_bytes."""
    accepted_kwargs = (
        {"num_frames": 10, "fps": 1.0},
        {"num_frames": 10, "fps": 1.0, "not_used": "not_used"},
    )
    rejected_kwargs = (
        ({}, "bad num_frames"),
        ({"num_frames": 9, "fps": 1.0}, "bad num_frames"),
        ({"num_frames": 10, "fps": 2.0}, "bad fps"),
    )

    with monkeypatch.context() as ctx:
        ctx.setenv("VLLM_VIDEO_LOADER_BACKEND", "assert_10_frames_1_fps")
        image_io = ImageMediaIO()

        # These satisfy the loader's assertions.
        for io_kwargs in accepted_kwargs:
            video_io = VideoMediaIO(image_io, **io_kwargs)
            video_io.load_bytes(b"test")

        # These must trip the named assertion inside the loader.
        for io_kwargs, message in rejected_kwargs:
            with pytest.raises(AssertionError, match=message):
                video_io = VideoMediaIO(image_io, **io_kwargs)
                video_io.load_bytes(b"test")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("is_color", [True, False])
@pytest.mark.parametrize("fourcc, ext", [("mp4v", "mp4"), ("XVID", "avi")])
def test_opencv_video_io_colorspace(tmp_path, is_color: bool, fourcc: str, ext: str):
    """
    Test that every OpenCV-backed video reader yields RGB frames.

    Covers both a color source image and a grayscale one.
    """
    reference_path = get_vllm_public_assets(
        filename="stop_sign.jpg", s3_prefix="vision_model_images"
    )
    reference = Image.open(reference_path)

    if not is_color:
        reference_path = f"{tmp_path}/test_grayscale_image.png"
        grayscale = reference.convert("L")
        grayscale.save(reference_path)
        # Compare against the gray image expanded back to RGB.
        reference = grayscale.convert("RGB")

    clip_path = f"{tmp_path}/test_RGB_video.{ext}"
    create_video_from_image(
        reference_path,
        clip_path,
        num_frames=2,
        is_color=is_color,
        fourcc=fourcc,
    )

    def assert_frames_match_reference(frames):
        # Each decoded frame must be near-identical to the reference image.
        for frame in frames:
            similarity = cosine_similarity(
                normalize_image(np.array(frame)),
                normalize_image(np.array(reference)),
            )
            nan_fraction = np.sum(np.isnan(similarity)) / similarity.size
            assert nan_fraction < 0.001
            assert np.nanmean(similarity) > 0.99

    assert_frames_match_reference(video_to_ndarrays(clip_path))
    assert_frames_match_reference(video_to_pil_images_list(clip_path))

    loaded_frames, _ = VideoMediaIO(ImageMediaIO()).load_file(Path(clip_path))
    assert_frames_match_reference(loaded_frames)
|
||||
|
||||
|
||||
# Number of frames in each fake loader output.
NUM_FRAMES = 10
# Two independently sampled random arrays returned by the stub loaders.
FAKE_OUTPUT_1, FAKE_OUTPUT_2 = (
    np.random.rand(NUM_FRAMES, 1280, 720, 3) for _ in range(2)
)
|
||||
|
||||
|
||||
@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_1")
class TestVideoBackendOverride1(VideoLoader):
    """Stub loader returning FAKE_OUTPUT_1, used to check backend selection."""

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        **kwargs,
    ) -> tuple[npt.NDArray, dict]:
        """Return FAKE_OUTPUT_1 with metadata naming this backend."""
        metadata = {"video_backend": "test_video_backend_override_1"}
        return FAKE_OUTPUT_1, metadata
|
||||
|
||||
|
||||
@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_2")
class TestVideoBackendOverride2(VideoLoader):
    """Stub loader returning FAKE_OUTPUT_2, used to check backend selection."""

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        **kwargs,
    ) -> tuple[npt.NDArray, dict]:
        """Return FAKE_OUTPUT_2 with metadata naming this backend."""
        metadata = {"video_backend": "test_video_backend_override_2"}
        return FAKE_OUTPUT_2, metadata
|
||||
|
||||
|
||||
def test_video_media_io_backend_kwarg_override(monkeypatch: pytest.MonkeyPatch):
    """
    Test that the video_backend kwarg takes precedence over the
    VLLM_VIDEO_LOADER_BACKEND environment variable.

    This lets a caller pick a different backend (e.g. via --media-io-kwargs)
    without touching the global env var, which helps when a plugin sets a
    default backend but one request needs another.
    """
    env_backend = "test_video_backend_override_1"
    kwarg_backend = "test_video_backend_override_2"

    with monkeypatch.context() as ctx:
        # The environment selects the first backend.
        ctx.setenv("VLLM_VIDEO_LOADER_BACKEND", env_backend)
        image_io = ImageMediaIO()

        # No video_backend kwarg: the env var's backend is used.
        from_env = VideoMediaIO(image_io, num_frames=10)
        frames, metadata = from_env.load_bytes(b"test")
        np.testing.assert_array_equal(frames, FAKE_OUTPUT_1)
        assert metadata["video_backend"] == env_backend

        # Explicit video_backend kwarg: it wins over the env var.
        from_kwarg = VideoMediaIO(
            image_io, num_frames=10, video_backend=kwarg_backend
        )
        frames, metadata = from_kwarg.load_bytes(b"test")
        np.testing.assert_array_equal(frames, FAKE_OUTPUT_2)
        assert metadata["video_backend"] == kwarg_backend
|
||||
|
||||
|
||||
def test_video_media_io_backend_kwarg_not_passed_to_loader(
    monkeypatch: pytest.MonkeyPatch,
):
    """
    Test that VideoMediaIO consumes the video_backend kwarg itself and does
    NOT forward it to the selected loader's load_bytes method, while other
    kwargs are still forwarded.
    """
    backend_name = "test_reject_video_backend_kwarg"

    @VIDEO_LOADER_REGISTRY.register(backend_name)
    class RejectVideoBackendKwargLoader(VideoLoader):
        """Stub loader that raises if video_backend reaches it."""

        @classmethod
        def load_bytes(
            cls,
            data: bytes,
            num_frames: int = -1,
            **kwargs,
        ) -> tuple[npt.NDArray, dict]:
            # Report which kwargs arrived so the test can inspect them.
            if "video_backend" not in kwargs:
                return FAKE_OUTPUT_1, {"received_kwargs": list(kwargs)}
            raise AssertionError(
                "video_backend should be consumed by VideoMediaIO, "
                "not passed to loader"
            )

    with monkeypatch.context() as ctx:
        ctx.setenv("VLLM_VIDEO_LOADER_BACKEND", backend_name)
        image_io = ImageMediaIO()

        # video_backend is supplied here and must not reach the loader.
        video_io = VideoMediaIO(
            image_io,
            num_frames=10,
            video_backend=backend_name,
            other_kwarg="should_pass_through",
        )

        # Must complete without the loader's AssertionError.
        frames, metadata = video_io.load_bytes(b"test")
        np.testing.assert_array_equal(frames, FAKE_OUTPUT_1)
        # Unrelated kwargs are still forwarded.
        assert "other_kwarg" in metadata["received_kwargs"]
|
||||
|
||||
|
||||
def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch):
    """
    Test that VideoMediaIO uses the VLLM_VIDEO_LOADER_BACKEND env var when
    the video_backend kwarg is None or omitted.
    """
    env_backend = "test_video_backend_override_2"

    with monkeypatch.context() as ctx:
        ctx.setenv("VLLM_VIDEO_LOADER_BACKEND", env_backend)
        image_io = ImageMediaIO()

        # First an explicit None, then no video_backend kwarg at all.
        for backend_kwargs in ({"video_backend": None}, {}):
            video_io = VideoMediaIO(image_io, num_frames=10, **backend_kwargs)
            frames, metadata = video_io.load_bytes(b"test")
            np.testing.assert_array_equal(frames, FAKE_OUTPUT_2)
            assert metadata["video_backend"] == env_backend
|
||||
@@ -1,8 +1,6 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
# test_audio.py
|
||||
import base64
|
||||
from pathlib import Path
|
||||
from unittest.mock import patch
|
||||
|
||||
import numpy as np
|
||||
@@ -12,7 +10,6 @@ import torch
|
||||
from vllm.multimodal.audio import (
|
||||
MONO_AUDIO_SPEC,
|
||||
PASSTHROUGH_AUDIO_SPEC,
|
||||
AudioMediaIO,
|
||||
AudioResampler,
|
||||
AudioSpec,
|
||||
ChannelReduction,
|
||||
@@ -92,59 +89,6 @@ def test_audio_resampler_no_target_sr(dummy_audio):
|
||||
resampler.resample(dummy_audio, orig_sr=44100)
|
||||
|
||||
|
||||
@pytest.fixture
def dummy_audio_bytes():
    """Provide an opaque byte payload used as stand-in audio data."""
    payload = b"FAKEAUDIOBYTES"
    return payload
|
||||
|
||||
|
||||
def test_audio_media_io_load_bytes(dummy_audio_bytes):
    """Check that load_bytes calls librosa.load once and returns its result."""
    reader = AudioMediaIO()
    stub_result = (np.array([0.1, 0.2]), 16000)
    with patch(
        "vllm.multimodal.audio.librosa.load", return_value=stub_result
    ) as librosa_load:
        result = reader.load_bytes(dummy_audio_bytes)
        librosa_load.assert_called_once()
    assert isinstance(result[0], np.ndarray)
    assert result[1] == 16000
|
||||
|
||||
|
||||
def test_audio_media_io_load_base64(dummy_audio_bytes):
    """Check that load_base64 calls load_bytes once and returns its result."""
    reader = AudioMediaIO()
    payload_b64 = base64.b64encode(dummy_audio_bytes).decode("utf-8")
    stub_result = (np.array([0.1, 0.2]), 16000)
    with patch.object(
        AudioMediaIO, "load_bytes", return_value=stub_result
    ) as load_bytes_mock:
        result = reader.load_base64("audio/wav", payload_b64)
        load_bytes_mock.assert_called_once()
    assert isinstance(result[0], np.ndarray)
    assert result[1] == 16000
|
||||
|
||||
|
||||
def test_audio_media_io_load_file():
    """Check that load_file passes the path to librosa.load with sr=None."""
    reader = AudioMediaIO()
    wav_path = Path("/fake/path.wav")
    stub_result = (np.array([0.1, 0.2]), 16000)
    with patch(
        "vllm.multimodal.audio.librosa.load", return_value=stub_result
    ) as librosa_load:
        result = reader.load_file(wav_path)
        librosa_load.assert_called_once_with(wav_path, sr=None)
    assert isinstance(result[0], np.ndarray)
    assert result[1] == 16000
|
||||
|
||||
|
||||
def test_audio_media_io_encode_base64(dummy_audio):
    """Check that encode_base64 returns base64 of what soundfile.write emits."""

    def write_to_buffer(buffer, *_args, **_kwargs):
        # Stand-in for soundfile.write: put a fixed payload into the
        # first positional argument.
        buffer.write(b"dummy_wav_data")

    encoder = AudioMediaIO()
    waveform_and_rate = (dummy_audio, 16000)
    with patch(
        "vllm.multimodal.audio.soundfile.write", side_effect=write_to_buffer
    ) as write_mock:
        encoded = encoder.encode_base64(waveform_and_rate)
        assert base64.b64decode(encoded) == b"dummy_wav_data"
        write_mock.assert_called_once()
|
||||
|
||||
|
||||
# ============================================================
|
||||
# Tests for normalize_audio function
|
||||
# ============================================================
|
||||
|
||||
@@ -1,14 +1,12 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pickle
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from PIL import Image, ImageChops
|
||||
|
||||
from vllm.multimodal.base import MediaWithBytes
|
||||
from vllm.multimodal.image import ImageMediaIO, convert_image_mode
|
||||
from vllm.multimodal.image import convert_image_mode
|
||||
|
||||
pytestmark = pytest.mark.cpu_test
|
||||
|
||||
@@ -40,153 +38,3 @@ def test_rgba_to_rgb():
|
||||
assert converted_image_numpy[i][j][0] == 255
|
||||
assert converted_image_numpy[i][j][1] == 255
|
||||
assert converted_image_numpy[i][j][2] == 255
|
||||
|
||||
|
||||
def test_rgba_to_rgb_custom_background(tmp_path):
    """Test RGBA images loaded by ImageMediaIO with various background colors."""

    def rgb_at(pixels, row, col):
        # First three channel values of one pixel as plain ints.
        return tuple(int(channel) for channel in pixels[row][col][:3])

    # 10x10 opaque red canvas whose top-left 5x5 corner is fully transparent.
    canvas = Image.new("RGBA", (10, 10), (255, 0, 0, 255))
    for x in range(5):
        for y in range(5):
            canvas.putpixel((x, y), (0, 0, 0, 0))

    png_path = tmp_path / "test_rgba.png"
    canvas.save(png_path)

    red = (255, 0, 0)

    # Case 1: no background given -> transparent area is white.
    default_pixels = np.array(ImageMediaIO().load_file(png_path))
    assert rgb_at(default_pixels, 0, 0) == (255, 255, 255)
    assert rgb_at(default_pixels, 5, 5) == red

    # Case 2: black background passed as a tuple.
    black_io = ImageMediaIO(rgba_background_color=(0, 0, 0))
    black_pixels = np.array(black_io.load_file(png_path))
    assert rgb_at(black_pixels, 0, 0) == (0, 0, 0)
    assert rgb_at(black_pixels, 5, 5) == red

    # Case 3: blue background passed as a list.
    blue_io = ImageMediaIO(rgba_background_color=[0, 0, 255])
    blue_pixels = np.array(blue_io.load_file(png_path))
    assert rgb_at(blue_pixels, 0, 0) == (0, 0, 255)

    # Case 4: green background, going through load_bytes instead of load_file.
    green_io = ImageMediaIO(rgba_background_color=(0, 255, 0))
    green_pixels = np.array(green_io.load_bytes(png_path.read_bytes()))
    assert rgb_at(green_pixels, 0, 0) == (0, 255, 0)
|
||||
|
||||
|
||||
def test_rgba_background_color_validation():
    """Test that ImageMediaIO rejects malformed rgba_background_color values."""
    expected_message = "rgba_background_color must be a list or tuple"

    rejected_values = [
        # Wrong type
        "255,255,255",
        255,
        # Wrong number of elements
        (255, 255),
        (255, 255, 255, 255),
        # Non-integer elements
        (255.0, 255.0, 255.0),
        (255, "255", 255),
        # Out-of-range elements
        (256, 255, 255),
        (255, -1, 255),
    ]
    for bad_value in rejected_values:
        with pytest.raises(ValueError, match=expected_message):
            ImageMediaIO(rgba_background_color=bad_value)

    # Well-formed values must construct without raising.
    for good_value in ((0, 0, 0), [255, 255, 255], (128, 128, 128)):
        ImageMediaIO(rgba_background_color=good_value)
|
||||
|
||||
|
||||
def test_media_with_bytes_pickle_roundtrip():
    """Regression test: MediaWithBytes must survive a pickle round trip.

    Pickling used to end in RecursionError.
    See: https://github.com/vllm-project/vllm/issues/30818
    """
    raw_bytes = b"test_bytes_data"
    image = Image.open(ASSETS_DIR / "image1.png").convert("RGB")
    wrapped = MediaWithBytes(media=image, original_bytes=raw_bytes)

    delegated_attrs = ("width", "height", "mode")

    # Attribute access on the wrapper is forwarded to the wrapped image.
    for attr in delegated_attrs:
        assert getattr(wrapped, attr) == getattr(image, attr)

    # The round trip itself is the regression being guarded.
    restored = pickle.loads(pickle.dumps(wrapped))

    # Payload and wrapped media come back intact.
    assert restored.original_bytes == raw_bytes
    for attr in ("width", "height"):
        assert getattr(restored.media, attr) == getattr(image, attr)

    # Delegation keeps working on the restored wrapper.
    for attr in delegated_attrs:
        assert getattr(restored, attr) == getattr(image, attr)
|
||||
|
||||
@@ -1,24 +1,19 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
import tempfile
|
||||
from pathlib import Path
|
||||
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.assets.base import get_vllm_public_assets
|
||||
from vllm.assets.video import video_to_ndarrays, video_to_pil_images_list
|
||||
from vllm.multimodal.image import ImageMediaIO
|
||||
from vllm.multimodal.video import VIDEO_LOADER_REGISTRY, VideoLoader, VideoMediaIO
|
||||
|
||||
from .utils import cosine_similarity, create_video_from_image, normalize_image
|
||||
from vllm.multimodal.video import VIDEO_LOADER_REGISTRY, VideoLoader
|
||||
|
||||
# Apply the `cpu_test` marker to every test collected from this module.
pytestmark = pytest.mark.cpu_test

# Fixture directory that lives next to this test module.
ASSETS_DIR = Path(__file__).parent / "assets"
# Fail at import time if the fixture directory is missing.
assert ASSETS_DIR.exists()

# Leading dimension of the fake loader outputs below.
NUM_FRAMES = 10
# Random arrays of shape (NUM_FRAMES, 1280, 720, 3) returned by the fake
# video loaders so tests can tell which loader produced a result.
FAKE_OUTPUT_1 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
FAKE_OUTPUT_2 = np.random.rand(NUM_FRAMES, 1280, 720, 3)
|
||||
@@ -53,96 +48,6 @@ def test_video_loader_type_doesnt_exist():
|
||||
VIDEO_LOADER_REGISTRY.load("non_existing_video_loader")
|
||||
|
||||
|
||||
@VIDEO_LOADER_REGISTRY.register("assert_10_frames_1_fps")
class Assert10Frames1FPSVideoLoader(VideoLoader):
    """Fake loader that only accepts ``num_frames == 10`` and ``fps == 1.0``.

    Any other value trips an assertion whose message names the offending
    argument, so tests can check what was forwarded to the loader.
    """

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        fps: float = -1.0,
        **kwargs,
    ) -> npt.NDArray:
        """Validate the sampling arguments and return ``FAKE_OUTPUT_2``."""
        # ``num_frames`` is checked before ``fps``; tests rely on that order.
        assert num_frames == 10, "bad num_frames"
        assert fps == 1.0, "bad fps"
        return FAKE_OUTPUT_2
|
||||
|
||||
|
||||
def test_video_media_io_kwargs(monkeypatch: pytest.MonkeyPatch):
    """Check which ``VideoMediaIO`` kwargs reach the configured video loader.

    The ``assert_10_frames_1_fps`` backend asserts on the ``num_frames`` and
    ``fps`` values it receives, so a clean ``load_bytes`` call means the
    kwargs were forwarded as given.
    """
    accepted = (
        {"num_frames": 10, "fps": 1.0},
        # Unknown extra kwargs must be tolerated by the loader.
        {"num_frames": 10, "fps": 1.0, "not_used": "not_used"},
    )
    rejected = (
        ({}, "bad num_frames"),
        ({"num_frames": 9, "fps": 1.0}, "bad num_frames"),
        ({"num_frames": 10, "fps": 2.0}, "bad fps"),
    )

    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "assert_10_frames_1_fps")
        imageio = ImageMediaIO()

        # Verify that different args pass/fail assertions as expected.
        for io_kwargs in accepted:
            videoio = VideoMediaIO(imageio, **io_kwargs)
            _ = videoio.load_bytes(b"test")

        for io_kwargs, message in rejected:
            with pytest.raises(AssertionError, match=message):
                videoio = VideoMediaIO(imageio, **io_kwargs)
                _ = videoio.load_bytes(b"test")
|
||||
|
||||
|
||||
@pytest.mark.parametrize("is_color", [True, False])
@pytest.mark.parametrize("fourcc, ext", [("mp4v", "mp4"), ("XVID", "avi")])
def test_opencv_video_io_colorspace(is_color: bool, fourcc: str, ext: str):
    """
    Test all functions that use OpenCV for video I/O return RGB format.

    A two-frame clip is synthesized from a still image (color or grayscale)
    and read back through each reader; every decoded frame is compared with
    the source image.
    """

    def assert_frames_match(decoded, reference) -> None:
        # Each frame must be almost NaN-free and nearly identical to the
        # reference image under cosine similarity.
        for frame in decoded:
            sim = cosine_similarity(
                normalize_image(np.array(frame)),
                normalize_image(np.array(reference)),
            )
            assert np.sum(np.isnan(sim)) / sim.size < 0.001
            assert np.nanmean(sim) > 0.99

    image_path = get_vllm_public_assets(
        filename="stop_sign.jpg", s3_prefix="vision_model_images"
    )
    image = Image.open(image_path)

    with tempfile.TemporaryDirectory() as tmpdir:
        if not is_color:
            image_path = f"{tmpdir}/test_grayscale_image.png"
            image = image.convert("L")
            image.save(image_path)
            # Convert to gray RGB for comparison
            image = image.convert("RGB")

        video_path = f"{tmpdir}/test_RGB_video.{ext}"
        create_video_from_image(
            image_path,
            video_path,
            num_frames=2,
            is_color=is_color,
            fourcc=fourcc,
        )

        # Reader 1: raw ndarray frames.
        assert_frames_match(video_to_ndarrays(video_path), image)

        # Reader 2: PIL image frames.
        assert_frames_match(video_to_pil_images_list(video_path), image)

        # Reader 3: the media IO layer (metadata is ignored here).
        io_frames, _ = VideoMediaIO(ImageMediaIO()).load_file(Path(video_path))
        assert_frames_match(io_frames, image)
|
||||
|
||||
|
||||
def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
|
||||
"""
|
||||
Regression test for handling videos with broken frames.
|
||||
@@ -179,128 +84,6 @@ def test_video_backend_handles_broken_frames(monkeypatch: pytest.MonkeyPatch):
|
||||
)
|
||||
|
||||
|
||||
@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_1")
class TestVideoBackendOverride1(VideoLoader):
    """Fake loader that yields FAKE_OUTPUT_1 so tests can see it was picked."""

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        **kwargs,
    ) -> tuple[npt.NDArray, dict]:
        """Ignore the input and return the fixed frames plus a backend tag."""
        metadata = {"video_backend": "test_video_backend_override_1"}
        return FAKE_OUTPUT_1, metadata
|
||||
|
||||
|
||||
@VIDEO_LOADER_REGISTRY.register("test_video_backend_override_2")
class TestVideoBackendOverride2(VideoLoader):
    """Fake loader that yields FAKE_OUTPUT_2 so tests can see it was picked."""

    @classmethod
    def load_bytes(
        cls,
        data: bytes,
        num_frames: int = -1,
        **kwargs,
    ) -> tuple[npt.NDArray, dict]:
        """Ignore the input and return the fixed frames plus a backend tag."""
        metadata = {"video_backend": "test_video_backend_override_2"}
        return FAKE_OUTPUT_2, metadata
|
||||
|
||||
|
||||
def test_video_media_io_backend_kwarg_override(monkeypatch: pytest.MonkeyPatch):
    """
    The ``video_backend`` kwarg must take precedence over the
    VLLM_VIDEO_LOADER_BACKEND environment variable.

    This is what lets a user pick a different backend through
    --media-io-kwargs without touching the global env var, e.g. when a
    plugin sets a default backend but one request needs another.
    """
    env_backend = "test_video_backend_override_1"
    kwarg_backend = "test_video_backend_override_2"

    with monkeypatch.context() as m:
        # The env var points at the first backend.
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", env_backend)

        imageio = ImageMediaIO()

        # No kwarg given: the env var backend is used.
        videoio_default = VideoMediaIO(imageio, num_frames=10)
        frames_default, metadata_default = videoio_default.load_bytes(b"test")
        np.testing.assert_array_equal(frames_default, FAKE_OUTPUT_1)
        assert metadata_default["video_backend"] == env_backend

        # Kwarg given: it wins over the env var.
        videoio_override = VideoMediaIO(
            imageio,
            num_frames=10,
            video_backend=kwarg_backend,
        )
        frames_override, metadata_override = videoio_override.load_bytes(b"test")
        np.testing.assert_array_equal(frames_override, FAKE_OUTPUT_2)
        assert metadata_override["video_backend"] == kwarg_backend
|
||||
|
||||
|
||||
def test_video_media_io_backend_kwarg_not_passed_to_loader(
    monkeypatch: pytest.MonkeyPatch,
):
    """
    ``video_backend`` must be consumed by VideoMediaIO itself and never be
    forwarded to the underlying loader's ``load_bytes``.

    Other custom kwargs must still reach the loader.
    """
    backend_name = "test_reject_video_backend_kwarg"

    @VIDEO_LOADER_REGISTRY.register("test_reject_video_backend_kwarg")
    class RejectVideoBackendKwargLoader(VideoLoader):
        """Fake loader that raises if it ever sees ``video_backend``."""

        @classmethod
        def load_bytes(
            cls,
            data: bytes,
            num_frames: int = -1,
            **kwargs,
        ) -> tuple[npt.NDArray, dict]:
            # video_backend must have been stripped before we get here.
            if "video_backend" in kwargs:
                raise AssertionError(
                    "video_backend should be consumed by VideoMediaIO, "
                    "not passed to loader"
                )
            # Report which kwargs did arrive so the test can inspect them.
            return FAKE_OUTPUT_1, {"received_kwargs": list(kwargs)}

    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", backend_name)

        imageio = ImageMediaIO()

        # Supplying video_backend explicitly must not leak it to the loader.
        videoio = VideoMediaIO(
            imageio,
            num_frames=10,
            video_backend=backend_name,
            other_kwarg="should_pass_through",
        )

        # Must complete without the loader's AssertionError.
        frames, metadata = videoio.load_bytes(b"test")
        np.testing.assert_array_equal(frames, FAKE_OUTPUT_1)
        # Unrelated kwargs are still forwarded.
        assert "other_kwarg" in metadata["received_kwargs"]
|
||||
|
||||
|
||||
def test_video_media_io_backend_env_var_fallback(monkeypatch: pytest.MonkeyPatch):
    """
    With ``video_backend`` set to None or omitted entirely, VideoMediaIO
    must fall back to the VLLM_VIDEO_LOADER_BACKEND env var.
    """
    env_backend = "test_video_backend_override_2"
    # First an explicit None, then the kwarg left out altogether.
    fallback_cases = ({"video_backend": None}, {})

    with monkeypatch.context() as m:
        m.setenv("VLLM_VIDEO_LOADER_BACKEND", env_backend)

        imageio = ImageMediaIO()

        for extra_kwargs in fallback_cases:
            videoio = VideoMediaIO(imageio, num_frames=10, **extra_kwargs)
            frames, metadata = videoio.load_bytes(b"test")
            np.testing.assert_array_equal(frames, FAKE_OUTPUT_2)
            assert metadata["video_backend"] == env_backend
|
||||
|
||||
|
||||
# ============================================================================
|
||||
# Frame Recovery Tests
|
||||
# ============================================================================
|
||||
|
||||
@@ -27,7 +27,7 @@ ALLOWED_FILES = {
|
||||
"vllm/distributed/device_communicators/shm_broadcast.py",
|
||||
"vllm/distributed/device_communicators/shm_object_storage.py",
|
||||
"vllm/utils/hashing.py",
|
||||
"tests/multimodal/test_image.py",
|
||||
"tests/multimodal/media/test_base.py",
|
||||
"tests/tokenizers_/test_hf.py",
|
||||
"tests/utils_/test_hashing.py",
|
||||
"benchmarks/kernels/graph_machete_bench.py",
|
||||
|
||||
@@ -1,32 +1,20 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import base64
|
||||
from dataclasses import dataclass
|
||||
from enum import Enum
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import Literal
|
||||
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
import pybase64
|
||||
import torch
|
||||
|
||||
from vllm.utils.import_utils import PlaceholderModule
|
||||
from vllm.utils.serial_utils import tensor2base64
|
||||
|
||||
from .base import MediaIO
|
||||
|
||||
try:
|
||||
import librosa
|
||||
except ImportError:
|
||||
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
import soundfile
|
||||
except ImportError:
|
||||
soundfile = PlaceholderModule("soundfile") # type: ignore[assignment]
|
||||
|
||||
|
||||
try:
|
||||
import scipy.signal as scipy_signal
|
||||
@@ -220,68 +208,3 @@ class AudioResampler:
|
||||
f"Invalid resampling method: {self.method}. "
|
||||
"Supported methods are 'librosa' and 'scipy'."
|
||||
)
|
||||
|
||||
|
||||
class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
    """Load and encode audio as ``(waveform, sample_rate)`` pairs."""

    def __init__(self, **kwargs) -> None:
        """Store the custom per-modality arguments.

        Args:
            **kwargs: Custom arguments from --media-io-kwargs for this
                modality. They are kept so that underlying media loaders
                (e.g. custom implementations) can use them.
        """
        super().__init__()
        self.kwargs = kwargs

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
        """Decode in-memory audio bytes with librosa."""
        stream = BytesIO(data)
        # sr=None asks librosa to keep the native sampling rate.
        return librosa.load(stream, sr=None)

    def load_base64(
        self,
        media_type: str,
        data: str,
    ) -> tuple[npt.NDArray, float]:
        """Decode a base64 payload; ``media_type`` is not consulted."""
        raw = base64.b64decode(data)
        return self.load_bytes(raw)

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
        """Decode an audio file from disk with librosa."""
        return librosa.load(filepath, sr=None)

    def encode_base64(
        self,
        media: tuple[npt.NDArray, int],
        *,
        audio_format: str = "WAV",
    ) -> str:
        """Write ``media`` with soundfile and return it base64-encoded.

        Args:
            media: ``(waveform, sample_rate)`` pair to serialize.
            audio_format: Container format handed to ``soundfile.write``.
        """
        waveform, sample_rate = media

        with BytesIO() as buffer:
            soundfile.write(buffer, waveform, sample_rate, format=audio_format)
            encoded = base64.b64encode(buffer.getvalue())

        return encoded.decode("utf-8")
|
||||
|
||||
|
||||
class AudioEmbeddingMediaIO(MediaIO[torch.Tensor]):
    """Load and encode precomputed audio embeddings stored as tensors."""

    def __init__(self) -> None:
        super().__init__()

    def load_bytes(self, data: bytes) -> torch.Tensor:
        """Deserialize a tensor from bytes and return it in dense layout."""
        # Sparse tensor integrity checks stay enabled for the whole load to
        # prevent out-of-bounds writes from maliciously crafted tensors.
        with torch.sparse.check_sparse_tensor_invariants():
            loaded = torch.load(BytesIO(data), weights_only=True)
            return loaded.to_dense()

    def load_base64(self, media_type: str, data: str) -> torch.Tensor:
        """Strictly decode base64, then load; ``media_type`` is unused."""
        raw = pybase64.b64decode(data, validate=True)
        return self.load_bytes(raw)

    def load_file(self, filepath: Path) -> torch.Tensor:
        """Deserialize a tensor from a file and return it in dense layout."""
        # Same protection against maliciously crafted sparse tensors as in
        # the in-memory path.
        with torch.sparse.check_sparse_tensor_invariants():
            loaded = torch.load(filepath, weights_only=True)
            return loaded.to_dense()

    def encode_base64(self, media: torch.Tensor) -> str:
        """Serialize ``media`` to a base64 string via ``tensor2base64``."""
        return tensor2base64(media)
|
||||
|
||||
@@ -12,7 +12,7 @@ from PIL import Image
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from .base import MediaWithBytes
|
||||
from .media import MediaWithBytes
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@@ -1,19 +1,8 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import pybase64
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from .base import MediaIO, MediaWithBytes
|
||||
|
||||
# Name the logger after the module rather than the file path so that it
# takes part in the dotted-name logger hierarchy of the package.
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def rescale_image_size(
|
||||
image: Image.Image, size_factor: float, transpose: int = -1
|
||||
@@ -45,111 +34,3 @@ def convert_image_mode(image: Image.Image, to_mode: str):
|
||||
return rgba_to_rgb(image)
|
||||
else:
|
||||
return image.convert(to_mode)
|
||||
|
||||
|
||||
class ImageMediaIO(MediaIO[Image.Image]):
    """Load images from bytes, base64 or files and encode them to base64.

    Loaded images are converted to ``image_mode`` and returned wrapped in
    ``MediaWithBytes`` together with the raw bytes they were decoded from.
    """

    def __init__(self, image_mode: str = "RGB", **kwargs) -> None:
        """Configure the target mode and the RGBA background color.

        Args:
            image_mode: PIL mode that loaded/encoded images are converted to.
            **kwargs: Custom arguments from --media-io-kwargs for this
                modality. ``rgba_background_color`` (list or tuple of three
                ints in [0, 255], default white) is read from here.

        Raises:
            ValueError: If ``rgba_background_color`` is malformed.
        """
        super().__init__()

        self.image_mode = image_mode
        # `kwargs` contains custom arguments from
        # --media-io-kwargs for this modality.
        # They can be passed to the underlying
        # media loaders (e.g. custom implementations)
        # for flexible control.
        self.kwargs = kwargs

        # Extract RGBA background color from kwargs if provided
        # Default to white background for backward compatibility
        rgba_bg = kwargs.get("rgba_background_color", (255, 255, 255))
        # Convert list to tuple for consistency
        if isinstance(rgba_bg, list):
            rgba_bg = tuple(rgba_bg)

        # Validate rgba_background_color format
        if not (
            isinstance(rgba_bg, tuple)
            and len(rgba_bg) == 3
            and all(isinstance(c, int) and 0 <= c <= 255 for c in rgba_bg)
        ):
            raise ValueError(
                "rgba_background_color must be a list or tuple of 3 integers "
                "in the range [0, 255]."
            )
        self.rgba_background_color = rgba_bg

    def _convert_image_mode(
        self, image: Image.Image | MediaWithBytes[Image.Image]
    ) -> Image.Image:
        """Convert image mode with custom background color."""
        # Unwrap so the mode checks below see the PIL image itself.
        if isinstance(image, MediaWithBytes):
            image = image.media
        if image.mode == self.image_mode:
            return image
        elif image.mode == "RGBA" and self.image_mode == "RGB":
            # RGBA -> RGB uses the configured background color.
            return rgba_to_rgb(image, self.rgba_background_color)
        else:
            return convert_image_mode(image, self.image_mode)

    def load_bytes(self, data: bytes) -> MediaWithBytes[Image.Image]:
        """Decode ``data`` and pair the converted image with those bytes."""
        image = Image.open(BytesIO(data))
        return MediaWithBytes(self._convert_image_mode(image), data)

    def load_base64(self, media_type: str, data: str) -> MediaWithBytes[Image.Image]:
        """Strictly decode base64, then load; ``media_type`` is unused."""
        return self.load_bytes(pybase64.b64decode(data, validate=True))

    def load_file(self, filepath: Path) -> MediaWithBytes[Image.Image]:
        """Read the whole file and decode it like any other byte payload."""
        with open(filepath, "rb") as f:
            data = f.read()
        # Same decode + wrap steps as the in-memory path.
        return self.load_bytes(data)

    def encode_base64(
        self,
        media: Image.Image,
        *,
        image_format: str | None = None,
    ) -> str:
        """Convert ``media`` to the target mode and return it base64-encoded.

        Args:
            media: Image to encode.
            image_format: Format passed to ``Image.save``. When omitted, a
                one-time warning is logged and "JPEG" is used.
        """
        if image_format is None:
            # The hint names the real keyword (`image_format`) so that
            # following it actually silences the warning.
            logger.warning_once(
                "The default format of `ImageMediaIO.encode_base64` will be changed "
                'from "JPEG" to "PNG" in v0.15 to avoid lossy compression. '
                "To continue using the old default, "
                'pass `image_format="JPEG"` explicitly to silence this warning.'
            )
            image_format = "JPEG"

        image = media

        with BytesIO() as buffer:
            image = self._convert_image_mode(image)
            image.save(buffer, image_format)
            data = buffer.getvalue()

        return pybase64.b64encode(data).decode("utf-8")
|
||||
|
||||
|
||||
class ImageEmbeddingMediaIO(MediaIO[torch.Tensor]):
    """Load and encode precomputed image embeddings stored as tensors."""

    def __init__(self) -> None:
        super().__init__()

    def load_bytes(self, data: bytes) -> torch.Tensor:
        """Deserialize a tensor from bytes and return it in dense layout."""
        buffer = BytesIO(data)
        # Enable sparse tensor integrity checks to prevent out-of-bounds
        # writes from maliciously crafted tensors
        with torch.sparse.check_sparse_tensor_invariants():
            tensor = torch.load(buffer, weights_only=True)
            return tensor.to_dense()

    def load_base64(self, media_type: str, data: str) -> torch.Tensor:
        """Strictly decode base64, then load; ``media_type`` is unused."""
        return self.load_bytes(pybase64.b64decode(data, validate=True))

    def load_file(self, filepath: Path) -> torch.Tensor:
        """Deserialize a tensor from a file and return it in dense layout."""
        # Enable sparse tensor integrity checks to prevent out-of-bounds
        # writes from maliciously crafted tensors
        with torch.sparse.check_sparse_tensor_invariants():
            tensor = torch.load(filepath, weights_only=True)
            return tensor.to_dense()

    def encode_base64(self, media: torch.Tensor) -> str:
        """Serialize ``media`` with ``torch.save`` and base64-encode it.

        The loaders above read payloads with ``torch.load``, so the encoder
        has to emit the same serialization format. Base64 of the raw numpy
        buffer would drop dtype/shape and could not be loaded back.
        """
        with BytesIO() as buffer:
            torch.save(media, buffer)
            payload = buffer.getvalue()

        return pybase64.b64encode(payload).decode("utf-8")
|
||||
|
||||
@@ -32,7 +32,7 @@ if TYPE_CHECKING:
|
||||
from PIL.Image import Image
|
||||
from transformers.feature_extraction_utils import BatchFeature
|
||||
|
||||
from .base import MediaWithBytes
|
||||
from .media import MediaWithBytes
|
||||
else:
|
||||
torch = LazyLoader("torch", globals(), "torch")
|
||||
|
||||
|
||||
16
vllm/multimodal/media/__init__.py
Normal file
16
vllm/multimodal/media/__init__.py
Normal file
@@ -0,0 +1,16 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
from .audio import AudioEmbeddingMediaIO, AudioMediaIO
|
||||
from .base import MediaIO, MediaWithBytes
|
||||
from .image import ImageEmbeddingMediaIO, ImageMediaIO
|
||||
from .video import VideoMediaIO
|
||||
|
||||
__all__ = [
|
||||
"MediaIO",
|
||||
"MediaWithBytes",
|
||||
"AudioEmbeddingMediaIO",
|
||||
"AudioMediaIO",
|
||||
"ImageEmbeddingMediaIO",
|
||||
"ImageMediaIO",
|
||||
"VideoMediaIO",
|
||||
]
|
||||
89
vllm/multimodal/media/audio.py
Normal file
89
vllm/multimodal/media/audio.py
Normal file
@@ -0,0 +1,89 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import base64
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import numpy.typing as npt
|
||||
import pybase64
|
||||
import torch
|
||||
|
||||
from vllm.utils.import_utils import PlaceholderModule
|
||||
from vllm.utils.serial_utils import tensor2base64
|
||||
|
||||
from .base import MediaIO
|
||||
|
||||
try:
|
||||
import librosa
|
||||
except ImportError:
|
||||
librosa = PlaceholderModule("librosa") # type: ignore[assignment]
|
||||
|
||||
try:
|
||||
import soundfile
|
||||
except ImportError:
|
||||
soundfile = PlaceholderModule("soundfile") # type: ignore[assignment]
|
||||
|
||||
|
||||
class AudioMediaIO(MediaIO[tuple[npt.NDArray, float]]):
    """Load and encode audio as ``(waveform, sample_rate)`` pairs."""

    def __init__(self, **kwargs) -> None:
        """Store the custom per-modality arguments.

        Args:
            **kwargs: Custom arguments from --media-io-kwargs for this
                modality. They are kept so that underlying media loaders
                (e.g. custom implementations) can use them.
        """
        super().__init__()
        self.kwargs = kwargs

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, float]:
        """Decode in-memory audio bytes with librosa."""
        stream = BytesIO(data)
        # sr=None asks librosa to keep the native sampling rate.
        return librosa.load(stream, sr=None)

    def load_base64(
        self,
        media_type: str,
        data: str,
    ) -> tuple[npt.NDArray, float]:
        """Decode a base64 payload; ``media_type`` is not consulted."""
        raw = base64.b64decode(data)
        return self.load_bytes(raw)

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, float]:
        """Decode an audio file from disk with librosa."""
        return librosa.load(filepath, sr=None)

    def encode_base64(
        self,
        media: tuple[npt.NDArray, int],
        *,
        audio_format: str = "WAV",
    ) -> str:
        """Write ``media`` with soundfile and return it base64-encoded.

        Args:
            media: ``(waveform, sample_rate)`` pair to serialize.
            audio_format: Container format handed to ``soundfile.write``.
        """
        waveform, sample_rate = media

        with BytesIO() as buffer:
            soundfile.write(buffer, waveform, sample_rate, format=audio_format)
            encoded = base64.b64encode(buffer.getvalue())

        return encoded.decode("utf-8")
|
||||
|
||||
|
||||
class AudioEmbeddingMediaIO(MediaIO[torch.Tensor]):
    """Load and encode precomputed audio embeddings stored as tensors."""

    def __init__(self) -> None:
        super().__init__()

    def load_bytes(self, data: bytes) -> torch.Tensor:
        """Deserialize a tensor from bytes and return it in dense layout."""
        # Sparse tensor integrity checks stay enabled for the whole load to
        # prevent out-of-bounds writes from maliciously crafted tensors.
        with torch.sparse.check_sparse_tensor_invariants():
            loaded = torch.load(BytesIO(data), weights_only=True)
            return loaded.to_dense()

    def load_base64(self, media_type: str, data: str) -> torch.Tensor:
        """Strictly decode base64, then load; ``media_type`` is unused."""
        raw = pybase64.b64decode(data, validate=True)
        return self.load_bytes(raw)

    def load_file(self, filepath: Path) -> torch.Tensor:
        """Deserialize a tensor from a file and return it in dense layout."""
        # Same protection against maliciously crafted sparse tensors as in
        # the in-memory path.
        with torch.sparse.check_sparse_tensor_invariants():
            loaded = torch.load(filepath, weights_only=True)
            return loaded.to_dense()

    def encode_base64(self, media: torch.Tensor) -> str:
        """Serialize ``media`` to a base64 string via ``tensor2base64``."""
        return tensor2base64(media)
|
||||
@@ -4,7 +4,7 @@
|
||||
from abc import ABC, abstractmethod
|
||||
from dataclasses import dataclass
|
||||
from pathlib import Path
|
||||
from typing import Generic, TypeVar
|
||||
from typing import Any, Generic, TypeVar
|
||||
|
||||
import numpy as np
|
||||
|
||||
@@ -32,13 +32,14 @@ class MediaWithBytes(Generic[_T]):
|
||||
"""Allow np.array(obj) to return np.array(obj.media)."""
|
||||
return np.array(self.media, *args, **kwargs)
|
||||
|
||||
def __getstate__(self):
    """Return a shallow copy of the instance dict as the pickle state."""
    return dict(self.__dict__)
|
||||
|
||||
def __setstate__(self, state: dict[str, Any]):
    """Restore attributes by merging ``state`` into the instance dict."""
    # Writing through the instance dict avoids attribute lookups on a
    # half-initialized object.
    vars(self).update(state)
|
||||
|
||||
def __getattr__(self, name: str):
    """Forward lookups of unknown attributes to the wrapped media object."""
    own_state = self.__dict__
    # pickle builds instances without running __init__, so ``media`` may be
    # absent while it probes for methods such as __setstate__. Bail out with
    # AttributeError instead of recursing back into __getattr__.
    if "media" in own_state:
        return getattr(own_state["media"], name)
    raise AttributeError(name)
|
||||
|
||||
|
||||
124
vllm/multimodal/media/image.py
Normal file
124
vllm/multimodal/media/image.py
Normal file
@@ -0,0 +1,124 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
|
||||
import pybase64
|
||||
import torch
|
||||
from PIL import Image
|
||||
|
||||
from vllm.logger import init_logger
|
||||
|
||||
from ..image import convert_image_mode, rgba_to_rgb
|
||||
from .base import MediaIO, MediaWithBytes
|
||||
|
||||
# Name the logger after the module rather than the file path so that it
# takes part in the dotted-name logger hierarchy of the package.
logger = init_logger(__name__)
|
||||
|
||||
|
||||
class ImageMediaIO(MediaIO[Image.Image]):
    """Load images from bytes, base64 or files and encode them to base64.

    Loaded images are converted to ``image_mode`` and returned wrapped in
    ``MediaWithBytes`` together with the raw bytes they were decoded from.
    """

    def __init__(self, image_mode: str = "RGB", **kwargs) -> None:
        """Configure the target mode and the RGBA background color.

        Args:
            image_mode: PIL mode that loaded/encoded images are converted to.
            **kwargs: Custom arguments from --media-io-kwargs for this
                modality. ``rgba_background_color`` (list or tuple of three
                ints in [0, 255], default white) is read from here.

        Raises:
            ValueError: If ``rgba_background_color`` is malformed.
        """
        super().__init__()

        self.image_mode = image_mode
        # `kwargs` contains custom arguments from
        # --media-io-kwargs for this modality.
        # They can be passed to the underlying
        # media loaders (e.g. custom implementations)
        # for flexible control.
        self.kwargs = kwargs

        # Extract RGBA background color from kwargs if provided
        # Default to white background for backward compatibility
        rgba_bg = kwargs.get("rgba_background_color", (255, 255, 255))
        # Convert list to tuple for consistency
        if isinstance(rgba_bg, list):
            rgba_bg = tuple(rgba_bg)

        # Validate rgba_background_color format
        if not (
            isinstance(rgba_bg, tuple)
            and len(rgba_bg) == 3
            and all(isinstance(c, int) and 0 <= c <= 255 for c in rgba_bg)
        ):
            raise ValueError(
                "rgba_background_color must be a list or tuple of 3 integers "
                "in the range [0, 255]."
            )
        self.rgba_background_color = rgba_bg

    def _convert_image_mode(
        self, image: Image.Image | MediaWithBytes[Image.Image]
    ) -> Image.Image:
        """Convert image mode with custom background color."""
        # Unwrap so the mode checks below see the PIL image itself.
        if isinstance(image, MediaWithBytes):
            image = image.media
        if image.mode == self.image_mode:
            return image
        elif image.mode == "RGBA" and self.image_mode == "RGB":
            # RGBA -> RGB uses the configured background color.
            return rgba_to_rgb(image, self.rgba_background_color)
        else:
            return convert_image_mode(image, self.image_mode)

    def load_bytes(self, data: bytes) -> MediaWithBytes[Image.Image]:
        """Decode ``data`` and pair the converted image with those bytes."""
        image = Image.open(BytesIO(data))
        return MediaWithBytes(self._convert_image_mode(image), data)

    def load_base64(self, media_type: str, data: str) -> MediaWithBytes[Image.Image]:
        """Strictly decode base64, then load; ``media_type`` is unused."""
        return self.load_bytes(pybase64.b64decode(data, validate=True))

    def load_file(self, filepath: Path) -> MediaWithBytes[Image.Image]:
        """Read the whole file and decode it like any other byte payload."""
        with open(filepath, "rb") as f:
            data = f.read()
        # Same decode + wrap steps as the in-memory path.
        return self.load_bytes(data)

    def encode_base64(
        self,
        media: Image.Image,
        *,
        image_format: str | None = None,
    ) -> str:
        """Convert ``media`` to the target mode and return it base64-encoded.

        Args:
            media: Image to encode.
            image_format: Format passed to ``Image.save``. When omitted, a
                one-time warning is logged and "JPEG" is used.
        """
        if image_format is None:
            # The hint names the real keyword (`image_format`) so that
            # following it actually silences the warning.
            logger.warning_once(
                "The default format of `ImageMediaIO.encode_base64` will be changed "
                'from "JPEG" to "PNG" in v0.15 to avoid lossy compression. '
                "To continue using the old default, "
                'pass `image_format="JPEG"` explicitly to silence this warning.'
            )
            image_format = "JPEG"

        image = media

        with BytesIO() as buffer:
            image = self._convert_image_mode(image)
            image.save(buffer, image_format)
            data = buffer.getvalue()

        return pybase64.b64encode(data).decode("utf-8")
|
||||
|
||||
|
||||
class ImageEmbeddingMediaIO(MediaIO[torch.Tensor]):
    """Load and encode precomputed image embeddings stored as tensors."""

    def __init__(self) -> None:
        super().__init__()

    def load_bytes(self, data: bytes) -> torch.Tensor:
        """Deserialize a tensor from bytes and return it in dense layout."""
        buffer = BytesIO(data)
        # Enable sparse tensor integrity checks to prevent out-of-bounds
        # writes from maliciously crafted tensors
        with torch.sparse.check_sparse_tensor_invariants():
            tensor = torch.load(buffer, weights_only=True)
            return tensor.to_dense()

    def load_base64(self, media_type: str, data: str) -> torch.Tensor:
        """Strictly decode base64, then load; ``media_type`` is unused."""
        return self.load_bytes(pybase64.b64decode(data, validate=True))

    def load_file(self, filepath: Path) -> torch.Tensor:
        """Deserialize a tensor from a file and return it in dense layout."""
        # Enable sparse tensor integrity checks to prevent out-of-bounds
        # writes from maliciously crafted tensors
        with torch.sparse.check_sparse_tensor_invariants():
            tensor = torch.load(filepath, weights_only=True)
            return tensor.to_dense()

    def encode_base64(self, media: torch.Tensor) -> str:
        """Serialize ``media`` with ``torch.save`` and base64-encode it.

        The loaders above read payloads with ``torch.load``, so the encoder
        has to emit the same serialization format. Base64 of the raw numpy
        buffer would drop dtype/shape and could not be loaded back.
        """
        with BytesIO() as buffer:
            torch.save(media, buffer)
            payload = buffer.getvalue()

        return pybase64.b64encode(payload).decode("utf-8")
|
||||
89
vllm/multimodal/media/video.py
Normal file
89
vllm/multimodal/media/video.py
Normal file
@@ -0,0 +1,89 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import base64
|
||||
from functools import partial
|
||||
from pathlib import Path
|
||||
from typing import Any
|
||||
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
from PIL import Image
|
||||
|
||||
from vllm import envs
|
||||
|
||||
from ..video import VIDEO_LOADER_REGISTRY
|
||||
from .base import MediaIO
|
||||
from .image import ImageMediaIO
|
||||
|
||||
|
||||
class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]):
    """Load videos as ``(frames, metadata)`` and encode frames to base64."""

    def __init__(
        self,
        image_io: ImageMediaIO,
        num_frames: int = 32,
        **kwargs,
    ) -> None:
        """Pick the video loader backend and remember the loader arguments.

        Args:
            image_io: Image IO used for per-frame JPEG decoding/encoding.
            num_frames: Forwarded to the loader as ``num_frames``.
            **kwargs: Custom arguments from --media-io-kwargs for this
                modality. ``video_backend`` is consumed here; everything
                else is forwarded to the loader on each ``load_bytes``.
        """
        super().__init__()

        self.image_io = image_io
        self.num_frames = num_frames

        # A per-request backend may be given via kwargs, overriding the
        # global VLLM_VIDEO_LOADER_BACKEND env var, e.g.:
        # --media-io-kwargs '{"video": {"video_backend": "torchcodec"}}'
        # It is popped so that it is not forwarded to the loader.
        requested_backend = kwargs.pop("video_backend", None)
        backend_name = requested_backend or envs.VLLM_VIDEO_LOADER_BACKEND

        # Whatever is left are loader-specific arguments.
        self.kwargs = kwargs
        self.video_loader = VIDEO_LOADER_REGISTRY.load(backend_name)

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, dict[str, Any]]:
        """Hand the raw bytes to the selected loader backend."""
        loader_kwargs = self.kwargs
        return self.video_loader.load_bytes(
            data,
            num_frames=self.num_frames,
            **loader_kwargs,
        )

    def load_base64(
        self, media_type: str, data: str
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode a base64 video payload.

        ``video/jpeg`` is treated as comma-separated base64 JPEG frames and
        yields empty metadata; any other type goes through ``load_bytes``.
        """
        if media_type.lower() != "video/jpeg":
            return self.load_bytes(base64.b64decode(data))

        frames = [
            np.asarray(self.image_io.load_base64("image/jpeg", frame_data))
            for frame_data in data.split(",")
        ]
        return np.stack(frames), {}

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, dict[str, Any]]:
        """Read the whole file and decode it like any other byte payload."""
        return self.load_bytes(filepath.read_bytes())

    def encode_base64(
        self,
        media: npt.NDArray,
        *,
        video_format: str = "JPEG",
    ) -> str:
        """Encode each frame as base64 JPEG and join them with commas.

        Raises:
            NotImplementedError: For any ``video_format`` other than "JPEG".
        """
        if video_format != "JPEG":
            msg = "Only JPEG format is supported for now."
            raise NotImplementedError(msg)

        return ",".join(
            self.image_io.encode_base64(
                Image.fromarray(frame),
                image_format=video_format,
            )
            for frame in media
        )
|
||||
@@ -23,7 +23,6 @@ from vllm.utils.collection_utils import is_list_of
|
||||
from vllm.utils.import_utils import LazyLoader
|
||||
|
||||
from .audio import AudioResampler, AudioSpec, normalize_audio
|
||||
from .base import MediaWithBytes
|
||||
from .inputs import (
|
||||
AudioItem,
|
||||
HfAudioItem,
|
||||
@@ -36,6 +35,7 @@ from .inputs import (
|
||||
MultiModalKwargsItems,
|
||||
VideoItem,
|
||||
)
|
||||
from .media import MediaWithBytes
|
||||
|
||||
_T = TypeVar("_T")
|
||||
_I = TypeVar("_I")
|
||||
|
||||
@@ -22,10 +22,14 @@ from vllm.connections import HTTPConnection, global_http_connection
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.registry import ExtensionManager
|
||||
|
||||
from .audio import AudioEmbeddingMediaIO, AudioMediaIO
|
||||
from .base import MediaIO
|
||||
from .image import ImageEmbeddingMediaIO, ImageMediaIO
|
||||
from .video import VideoMediaIO
|
||||
from .media import (
|
||||
AudioEmbeddingMediaIO,
|
||||
AudioMediaIO,
|
||||
ImageEmbeddingMediaIO,
|
||||
ImageMediaIO,
|
||||
MediaIO,
|
||||
VideoMediaIO,
|
||||
)
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from .inputs import (
|
||||
|
||||
@@ -1,27 +1,19 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import base64
|
||||
import math
|
||||
from abc import abstractmethod
|
||||
from functools import partial
|
||||
from io import BytesIO
|
||||
from pathlib import Path
|
||||
from typing import TYPE_CHECKING, Any, cast
|
||||
|
||||
import numpy as np
|
||||
import numpy.typing as npt
|
||||
from PIL import Image
|
||||
|
||||
if TYPE_CHECKING:
|
||||
import cv2
|
||||
|
||||
from vllm import envs
|
||||
from vllm.logger import init_logger
|
||||
from vllm.utils.registry import ExtensionManager
|
||||
|
||||
from .base import MediaIO
|
||||
from .image import ImageMediaIO
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
@@ -755,76 +747,3 @@ class Molmo2VideoBackend(VideoLoader):
|
||||
**kwargs,
|
||||
)
|
||||
return out
|
||||
|
||||
|
||||
class VideoMediaIO(MediaIO[tuple[npt.NDArray, dict[str, Any]]]):
    """Media IO for videos, producing a frame array and a metadata dict.

    Raw bytes are decoded by a loader obtained from
    ``VIDEO_LOADER_REGISTRY``; frame-wise JPEG input and output is
    delegated to the supplied ``ImageMediaIO``.
    """

    def __init__(
        self,
        image_io: ImageMediaIO,
        num_frames: int = 32,
        **kwargs,
    ) -> None:
        super().__init__()

        self.image_io = image_io
        self.num_frames = num_frames

        # `kwargs` holds the custom arguments given via --media-io-kwargs
        # for this modality. They are forwarded to the underlying media
        # loader (e.g. a custom implementation) for flexible control.

        # The video backend can be overridden through kwargs instead of
        # relying on the global VLLM_VIDEO_LOADER_BACKEND env var, e.g.:
        # --media-io-kwargs '{"video": {"video_backend": "torchcodec"}}'
        backend_override = kwargs.pop("video_backend", None)
        video_loader_backend = backend_override or envs.VLLM_VIDEO_LOADER_BACKEND

        # "video_backend" has been popped, so only loader arguments remain.
        self.kwargs = kwargs
        self.video_loader = VIDEO_LOADER_REGISTRY.load(video_loader_backend)

    def load_bytes(self, data: bytes) -> tuple[npt.NDArray, dict[str, Any]]:
        """Decode raw video bytes with the configured video loader."""
        loader = self.video_loader
        return loader.load_bytes(data, num_frames=self.num_frames, **self.kwargs)

    def load_base64(
        self, media_type: str, data: str
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """Load a video from base64-encoded text.

        ``video/jpeg`` (compared case-insensitively) is treated as
        comma-separated base64 JPEG frames, which are stacked and returned
        with an empty metadata dict. Any other media type is base64-decoded
        and handed to ``load_bytes``.
        """
        if media_type.lower() != "video/jpeg":
            return self.load_bytes(base64.b64decode(data))

        decode_image = self.image_io.load_base64
        frames = []
        for encoded_frame in data.split(","):
            frames.append(np.asarray(decode_image("image/jpeg", encoded_frame)))

        return np.stack(frames), {}

    def load_file(self, filepath: Path) -> tuple[npt.NDArray, dict[str, Any]]:
        """Read ``filepath`` in binary mode and decode it via ``load_bytes``."""
        return self.load_bytes(filepath.read_bytes())

    def encode_base64(
        self,
        media: npt.NDArray,
        *,
        video_format: str = "JPEG",
    ) -> str:
        """Encode video frames as comma-separated base64 images.

        Raises:
            NotImplementedError: If ``video_format`` is anything but
                ``"JPEG"``.
        """
        if video_format != "JPEG":
            msg = "Only JPEG format is supported for now."
            raise NotImplementedError(msg)

        encode_image = self.image_io.encode_base64
        encoded_frames = [
            encode_image(Image.fromarray(frame), image_format=video_format)
            for frame in media
        ]
        return ",".join(encoded_frames)
|
||||
|
||||
Reference in New Issue
Block a user