[Chore] Move MediaConnector to vllm.multimodal.media (#33324)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2026-01-30 00:54:31 +08:00
committed by GitHub
parent 5a66c9cc76
commit 831453fcef
8 changed files with 380 additions and 349 deletions

View File

@@ -14,7 +14,8 @@ from PIL import Image, ImageChops
from vllm.multimodal.image import convert_image_mode
from vllm.multimodal.inputs import PlaceholderRange
from vllm.multimodal.utils import MediaConnector, argsort_mm_positions
from vllm.multimodal.media import MediaConnector
from vllm.multimodal.utils import argsort_mm_positions
# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
TEST_IMAGE_ASSETS = [

View File

@@ -50,8 +50,8 @@ from vllm.multimodal.inputs import (
VisionChunkImage,
VisionChunkVideo,
)
from vllm.multimodal.media import MEDIA_CONNECTOR_REGISTRY, MediaConnector
from vllm.multimodal.processing import BaseMultiModalProcessor
from vllm.multimodal.utils import MEDIA_CONNECTOR_REGISTRY, MediaConnector
from vllm.utils import random_uuid
from vllm.utils.collection_utils import is_list_of
from vllm.utils.import_utils import LazyLoader

View File

@@ -58,8 +58,8 @@ from vllm.multimodal.processing import (
PromptReplacement,
PromptUpdate,
)
from vllm.tokenizers import TokenizerLike
from vllm.transformers_utils.configs.radio import RadioConfig
from vllm.transformers_utils.tokenizer import TokenizerLike
from vllm.utils.tensor_schema import TensorSchema, TensorShape
from vllm.v1.attention.backend import AttentionType

View File

@@ -27,12 +27,12 @@ from vllm.utils.collection_utils import is_list_of
from vllm.utils.import_utils import LazyLoader
from vllm.utils.jsontree import json_map_leaves
from .media import MediaWithBytes
if TYPE_CHECKING:
import torch
import torch.types
from transformers.feature_extraction_utils import BatchFeature
from .media import MediaWithBytes
else:
torch = LazyLoader("torch", globals(), "torch")
@@ -58,7 +58,7 @@ Represents a single audio
item, which can be passed to a HuggingFace `AudioProcessor`.
"""
ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor", "MediaWithBytes[HfImageItem]"]
ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor", MediaWithBytes[HfImageItem]]
"""
A `transformers.image_utils.ImageInput` representing a single image
item, which can be passed to a HuggingFace `ImageProcessor`.

View File

@@ -2,8 +2,9 @@
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from .audio import AudioEmbeddingMediaIO, AudioMediaIO
from .base import MediaIO, MediaWithBytes
from .connector import MEDIA_CONNECTOR_REGISTRY, MediaConnector
from .image import ImageEmbeddingMediaIO, ImageMediaIO
from .video import VideoMediaIO
from .video import VIDEO_LOADER_REGISTRY, VideoMediaIO
__all__ = [
"MediaIO",
@@ -12,5 +13,8 @@ __all__ = [
"AudioMediaIO",
"ImageEmbeddingMediaIO",
"ImageMediaIO",
"VIDEO_LOADER_REGISTRY",
"VideoMediaIO",
"MEDIA_CONNECTOR_REGISTRY",
"MediaConnector",
]

View File

@@ -0,0 +1,343 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import atexit
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Any, TypeVar
from urllib.request import url2pathname
import numpy as np
import numpy.typing as npt
import torch
from PIL import Image, UnidentifiedImageError
from urllib3.util import Url, parse_url
import vllm.envs as envs
from vllm.connections import HTTPConnection, global_http_connection
from vllm.utils.registry import ExtensionManager
from .audio import AudioEmbeddingMediaIO, AudioMediaIO
from .base import MediaIO
from .image import ImageEmbeddingMediaIO, ImageMediaIO
from .video import VideoMediaIO
# Type variable for the value a `MediaIO[_M]` loader produces; the
# `load_from_url*` methods below return this same type.
_M = TypeVar("_M")
# Shared worker pool used by `MediaConnector.load_from_url_async` to run
# media decoding through `loop.run_in_executor`. Its size comes from the
# `VLLM_MEDIA_LOADING_THREAD_COUNT` environment setting.
global_thread_pool = ThreadPoolExecutor(
max_workers=envs.VLLM_MEDIA_LOADING_THREAD_COUNT
)
# Shut the pool down when the interpreter exits.
atexit.register(global_thread_pool.shutdown)
# Registry of media connector implementations; `MediaConnector` is
# registered under the name "http".
MEDIA_CONNECTOR_REGISTRY = ExtensionManager()
@MEDIA_CONNECTOR_REGISTRY.register("http")
class MediaConnector:
    """Load multi-modal media from HTTP(S), `data:` and `file:` URLs.

    The connector dispatches on the URL scheme, obtains the raw content and
    hands it to a `MediaIO` implementation that decodes it. Synchronous and
    asynchronous variants are provided for every loader.
    """

    def __init__(
        self,
        media_io_kwargs: dict[str, dict[str, Any]] | None = None,
        connection: HTTPConnection = global_http_connection,
        *,
        allowed_local_media_path: str = "",
        allowed_media_domains: list[str] | None = None,
    ) -> None:
        """
        Args:
            media_io_kwargs: Additional args passed to process media
                             inputs, keyed by modalities. For example,
                             to set num_frames for video, set
                             `--media-io-kwargs '{"video":{"num_frames":40}}'`
            connection: HTTP connection client to download media contents.
            allowed_local_media_path: A local directory to load media files from.
                An empty string disables loading from `file:` URLs.
            allowed_media_domains: If set, only media URLs that belong to this
                domain can be used for multi-modal inputs.

        Raises:
            ValueError: If `allowed_local_media_path` is given but does not
                exist or is not a directory.
        """
        super().__init__()

        self.media_io_kwargs: dict[str, dict[str, Any]] = (
            media_io_kwargs if media_io_kwargs else {}
        )
        self.connection = connection

        if allowed_local_media_path:
            allowed_local_media_path_ = Path(allowed_local_media_path)

            if not allowed_local_media_path_.exists():
                raise ValueError(
                    "Invalid `--allowed-local-media-path`: The path "
                    f"{allowed_local_media_path_} does not exist."
                )
            if not allowed_local_media_path_.is_dir():
                raise ValueError(
                    "Invalid `--allowed-local-media-path`: The path "
                    f"{allowed_local_media_path_} must be a directory."
                )
        else:
            allowed_local_media_path_ = None

        self.allowed_local_media_path = allowed_local_media_path_

        # An empty list means "no domain restriction"
        # (see `_assert_url_in_allowed_media_domains`).
        if allowed_media_domains is None:
            allowed_media_domains = []
        self.allowed_media_domains = allowed_media_domains

    def _load_data_url(
        self,
        url_spec: Url,
        media_io: MediaIO[_M],
    ) -> _M:  # type: ignore[type-var]
        """Decode a `data:` URL of the form `<media_type>;base64,<data>`.

        Raises:
            ValueError: If the URL lacks the `,` or `;` separators.
            NotImplementedError: If the encoding is anything but `base64`.
        """
        url_spec_path = url_spec.path or ""

        # Use `partition` instead of tuple-unpacking a `split` so that a
        # malformed URL yields a descriptive error rather than
        # "not enough values to unpack".
        data_spec, comma, data = url_spec_path.partition(",")
        if not comma:
            raise ValueError(
                "Invalid data URL: expected a ',' separating the header "
                "from the data."
            )

        media_type, semicolon, data_type = data_spec.partition(";")
        if not semicolon:
            raise ValueError(
                "Invalid data URL: expected a header of the form "
                "'<media_type>;base64'."
            )

        # media_type starts with a leading "/" (e.g., "/video/jpeg")
        media_type = media_type.lstrip("/")

        if data_type != "base64":
            msg = "Only base64 data URLs are supported for now."
            raise NotImplementedError(msg)

        return media_io.load_base64(media_type, data)

    def _load_file_url(
        self,
        url_spec: Url,
        media_io: MediaIO[_M],
    ) -> _M:  # type: ignore[type-var]
        """Load a `file:` URL, restricted to `allowed_local_media_path`.

        Raises:
            RuntimeError: If no allowed local media path was configured.
            ValueError: If the file is not located below the allowed path.
        """
        allowed_local_media_path = self.allowed_local_media_path
        if allowed_local_media_path is None:
            raise RuntimeError(
                "Cannot load local files without `--allowed-local-media-path`."
            )

        url_spec_path = url_spec.path or ""
        url_spec_netloc = url_spec.netloc or ""

        filepath = Path(url2pathname(url_spec_netloc + url_spec_path))
        # Resolve BOTH sides before comparing: `filepath.resolve().parents`
        # holds absolute, symlink-free paths, so an unresolved (relative or
        # symlinked) allowed directory would never match and every file
        # would be rejected.
        if allowed_local_media_path.resolve() not in filepath.resolve().parents:
            raise ValueError(
                f"The file path {filepath} must be a subpath "
                f"of `--allowed-local-media-path {allowed_local_media_path}`."
            )

        return media_io.load_file(filepath)

    def _assert_url_in_allowed_media_domains(self, url_spec: Url) -> None:
        """Raise `ValueError` if a domain allow-list is set and the URL's
        hostname is not in it. An empty allow-list permits every host."""
        if (
            self.allowed_media_domains
            and url_spec.hostname not in self.allowed_media_domains
        ):
            raise ValueError(
                f"The URL must be from one of the allowed domains: "
                f"{self.allowed_media_domains}. Input URL domain: "
                f"{url_spec.hostname}"
            )

    def load_from_url(
        self,
        url: str,
        media_io: MediaIO[_M],
        *,
        fetch_timeout: int | None = None,
    ) -> _M:  # type: ignore[type-var]
        """Load media from `url` and decode it with `media_io`.

        Args:
            url: A URL whose scheme starts with "http", or is "data" or "file".
            media_io: Decoder used to turn the fetched content into media.
            fetch_timeout: Timeout forwarded to the HTTP connection; only
                used for HTTP(S) URLs.

        Raises:
            ValueError: If the scheme is unsupported or the domain/path
                checks fail.
        """
        url_spec = parse_url(url)

        if url_spec.scheme and url_spec.scheme.startswith("http"):
            self._assert_url_in_allowed_media_domains(url_spec)

            connection = self.connection
            data = connection.get_bytes(
                url,
                timeout=fetch_timeout,
                allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
            )

            return media_io.load_bytes(data)

        if url_spec.scheme == "data":
            return self._load_data_url(url_spec, media_io)

        if url_spec.scheme == "file":
            return self._load_file_url(url_spec, media_io)

        msg = "The URL must be either a HTTP, data or file URL."
        raise ValueError(msg)

    async def load_from_url_async(
        self,
        url: str,
        media_io: MediaIO[_M],
        *,
        fetch_timeout: int | None = None,
    ) -> _M:
        """Asynchronous counterpart of `load_from_url`.

        The HTTP download is awaited on the connection, while decoding (and
        data/file loading) is dispatched to `global_thread_pool` so it does
        not run on the event loop thread.
        """
        url_spec = parse_url(url)
        loop = asyncio.get_running_loop()

        if url_spec.scheme and url_spec.scheme.startswith("http"):
            self._assert_url_in_allowed_media_domains(url_spec)

            connection = self.connection
            data = await connection.async_get_bytes(
                url,
                timeout=fetch_timeout,
                allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
            )
            future = loop.run_in_executor(global_thread_pool, media_io.load_bytes, data)
            return await future

        if url_spec.scheme == "data":
            future = loop.run_in_executor(
                global_thread_pool, self._load_data_url, url_spec, media_io
            )
            return await future

        if url_spec.scheme == "file":
            future = loop.run_in_executor(
                global_thread_pool, self._load_file_url, url_spec, media_io
            )
            return await future

        msg = "The URL must be either a HTTP, data or file URL."
        raise ValueError(msg)

    def fetch_audio(
        self,
        audio_url: str,
    ) -> tuple[np.ndarray, int | float]:
        """
        Load audio from a URL.

        The `"audio"` entry of `media_io_kwargs` is forwarded to `AudioMediaIO`.
        """
        audio_io = AudioMediaIO(**self.media_io_kwargs.get("audio", {}))

        return self.load_from_url(
            audio_url,
            audio_io,
            fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
        )

    async def fetch_audio_async(
        self,
        audio_url: str,
    ) -> tuple[np.ndarray, int | float]:
        """
        Asynchronously fetch audio from a URL.

        The `"audio"` entry of `media_io_kwargs` is forwarded to `AudioMediaIO`.
        """
        audio_io = AudioMediaIO(**self.media_io_kwargs.get("audio", {}))

        return await self.load_from_url_async(
            audio_url,
            audio_io,
            fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
        )

    def fetch_image(
        self,
        image_url: str,
        *,
        image_mode: str = "RGB",
    ) -> Image.Image:
        """
        Load a PIL image from an HTTP, base64 data or file URL.

        By default, the image is converted into RGB format.

        Raises:
            ValueError: If the content cannot be identified as an image.
        """
        image_io = ImageMediaIO(
            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
        )

        try:
            return self.load_from_url(
                image_url,
                image_io,
                fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
            )
        except UnidentifiedImageError as e:
            # convert to ValueError to be properly caught upstream
            raise ValueError(str(e)) from e

    async def fetch_image_async(
        self,
        image_url: str,
        *,
        image_mode: str = "RGB",
    ) -> Image.Image:
        """
        Asynchronously load a PIL image from an HTTP, base64 data or file URL.

        By default, the image is converted into RGB format.

        Raises:
            ValueError: If the content cannot be identified as an image.
        """
        image_io = ImageMediaIO(
            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
        )

        try:
            return await self.load_from_url_async(
                image_url,
                image_io,
                fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
            )
        except UnidentifiedImageError as e:
            # convert to ValueError to be properly caught upstream
            raise ValueError(str(e)) from e

    def fetch_video(
        self,
        video_url: str,
        *,
        image_mode: str = "RGB",
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """
        Load video from an HTTP, base64 data or file URL.

        `image_mode` and the `"image"` kwargs configure the `ImageMediaIO`
        that is passed to `VideoMediaIO`; the `"video"` kwargs go to
        `VideoMediaIO` itself.
        """
        image_io = ImageMediaIO(
            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
        )
        video_io = VideoMediaIO(image_io, **self.media_io_kwargs.get("video", {}))

        return self.load_from_url(
            video_url,
            video_io,
            fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
        )

    async def fetch_video_async(
        self,
        video_url: str,
        *,
        image_mode: str = "RGB",
    ) -> tuple[npt.NDArray, dict[str, Any]]:
        """
        Asynchronously load video from an HTTP, base64 data or file URL.

        By default, the image is converted into RGB format.
        """
        image_io = ImageMediaIO(
            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
        )
        video_io = VideoMediaIO(image_io, **self.media_io_kwargs.get("video", {}))

        return await self.load_from_url_async(
            video_url,
            video_io,
            fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
        )

    def fetch_image_embedding(
        self,
        data: str,
    ) -> torch.Tensor:
        """
        Load an image embedding from a base64-encoded string.

        `data` is passed straight to `ImageEmbeddingMediaIO.load_base64`;
        no URL is parsed or fetched.
        """
        image_embedding_io = ImageEmbeddingMediaIO()

        return image_embedding_io.load_base64("", data)

    def fetch_audio_embedding(
        self,
        data: str,
    ) -> torch.Tensor:
        """
        Load an audio embedding from a base64-encoded string.

        `data` is passed straight to `AudioEmbeddingMediaIO.load_base64`;
        no URL is parsed or fetched.
        """
        audio_embedding_io = AudioEmbeddingMediaIO()

        return audio_embedding_io.load_base64("", data)

View File

@@ -1,367 +1,50 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import asyncio
import atexit
import mimetypes
import warnings
from collections.abc import Generator
from concurrent.futures import ThreadPoolExecutor
from itertools import groupby
from pathlib import Path
from typing import TYPE_CHECKING, Any, TypeVar
from urllib.request import url2pathname
from typing import TYPE_CHECKING, Any
import numpy as np
import numpy.typing as npt
import torch
from PIL import Image, UnidentifiedImageError
from urllib3.util import Url, parse_url
from PIL import Image
import vllm.envs as envs
from vllm.connections import HTTPConnection, global_http_connection
from vllm.logger import init_logger
from vllm.utils.registry import ExtensionManager
from vllm.utils.import_utils import LazyLoader
from .media import (
AudioEmbeddingMediaIO,
AudioMediaIO,
ImageEmbeddingMediaIO,
ImageMediaIO,
MediaIO,
VideoMediaIO,
from .inputs import (
BatchedTensorInputs,
MultiModalKwargsItem,
MultiModalKwargsItems,
MultiModalPlaceholderDict,
)
from .media import AudioMediaIO, ImageMediaIO, MediaConnector, VideoMediaIO
if TYPE_CHECKING:
from .inputs import (
BatchedTensorInputs,
MultiModalKwargsItem,
MultiModalPlaceholderDict,
)
import torch.types
else:
BatchedTensorInputs = Any
MultiModalKwargsItem = Any
MultiModalPlaceholderDict = Any
torch = LazyLoader("torch", globals(), "torch")
logger = init_logger(__name__)
global_thread_pool = ThreadPoolExecutor(
max_workers=envs.VLLM_MEDIA_LOADING_THREAD_COUNT
)
atexit.register(global_thread_pool.shutdown)
_M = TypeVar("_M")
def __getattr__(name: str):
if name == "MEDIA_CONNECTOR_REGISTRY":
from .media import MEDIA_CONNECTOR_REGISTRY
MEDIA_CONNECTOR_REGISTRY = ExtensionManager()
@MEDIA_CONNECTOR_REGISTRY.register("http")
class MediaConnector:
def __init__(
self,
media_io_kwargs: dict[str, dict[str, Any]] | None = None,
connection: HTTPConnection = global_http_connection,
*,
allowed_local_media_path: str = "",
allowed_media_domains: list[str] | None = None,
) -> None:
"""
Args:
media_io_kwargs: Additional args passed to process media
inputs, keyed by modalities. For example,
to set num_frames for video, set
`--media-io-kwargs '{"video":{"num_frames":40}}'`
connection: HTTP connection client to download media contents.
allowed_local_media_path: A local directory to load media files from.
allowed_media_domains: If set, only media URLs that belong to this
domain can be used for multi-modal inputs.
"""
super().__init__()
self.media_io_kwargs: dict[str, dict[str, Any]] = (
media_io_kwargs if media_io_kwargs else {}
)
self.connection = connection
if allowed_local_media_path:
allowed_local_media_path_ = Path(allowed_local_media_path)
if not allowed_local_media_path_.exists():
raise ValueError(
"Invalid `--allowed-local-media-path`: The path "
f"{allowed_local_media_path_} does not exist."
)
if not allowed_local_media_path_.is_dir():
raise ValueError(
"Invalid `--allowed-local-media-path`: The path "
f"{allowed_local_media_path_} must be a directory."
)
else:
allowed_local_media_path_ = None
self.allowed_local_media_path = allowed_local_media_path_
if allowed_media_domains is None:
allowed_media_domains = []
self.allowed_media_domains = allowed_media_domains
def _load_data_url(
self,
url_spec: Url,
media_io: MediaIO[_M],
) -> _M: # type: ignore[type-var]
url_spec_path = url_spec.path or ""
data_spec, data = url_spec_path.split(",", 1)
media_type, data_type = data_spec.split(";", 1)
# media_type starts with a leading "/" (e.g., "/video/jpeg")
media_type = media_type.lstrip("/")
if data_type != "base64":
msg = "Only base64 data URLs are supported for now."
raise NotImplementedError(msg)
return media_io.load_base64(media_type, data)
def _load_file_url(
self,
url_spec: Url,
media_io: MediaIO[_M],
) -> _M: # type: ignore[type-var]
allowed_local_media_path = self.allowed_local_media_path
if allowed_local_media_path is None:
raise RuntimeError(
"Cannot load local files without `--allowed-local-media-path`."
)
url_spec_path = url_spec.path or ""
url_spec_netloc = url_spec.netloc or ""
filepath = Path(url2pathname(url_spec_netloc + url_spec_path))
if allowed_local_media_path not in filepath.resolve().parents:
raise ValueError(
f"The file path {filepath} must be a subpath "
f"of `--allowed-local-media-path {allowed_local_media_path}`."
)
return media_io.load_file(filepath)
def _assert_url_in_allowed_media_domains(self, url_spec: Url) -> None:
if (
self.allowed_media_domains
and url_spec.hostname not in self.allowed_media_domains
):
raise ValueError(
f"The URL must be from one of the allowed domains: "
f"{self.allowed_media_domains}. Input URL domain: "
f"{url_spec.hostname}"
)
def load_from_url(
self,
url: str,
media_io: MediaIO[_M],
*,
fetch_timeout: int | None = None,
) -> _M: # type: ignore[type-var]
url_spec = parse_url(url)
if url_spec.scheme and url_spec.scheme.startswith("http"):
self._assert_url_in_allowed_media_domains(url_spec)
connection = self.connection
data = connection.get_bytes(
url,
timeout=fetch_timeout,
allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
)
return media_io.load_bytes(data)
if url_spec.scheme == "data":
return self._load_data_url(url_spec, media_io)
if url_spec.scheme == "file":
return self._load_file_url(url_spec, media_io)
msg = "The URL must be either a HTTP, data or file URL."
raise ValueError(msg)
async def load_from_url_async(
self,
url: str,
media_io: MediaIO[_M],
*,
fetch_timeout: int | None = None,
) -> _M:
url_spec = parse_url(url)
loop = asyncio.get_running_loop()
if url_spec.scheme and url_spec.scheme.startswith("http"):
self._assert_url_in_allowed_media_domains(url_spec)
connection = self.connection
data = await connection.async_get_bytes(
url,
timeout=fetch_timeout,
allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
)
future = loop.run_in_executor(global_thread_pool, media_io.load_bytes, data)
return await future
if url_spec.scheme == "data":
future = loop.run_in_executor(
global_thread_pool, self._load_data_url, url_spec, media_io
)
return await future
if url_spec.scheme == "file":
future = loop.run_in_executor(
global_thread_pool, self._load_file_url, url_spec, media_io
)
return await future
msg = "The URL must be either a HTTP, data or file URL."
raise ValueError(msg)
def fetch_audio(
self,
audio_url: str,
) -> tuple[np.ndarray, int | float]:
"""
Load audio from a URL.
"""
audio_io = AudioMediaIO(**self.media_io_kwargs.get("audio", {}))
return self.load_from_url(
audio_url,
audio_io,
fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
warnings.warn(
"`vllm.multimodal.utils.MEDIA_CONNECTOR_REGISTRY` "
"has been moved to `vllm.multimodal.media.MEDIA_CONNECTOR_REGISTRY`. "
"The old name will be removed in v0.17.",
DeprecationWarning,
stacklevel=2,
)
async def fetch_audio_async(
self,
audio_url: str,
) -> tuple[np.ndarray, int | float]:
"""
Asynchronously fetch audio from a URL.
"""
audio_io = AudioMediaIO(**self.media_io_kwargs.get("audio", {}))
return MEDIA_CONNECTOR_REGISTRY
return await self.load_from_url_async(
audio_url,
audio_io,
fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
)
def fetch_image(
self,
image_url: str,
*,
image_mode: str = "RGB",
) -> Image.Image:
"""
Load a PIL image from an HTTP or base64 data URL.
By default, the image is converted into RGB format.
"""
image_io = ImageMediaIO(
image_mode=image_mode, **self.media_io_kwargs.get("image", {})
)
try:
return self.load_from_url(
image_url,
image_io,
fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
)
except UnidentifiedImageError as e:
# convert to ValueError to be properly caught upstream
raise ValueError(str(e)) from e
async def fetch_image_async(
self,
image_url: str,
*,
image_mode: str = "RGB",
) -> Image.Image:
"""
Asynchronously load a PIL image from an HTTP or base64 data URL.
By default, the image is converted into RGB format.
"""
image_io = ImageMediaIO(
image_mode=image_mode, **self.media_io_kwargs.get("image", {})
)
try:
return await self.load_from_url_async(
image_url,
image_io,
fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
)
except UnidentifiedImageError as e:
# convert to ValueError to be properly caught upstream
raise ValueError(str(e)) from e
def fetch_video(
self,
video_url: str,
*,
image_mode: str = "RGB",
) -> tuple[npt.NDArray, dict[str, Any]]:
"""
Load video from an HTTP or base64 data URL.
"""
image_io = ImageMediaIO(
image_mode=image_mode, **self.media_io_kwargs.get("image", {})
)
video_io = VideoMediaIO(image_io, **self.media_io_kwargs.get("video", {}))
return self.load_from_url(
video_url,
video_io,
fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
)
async def fetch_video_async(
self,
video_url: str,
*,
image_mode: str = "RGB",
) -> tuple[npt.NDArray, dict[str, Any]]:
"""
Asynchronously load video from an HTTP or base64 data URL.
By default, the image is converted into RGB format.
"""
image_io = ImageMediaIO(
image_mode=image_mode, **self.media_io_kwargs.get("image", {})
)
video_io = VideoMediaIO(image_io, **self.media_io_kwargs.get("video", {}))
return await self.load_from_url_async(
video_url,
video_io,
fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
)
def fetch_image_embedding(
self,
data: str,
) -> torch.Tensor:
"""
Load image embedding from a URL.
"""
image_embedding_io = ImageEmbeddingMediaIO()
return image_embedding_io.load_base64("", data)
def fetch_audio_embedding(
self,
data: str,
) -> torch.Tensor:
"""
Load audio embedding from a URL.
"""
audio_embedding_io = AudioEmbeddingMediaIO()
return audio_embedding_io.load_base64("", data)
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
def encode_audio_base64(
@@ -483,8 +166,6 @@ def group_mm_kwargs_by_modality(
Yields:
A tuple `(modality, num_items, grouped_kwargs)`.
"""
from vllm.multimodal.inputs import MultiModalKwargsItems
for modality, group in groupby(mm_kwargs, key=lambda x: x[0]):
items_lst = [item for _, item in group]
mm_kwargs_items = MultiModalKwargsItems({modality: items_lst})

View File

@@ -17,3 +17,5 @@ def __getattr__(name: str):
)
return get_tokenizer
raise AttributeError(f"module {__name__!r} has no attribute {name!r}")