diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index 02bb1f769..0a10bc1bb 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -14,7 +14,8 @@ from PIL import Image, ImageChops
 
 from vllm.multimodal.image import convert_image_mode
 from vllm.multimodal.inputs import PlaceholderRange
-from vllm.multimodal.utils import MediaConnector, argsort_mm_positions
+from vllm.multimodal.media import MediaConnector
+from vllm.multimodal.utils import argsort_mm_positions
 
 # Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
 TEST_IMAGE_ASSETS = [
diff --git a/vllm/entrypoints/chat_utils.py b/vllm/entrypoints/chat_utils.py
index 82fdb281e..5c8683cbd 100644
--- a/vllm/entrypoints/chat_utils.py
+++ b/vllm/entrypoints/chat_utils.py
@@ -50,8 +50,8 @@ from vllm.multimodal.inputs import (
     VisionChunkImage,
     VisionChunkVideo,
 )
+from vllm.multimodal.media import MEDIA_CONNECTOR_REGISTRY, MediaConnector
 from vllm.multimodal.processing import BaseMultiModalProcessor
-from vllm.multimodal.utils import MEDIA_CONNECTOR_REGISTRY, MediaConnector
 from vllm.utils import random_uuid
 from vllm.utils.collection_utils import is_list_of
 from vllm.utils.import_utils import LazyLoader
diff --git a/vllm/model_executor/models/nemotron_parse.py b/vllm/model_executor/models/nemotron_parse.py
index afefafcc4..5069bf239 100644
--- a/vllm/model_executor/models/nemotron_parse.py
+++ b/vllm/model_executor/models/nemotron_parse.py
@@ -58,8 +58,8 @@ from vllm.multimodal.processing import (
     PromptReplacement,
     PromptUpdate,
 )
+from vllm.tokenizers import TokenizerLike
 from vllm.transformers_utils.configs.radio import RadioConfig
-from vllm.transformers_utils.tokenizer import TokenizerLike
 from vllm.utils.tensor_schema import TensorSchema, TensorShape
 from vllm.v1.attention.backend import AttentionType
 
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index cf6795204..a5da362cd 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -27,12 +27,12 @@ from vllm.utils.collection_utils import is_list_of
 from vllm.utils.import_utils import LazyLoader
 from vllm.utils.jsontree import json_map_leaves
 
+from .media import MediaWithBytes
+
 if TYPE_CHECKING:
     import torch
     import torch.types
     from transformers.feature_extraction_utils import BatchFeature
-
-    from .media import MediaWithBytes
 else:
     torch = LazyLoader("torch", globals(), "torch")
 
@@ -58,7 +58,7 @@ Represents a single audio
 item, which can be passed to a HuggingFace `AudioProcessor`.
 """
 
-ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor", "MediaWithBytes[HfImageItem]"]
+ImageItem: TypeAlias = Union[HfImageItem, "torch.Tensor", MediaWithBytes[HfImageItem]]
 """
 A `transformers.image_utils.ImageInput` representing a single image
 item, which can be passed to a HuggingFace `ImageProcessor`.
diff --git a/vllm/multimodal/media/__init__.py b/vllm/multimodal/media/__init__.py
index 02d288715..94b08c484 100644
--- a/vllm/multimodal/media/__init__.py
+++ b/vllm/multimodal/media/__init__.py
@@ -2,8 +2,9 @@
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 from .audio import AudioEmbeddingMediaIO, AudioMediaIO
 from .base import MediaIO, MediaWithBytes
+from .connector import MEDIA_CONNECTOR_REGISTRY, MediaConnector
 from .image import ImageEmbeddingMediaIO, ImageMediaIO
-from .video import VideoMediaIO
+from .video import VIDEO_LOADER_REGISTRY, VideoMediaIO
 
 __all__ = [
     "MediaIO",
@@ -12,5 +13,8 @@ __all__ = [
     "AudioMediaIO",
     "ImageEmbeddingMediaIO",
     "ImageMediaIO",
+    "VIDEO_LOADER_REGISTRY",
     "VideoMediaIO",
+    "MEDIA_CONNECTOR_REGISTRY",
+    "MediaConnector",
 ]
diff --git a/vllm/multimodal/media/connector.py b/vllm/multimodal/media/connector.py
new file mode 100644
index 000000000..37dc67aca
--- /dev/null
+++ b/vllm/multimodal/media/connector.py
@@ -0,0 +1,343 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+
+import asyncio
+import atexit
+from concurrent.futures import ThreadPoolExecutor
+from pathlib import Path
+from typing import Any, TypeVar
+from urllib.request import url2pathname
+
+import numpy as np
+import numpy.typing as npt
+import torch
+from PIL import Image, UnidentifiedImageError
+from urllib3.util import Url, parse_url
+
+import vllm.envs as envs
+from vllm.connections import HTTPConnection, global_http_connection
+from vllm.utils.registry import ExtensionManager
+
+from .audio import AudioEmbeddingMediaIO, AudioMediaIO
+from .base import MediaIO
+from .image import ImageEmbeddingMediaIO, ImageMediaIO
+from .video import VideoMediaIO
+
+_M = TypeVar("_M")
+
+global_thread_pool = ThreadPoolExecutor(
+    max_workers=envs.VLLM_MEDIA_LOADING_THREAD_COUNT
+)
+atexit.register(global_thread_pool.shutdown)
+
+MEDIA_CONNECTOR_REGISTRY = ExtensionManager()
+
+
+@MEDIA_CONNECTOR_REGISTRY.register("http")
+class MediaConnector:
+    def __init__(
+        self,
+        media_io_kwargs: dict[str, dict[str, Any]] | None = None,
+        connection: HTTPConnection = global_http_connection,
+        *,
+        allowed_local_media_path: str = "",
+        allowed_media_domains: list[str] | None = None,
+    ) -> None:
+        """
+        Args:
+            media_io_kwargs: Additional args passed to process media
+                             inputs, keyed by modalities. For example,
+                             to set num_frames for video, set
+                             `--media-io-kwargs '{"video":{"num_frames":40}}'`
+            connection: HTTP connection client to download media contents.
+            allowed_local_media_path: A local directory to load media files from.
+            allowed_media_domains: If set, only media URLs that belong to this
+                                   domain can be used for multi-modal inputs.
+        """
+        super().__init__()
+
+        self.media_io_kwargs: dict[str, dict[str, Any]] = (
+            media_io_kwargs if media_io_kwargs else {}
+        )
+        self.connection = connection
+
+        if allowed_local_media_path:
+            allowed_local_media_path_ = Path(allowed_local_media_path)
+
+            if not allowed_local_media_path_.exists():
+                raise ValueError(
+                    "Invalid `--allowed-local-media-path`: The path "
+                    f"{allowed_local_media_path_} does not exist."
+                )
+            if not allowed_local_media_path_.is_dir():
+                raise ValueError(
+                    "Invalid `--allowed-local-media-path`: The path "
+                    f"{allowed_local_media_path_} must be a directory."
+                )
+        else:
+            allowed_local_media_path_ = None
+
+        self.allowed_local_media_path = allowed_local_media_path_
+        if allowed_media_domains is None:
+            allowed_media_domains = []
+        self.allowed_media_domains = allowed_media_domains
+
+    def _load_data_url(
+        self,
+        url_spec: Url,
+        media_io: MediaIO[_M],
+    ) -> _M:  # type: ignore[type-var]
+        url_spec_path = url_spec.path or ""
+        data_spec, data = url_spec_path.split(",", 1)
+        media_type, data_type = data_spec.split(";", 1)
+        # media_type starts with a leading "/" (e.g., "/video/jpeg")
+        media_type = media_type.lstrip("/")
+
+        if data_type != "base64":
+            msg = "Only base64 data URLs are supported for now."
+            raise NotImplementedError(msg)
+
+        return media_io.load_base64(media_type, data)
+
+    def _load_file_url(
+        self,
+        url_spec: Url,
+        media_io: MediaIO[_M],
+    ) -> _M:  # type: ignore[type-var]
+        allowed_local_media_path = self.allowed_local_media_path
+        if allowed_local_media_path is None:
+            raise RuntimeError(
+                "Cannot load local files without `--allowed-local-media-path`."
+            )
+
+        url_spec_path = url_spec.path or ""
+        url_spec_netloc = url_spec.netloc or ""
+        filepath = Path(url2pathname(url_spec_netloc + url_spec_path))
+        if allowed_local_media_path not in filepath.resolve().parents:
+            raise ValueError(
+                f"The file path {filepath} must be a subpath "
+                f"of `--allowed-local-media-path {allowed_local_media_path}`."
+            )
+
+        return media_io.load_file(filepath)
+
+    def _assert_url_in_allowed_media_domains(self, url_spec: Url) -> None:
+        if (
+            self.allowed_media_domains
+            and url_spec.hostname not in self.allowed_media_domains
+        ):
+            raise ValueError(
+                f"The URL must be from one of the allowed domains: "
+                f"{self.allowed_media_domains}. Input URL domain: "
+                f"{url_spec.hostname}"
+            )
+
+    def load_from_url(
+        self,
+        url: str,
+        media_io: MediaIO[_M],
+        *,
+        fetch_timeout: int | None = None,
+    ) -> _M:  # type: ignore[type-var]
+        url_spec = parse_url(url)
+
+        if url_spec.scheme and url_spec.scheme.startswith("http"):
+            self._assert_url_in_allowed_media_domains(url_spec)
+
+            connection = self.connection
+            data = connection.get_bytes(
+                url,
+                timeout=fetch_timeout,
+                allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
+            )
+
+            return media_io.load_bytes(data)
+
+        if url_spec.scheme == "data":
+            return self._load_data_url(url_spec, media_io)
+
+        if url_spec.scheme == "file":
+            return self._load_file_url(url_spec, media_io)
+
+        msg = "The URL must be either a HTTP, data or file URL."
+        raise ValueError(msg)
+
+    async def load_from_url_async(
+        self,
+        url: str,
+        media_io: MediaIO[_M],
+        *,
+        fetch_timeout: int | None = None,
+    ) -> _M:
+        url_spec = parse_url(url)
+        loop = asyncio.get_running_loop()
+
+        if url_spec.scheme and url_spec.scheme.startswith("http"):
+            self._assert_url_in_allowed_media_domains(url_spec)
+
+            connection = self.connection
+            data = await connection.async_get_bytes(
+                url,
+                timeout=fetch_timeout,
+                allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
+            )
+            future = loop.run_in_executor(global_thread_pool, media_io.load_bytes, data)
+            return await future
+
+        if url_spec.scheme == "data":
+            future = loop.run_in_executor(
+                global_thread_pool, self._load_data_url, url_spec, media_io
+            )
+            return await future
+
+        if url_spec.scheme == "file":
+            future = loop.run_in_executor(
+                global_thread_pool, self._load_file_url, url_spec, media_io
+            )
+            return await future
+        msg = "The URL must be either a HTTP, data or file URL."
+        raise ValueError(msg)
+
+    def fetch_audio(
+        self,
+        audio_url: str,
+    ) -> tuple[np.ndarray, int | float]:
+        """
+        Load audio from a URL.
+        """
+        audio_io = AudioMediaIO(**self.media_io_kwargs.get("audio", {}))
+
+        return self.load_from_url(
+            audio_url,
+            audio_io,
+            fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
+        )
+
+    async def fetch_audio_async(
+        self,
+        audio_url: str,
+    ) -> tuple[np.ndarray, int | float]:
+        """
+        Asynchronously fetch audio from a URL.
+        """
+        audio_io = AudioMediaIO(**self.media_io_kwargs.get("audio", {}))
+
+        return await self.load_from_url_async(
+            audio_url,
+            audio_io,
+            fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
+        )
+
+    def fetch_image(
+        self,
+        image_url: str,
+        *,
+        image_mode: str = "RGB",
+    ) -> Image.Image:
+        """
+        Load a PIL image from an HTTP or base64 data URL.
+
+        By default, the image is converted into RGB format.
+        """
+        image_io = ImageMediaIO(
+            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
+        )
+
+        try:
+            return self.load_from_url(
+                image_url,
+                image_io,
+                fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
+            )
+        except UnidentifiedImageError as e:
+            # convert to ValueError to be properly caught upstream
+            raise ValueError(str(e)) from e
+
+    async def fetch_image_async(
+        self,
+        image_url: str,
+        *,
+        image_mode: str = "RGB",
+    ) -> Image.Image:
+        """
+        Asynchronously load a PIL image from an HTTP or base64 data URL.
+
+        By default, the image is converted into RGB format.
+        """
+        image_io = ImageMediaIO(
+            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
+        )
+
+        try:
+            return await self.load_from_url_async(
+                image_url,
+                image_io,
+                fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
+            )
+        except UnidentifiedImageError as e:
+            # convert to ValueError to be properly caught upstream
+            raise ValueError(str(e)) from e
+
+    def fetch_video(
+        self,
+        video_url: str,
+        *,
+        image_mode: str = "RGB",
+    ) -> tuple[npt.NDArray, dict[str, Any]]:
+        """
+        Load video from an HTTP or base64 data URL.
+        """
+        image_io = ImageMediaIO(
+            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
+        )
+        video_io = VideoMediaIO(image_io, **self.media_io_kwargs.get("video", {}))
+
+        return self.load_from_url(
+            video_url,
+            video_io,
+            fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
+        )
+
+    async def fetch_video_async(
+        self,
+        video_url: str,
+        *,
+        image_mode: str = "RGB",
+    ) -> tuple[npt.NDArray, dict[str, Any]]:
+        """
+        Asynchronously load video from an HTTP or base64 data URL.
+
+        By default, the image is converted into RGB format.
+        """
+        image_io = ImageMediaIO(
+            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
+        )
+        video_io = VideoMediaIO(image_io, **self.media_io_kwargs.get("video", {}))
+
+        return await self.load_from_url_async(
+            video_url,
+            video_io,
+            fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
+        )
+
+    def fetch_image_embedding(
+        self,
+        data: str,
+    ) -> torch.Tensor:
+        """
+        Load image embedding from a URL.
+        """
+        image_embedding_io = ImageEmbeddingMediaIO()
+
+        return image_embedding_io.load_base64("", data)
+
+    def fetch_audio_embedding(
+        self,
+        data: str,
+    ) -> torch.Tensor:
+        """
+        Load audio embedding from a URL.
+        """
+        audio_embedding_io = AudioEmbeddingMediaIO()
+
+        return audio_embedding_io.load_base64("", data)
diff --git a/vllm/multimodal/utils.py b/vllm/multimodal/utils.py
index 0a38a6e2a..2f8c343ca 100644
--- a/vllm/multimodal/utils.py
+++ b/vllm/multimodal/utils.py
@@ -1,367 +1,50 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 
-import asyncio
-import atexit
 import mimetypes
+import warnings
 from collections.abc import Generator
-from concurrent.futures import ThreadPoolExecutor
 from itertools import groupby
-from pathlib import Path
-from typing import TYPE_CHECKING, Any, TypeVar
-from urllib.request import url2pathname
+from typing import TYPE_CHECKING, Any
 
 import numpy as np
 import numpy.typing as npt
-import torch
-from PIL import Image, UnidentifiedImageError
-from urllib3.util import Url, parse_url
+from PIL import Image
 
-import vllm.envs as envs
-from vllm.connections import HTTPConnection, global_http_connection
 from vllm.logger import init_logger
-from vllm.utils.registry import ExtensionManager
+from vllm.utils.import_utils import LazyLoader
 
-from .media import (
-    AudioEmbeddingMediaIO,
-    AudioMediaIO,
-    ImageEmbeddingMediaIO,
-    ImageMediaIO,
-    MediaIO,
-    VideoMediaIO,
+from .inputs import (
+    BatchedTensorInputs,
+    MultiModalKwargsItem,
+    MultiModalKwargsItems,
+    MultiModalPlaceholderDict,
 )
+from .media import AudioMediaIO, ImageMediaIO, MediaConnector, VideoMediaIO
 
 if TYPE_CHECKING:
-    from .inputs import (
-        BatchedTensorInputs,
-        MultiModalKwargsItem,
-        MultiModalPlaceholderDict,
-    )
+    import torch.types
 else:
-    BatchedTensorInputs = Any
-    MultiModalKwargsItem = Any
-    MultiModalPlaceholderDict = Any
+    torch = LazyLoader("torch", globals(), "torch")
 
 logger = init_logger(__name__)
 
-global_thread_pool = ThreadPoolExecutor(
-    max_workers=envs.VLLM_MEDIA_LOADING_THREAD_COUNT
-)
-atexit.register(global_thread_pool.shutdown)
 
-_M = TypeVar("_M")
+def __getattr__(name: str):
+    if name == "MEDIA_CONNECTOR_REGISTRY":
+        from .media import MEDIA_CONNECTOR_REGISTRY
 
-MEDIA_CONNECTOR_REGISTRY = ExtensionManager()
-
-
-@MEDIA_CONNECTOR_REGISTRY.register("http")
-class MediaConnector:
-    def __init__(
-        self,
-        media_io_kwargs: dict[str, dict[str, Any]] | None = None,
-        connection: HTTPConnection = global_http_connection,
-        *,
-        allowed_local_media_path: str = "",
-        allowed_media_domains: list[str] | None = None,
-    ) -> None:
-        """
-        Args:
-            media_io_kwargs: Additional args passed to process media
-                             inputs, keyed by modalities. For example,
-                             to set num_frames for video, set
-                             `--media-io-kwargs '{"video":{"num_frames":40}}'`
-            connection: HTTP connection client to download media contents.
-            allowed_local_media_path: A local directory to load media files from.
-            allowed_media_domains: If set, only media URLs that belong to this
-                                   domain can be used for multi-modal inputs.
-        """
-        super().__init__()
-
-        self.media_io_kwargs: dict[str, dict[str, Any]] = (
-            media_io_kwargs if media_io_kwargs else {}
-        )
-        self.connection = connection
-
-        if allowed_local_media_path:
-            allowed_local_media_path_ = Path(allowed_local_media_path)
-
-            if not allowed_local_media_path_.exists():
-                raise ValueError(
-                    "Invalid `--allowed-local-media-path`: The path "
-                    f"{allowed_local_media_path_} does not exist."
-                )
-            if not allowed_local_media_path_.is_dir():
-                raise ValueError(
-                    "Invalid `--allowed-local-media-path`: The path "
-                    f"{allowed_local_media_path_} must be a directory."
-                )
-        else:
-            allowed_local_media_path_ = None
-
-        self.allowed_local_media_path = allowed_local_media_path_
-        if allowed_media_domains is None:
-            allowed_media_domains = []
-        self.allowed_media_domains = allowed_media_domains
-
-    def _load_data_url(
-        self,
-        url_spec: Url,
-        media_io: MediaIO[_M],
-    ) -> _M:  # type: ignore[type-var]
-        url_spec_path = url_spec.path or ""
-        data_spec, data = url_spec_path.split(",", 1)
-        media_type, data_type = data_spec.split(";", 1)
-        # media_type starts with a leading "/" (e.g., "/video/jpeg")
-        media_type = media_type.lstrip("/")
-
-        if data_type != "base64":
-            msg = "Only base64 data URLs are supported for now."
-            raise NotImplementedError(msg)
-
-        return media_io.load_base64(media_type, data)
-
-    def _load_file_url(
-        self,
-        url_spec: Url,
-        media_io: MediaIO[_M],
-    ) -> _M:  # type: ignore[type-var]
-        allowed_local_media_path = self.allowed_local_media_path
-        if allowed_local_media_path is None:
-            raise RuntimeError(
-                "Cannot load local files without `--allowed-local-media-path`."
-            )
-
-        url_spec_path = url_spec.path or ""
-        url_spec_netloc = url_spec.netloc or ""
-        filepath = Path(url2pathname(url_spec_netloc + url_spec_path))
-        if allowed_local_media_path not in filepath.resolve().parents:
-            raise ValueError(
-                f"The file path {filepath} must be a subpath "
-                f"of `--allowed-local-media-path {allowed_local_media_path}`."
-            )
-
-        return media_io.load_file(filepath)
-
-    def _assert_url_in_allowed_media_domains(self, url_spec: Url) -> None:
-        if (
-            self.allowed_media_domains
-            and url_spec.hostname not in self.allowed_media_domains
-        ):
-            raise ValueError(
-                f"The URL must be from one of the allowed domains: "
-                f"{self.allowed_media_domains}. Input URL domain: "
-                f"{url_spec.hostname}"
-            )
-
-    def load_from_url(
-        self,
-        url: str,
-        media_io: MediaIO[_M],
-        *,
-        fetch_timeout: int | None = None,
-    ) -> _M:  # type: ignore[type-var]
-        url_spec = parse_url(url)
-
-        if url_spec.scheme and url_spec.scheme.startswith("http"):
-            self._assert_url_in_allowed_media_domains(url_spec)
-
-            connection = self.connection
-            data = connection.get_bytes(
-                url,
-                timeout=fetch_timeout,
-                allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
-            )
-
-            return media_io.load_bytes(data)
-
-        if url_spec.scheme == "data":
-            return self._load_data_url(url_spec, media_io)
-
-        if url_spec.scheme == "file":
-            return self._load_file_url(url_spec, media_io)
-
-        msg = "The URL must be either a HTTP, data or file URL."
-        raise ValueError(msg)
-
-    async def load_from_url_async(
-        self,
-        url: str,
-        media_io: MediaIO[_M],
-        *,
-        fetch_timeout: int | None = None,
-    ) -> _M:
-        url_spec = parse_url(url)
-        loop = asyncio.get_running_loop()
-
-        if url_spec.scheme and url_spec.scheme.startswith("http"):
-            self._assert_url_in_allowed_media_domains(url_spec)
-
-            connection = self.connection
-            data = await connection.async_get_bytes(
-                url,
-                timeout=fetch_timeout,
-                allow_redirects=envs.VLLM_MEDIA_URL_ALLOW_REDIRECTS,
-            )
-            future = loop.run_in_executor(global_thread_pool, media_io.load_bytes, data)
-            return await future
-
-        if url_spec.scheme == "data":
-            future = loop.run_in_executor(
-                global_thread_pool, self._load_data_url, url_spec, media_io
-            )
-            return await future
-
-        if url_spec.scheme == "file":
-            future = loop.run_in_executor(
-                global_thread_pool, self._load_file_url, url_spec, media_io
-            )
-            return await future
-        msg = "The URL must be either a HTTP, data or file URL."
-        raise ValueError(msg)
-
-    def fetch_audio(
-        self,
-        audio_url: str,
-    ) -> tuple[np.ndarray, int | float]:
-        """
-        Load audio from a URL.
-        """
-        audio_io = AudioMediaIO(**self.media_io_kwargs.get("audio", {}))
-
-        return self.load_from_url(
-            audio_url,
-            audio_io,
-            fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
+        warnings.warn(
+            "`vllm.multimodal.utils.MEDIA_CONNECTOR_REGISTRY` "
+            "has been moved to `vllm.multimodal.media.MEDIA_CONNECTOR_REGISTRY`. "
+            "The old name will be removed in v0.17.",
+            DeprecationWarning,
+            stacklevel=2,
         )
 
-    async def fetch_audio_async(
-        self,
-        audio_url: str,
-    ) -> tuple[np.ndarray, int | float]:
-        """
-        Asynchronously fetch audio from a URL.
-        """
-        audio_io = AudioMediaIO(**self.media_io_kwargs.get("audio", {}))
+        return MEDIA_CONNECTOR_REGISTRY
 
-        return await self.load_from_url_async(
-            audio_url,
-            audio_io,
-            fetch_timeout=envs.VLLM_AUDIO_FETCH_TIMEOUT,
-        )
-
-    def fetch_image(
-        self,
-        image_url: str,
-        *,
-        image_mode: str = "RGB",
-    ) -> Image.Image:
-        """
-        Load a PIL image from an HTTP or base64 data URL.
-
-        By default, the image is converted into RGB format.
-        """
-        image_io = ImageMediaIO(
-            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
-        )
-
-        try:
-            return self.load_from_url(
-                image_url,
-                image_io,
-                fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
-            )
-        except UnidentifiedImageError as e:
-            # convert to ValueError to be properly caught upstream
-            raise ValueError(str(e)) from e
-
-    async def fetch_image_async(
-        self,
-        image_url: str,
-        *,
-        image_mode: str = "RGB",
-    ) -> Image.Image:
-        """
-        Asynchronously load a PIL image from an HTTP or base64 data URL.
-
-        By default, the image is converted into RGB format.
-        """
-        image_io = ImageMediaIO(
-            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
-        )
-
-        try:
-            return await self.load_from_url_async(
-                image_url,
-                image_io,
-                fetch_timeout=envs.VLLM_IMAGE_FETCH_TIMEOUT,
-            )
-        except UnidentifiedImageError as e:
-            # convert to ValueError to be properly caught upstream
-            raise ValueError(str(e)) from e
-
-    def fetch_video(
-        self,
-        video_url: str,
-        *,
-        image_mode: str = "RGB",
-    ) -> tuple[npt.NDArray, dict[str, Any]]:
-        """
-        Load video from an HTTP or base64 data URL.
-        """
-        image_io = ImageMediaIO(
-            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
-        )
-        video_io = VideoMediaIO(image_io, **self.media_io_kwargs.get("video", {}))
-
-        return self.load_from_url(
-            video_url,
-            video_io,
-            fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
-        )
-
-    async def fetch_video_async(
-        self,
-        video_url: str,
-        *,
-        image_mode: str = "RGB",
-    ) -> tuple[npt.NDArray, dict[str, Any]]:
-        """
-        Asynchronously load video from an HTTP or base64 data URL.
-
-        By default, the image is converted into RGB format.
-        """
-        image_io = ImageMediaIO(
-            image_mode=image_mode, **self.media_io_kwargs.get("image", {})
-        )
-        video_io = VideoMediaIO(image_io, **self.media_io_kwargs.get("video", {}))
-
-        return await self.load_from_url_async(
-            video_url,
-            video_io,
-            fetch_timeout=envs.VLLM_VIDEO_FETCH_TIMEOUT,
-        )
-
-    def fetch_image_embedding(
-        self,
-        data: str,
-    ) -> torch.Tensor:
-        """
-        Load image embedding from a URL.
-        """
-        image_embedding_io = ImageEmbeddingMediaIO()
-
-        return image_embedding_io.load_base64("", data)
-
-    def fetch_audio_embedding(
-        self,
-        data: str,
-    ) -> torch.Tensor:
-        """
-        Load audio embedding from a URL.
-        """
-        audio_embedding_io = AudioEmbeddingMediaIO()
-
-        return audio_embedding_io.load_base64("", data)
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")
 
 
 def encode_audio_base64(
@@ -483,8 +166,6 @@ def group_mm_kwargs_by_modality(
     Yields:
         A tuple `(modality, num_items, grouped_kwargs)`.
     """
-    from vllm.multimodal.inputs import MultiModalKwargsItems
-
     for modality, group in groupby(mm_kwargs, key=lambda x: x[0]):
         items_lst = [item for _, item in group]
         mm_kwargs_items = MultiModalKwargsItems({modality: items_lst})
diff --git a/vllm/transformers_utils/tokenizer.py b/vllm/transformers_utils/tokenizer.py
index 212f1dccc..36e357a83 100644
--- a/vllm/transformers_utils/tokenizer.py
+++ b/vllm/transformers_utils/tokenizer.py
@@ -17,3 +17,5 @@ def __getattr__(name: str):
         )
 
         return get_tokenizer
+
+    raise AttributeError(f"module {__name__!r} has no attribute {name!r}")