diff --git a/.buildkite/test-amd.yaml b/.buildkite/test-amd.yaml
index ca3bebcb0..9c3e84af9 100644
--- a/.buildkite/test-amd.yaml
+++ b/.buildkite/test-amd.yaml
@@ -863,10 +863,11 @@ steps:
   torch_nightly: true
   source_file_dependencies:
     - vllm/
+    - tests/models/test_terratorch.py
    - tests/models/test_transformers.py
    - tests/models/test_registry.py
   commands:
-    - pytest -v -s models/test_transformers.py models/test_registry.py
+    - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
 
 - label: Basic Models Test (Other CPU) # 5min
   mirror_hardwares: [amdexperimental, amdproduction]
diff --git a/.buildkite/test-pipeline.yaml b/.buildkite/test-pipeline.yaml
index b03e4b6d8..e3146948b 100644
--- a/.buildkite/test-pipeline.yaml
+++ b/.buildkite/test-pipeline.yaml
@@ -804,10 +804,11 @@ steps:
   torch_nightly: true
   source_file_dependencies:
     - vllm/
+    - tests/models/test_terratorch.py
    - tests/models/test_transformers.py
    - tests/models/test_registry.py
   commands:
-    - pytest -v -s models/test_transformers.py models/test_registry.py
+    - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
 
 - label: Basic Models Test (Other CPU) # 5min
   timeout_in_minutes: 10
diff --git a/.buildkite/test_areas/models_basic.yaml b/.buildkite/test_areas/models_basic.yaml
index aa6161ffa..df0a98dc9 100644
--- a/.buildkite/test_areas/models_basic.yaml
+++ b/.buildkite/test_areas/models_basic.yaml
@@ -33,10 +33,11 @@ steps:
   timeout_in_minutes: 45
   source_file_dependencies:
     - vllm/
+    - tests/models/test_terratorch.py
    - tests/models/test_transformers.py
    - tests/models/test_registry.py
   commands:
-    - pytest -v -s models/test_transformers.py models/test_registry.py
+    - pytest -v -s models/test_terratorch.py models/test_transformers.py models/test_registry.py
 
 - label: Basic Models Test (Other CPU) # 5min
   depends_on:
diff --git a/tests/models/test_terratorch.py b/tests/models/test_terratorch.py
index 24b624e26..5de154fa3 100644
--- a/tests/models/test_terratorch.py
+++ b/tests/models/test_terratorch.py
@@ -5,8 +5,10 @@ import pytest
 import torch
 
 from tests.conftest import VllmRunner
+from tests.utils import create_new_process_for_each_test
 
 
+@create_new_process_for_each_test()  # Memory is not cleaned up properly otherwise
 @pytest.mark.parametrize(
     "model",
     [
@@ -22,10 +24,14 @@ def test_inference(
     location_coords = torch.full((1, 2), 1.0, dtype=torch.float16)
     prompt = dict(
         prompt_token_ids=[1],
-        multi_modal_data=dict(
-            pixel_values=pixel_values, location_coords=location_coords
-        ),
+        multi_modal_data={
+            "image": {
+                "pixel_values": pixel_values,
+                "location_coords": location_coords,
+            }
+        },
     )
+
     with vllm_runner(
         model,
         runner="pooling",
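For reference, the hunk above nests the model-specific tensors under an explicit `"image"` modality key instead of passing them at the top level of `multi_modal_data`. A minimal sketch of the resulting payload (the tensor shapes here are illustrative, not taken from the test):

```python
import torch

pixel_values = torch.zeros((1, 6, 512, 512), dtype=torch.float16)  # illustrative shape
location_coords = torch.zeros((1, 2), dtype=torch.float16)

prompt = dict(
    prompt_token_ids=[1],
    multi_modal_data={
        "image": {
            "pixel_values": pixel_values,
            "location_coords": location_coords,
        }
    },
)
```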
"2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg" + "Grayscale_8bits_palette_sample_image.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png", + "1280px-Venn_diagram_rgb.svg.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png", + "RGBA_comp.png", # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png", +] + +TEST_VIDEO_URLS = [ + "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4", + "https://github.com/opencv/opencv/raw/refs/tags/4.12.0/samples/data/vtest.avi", +] + + +@pytest.fixture(scope="module") +def url_images(local_asset_server) -> dict[str, Image.Image]: + return { + image_url: local_asset_server.get_image_asset(image_url) + for image_url in TEST_IMAGE_ASSETS + } + + +def get_supported_suffixes() -> tuple[str, ...]: + # We should at least test the file types mentioned in GPT-4 with Vision + OPENAI_SUPPORTED_SUFFIXES = (".png", ".jpeg", ".jpg", ".webp", ".gif") + + # Additional file types that are supported by us + EXTRA_SUPPORTED_SUFFIXES = (".bmp", ".tiff") + + return OPENAI_SUPPORTED_SUFFIXES + EXTRA_SUPPORTED_SUFFIXES + + +def _image_equals(a: Image.Image, b: Image.Image) -> bool: + return (np.asarray(a) == np.asarray(convert_image_mode(b, a.mode))).all() + + +@pytest.mark.asyncio +@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True) +async def test_fetch_image_http(image_url: str): + connector = MediaConnector() + + image_sync = connector.fetch_image(image_url) + image_async = await connector.fetch_image_async(image_url) + assert _image_equals(image_sync, image_async) + + +@pytest.mark.asyncio +@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS) +@pytest.mark.parametrize("suffix", get_supported_suffixes()) +async def test_fetch_image_base64( + url_images: dict[str, Image.Image], raw_image_url: str, suffix: str +): + connector = MediaConnector( + # Domain restriction should not apply to data URLs. 
+        allowed_media_domains=[
+            "www.bogotobogo.com",
+            "github.com",
+        ]
+    )
+    url_image = url_images[raw_image_url]
+
+    try:
+        mime_type = Image.MIME[Image.registered_extensions()[suffix]]
+    except KeyError:
+        try:
+            mime_type = mimetypes.types_map[suffix]
+        except KeyError:
+            pytest.skip("No MIME type")
+
+    with NamedTemporaryFile(suffix=suffix) as f:
+        try:
+            url_image.save(f.name)
+        except Exception as e:
+            if e.args[0] == "cannot write mode RGBA as JPEG":
+                pytest.skip("Conversion not supported")
+
+            raise
+
+        base64_image = base64.b64encode(f.read()).decode("utf-8")
+        data_url = f"data:{mime_type};base64,{base64_image}"
+
+        data_image_sync = connector.fetch_image(data_url)
+        if _image_equals(url_image, Image.open(f)):
+            assert _image_equals(url_image, data_image_sync)
+        else:
+            pass  # Lossy format; only check that image can be opened
+
+        data_image_async = await connector.fetch_image_async(data_url)
+        assert _image_equals(data_image_sync, data_image_async)
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
+async def test_fetch_image_local_files(image_url: str):
+    connector = MediaConnector()
+
+    with TemporaryDirectory() as temp_dir:
+        local_connector = MediaConnector(allowed_local_media_path=temp_dir)
+
+        origin_image = connector.fetch_image(image_url)
+        origin_image.save(
+            os.path.join(temp_dir, os.path.basename(image_url)),
+            quality=100,
+            icc_profile=origin_image.info.get("icc_profile"),
+        )
+
+        image_async = await local_connector.fetch_image_async(
+            f"file://{temp_dir}/{os.path.basename(image_url)}"
+        )
+        image_sync = local_connector.fetch_image(
+            f"file://{temp_dir}/{os.path.basename(image_url)}"
+        )
+        # Check that the images are equal
+        assert not ImageChops.difference(image_sync, image_async).getbbox()
+
+        with pytest.raises(ValueError, match="must be a subpath"):
+            await local_connector.fetch_image_async(
+                f"file://{temp_dir}/../{os.path.basename(image_url)}"
+            )
+        with pytest.raises(RuntimeError, match="Cannot load local files"):
+            await connector.fetch_image_async(
+                f"file://{temp_dir}/../{os.path.basename(image_url)}"
+            )
+
+        with pytest.raises(ValueError, match="must be a subpath"):
+            local_connector.fetch_image(
+                f"file://{temp_dir}/../{os.path.basename(image_url)}"
+            )
+        with pytest.raises(RuntimeError, match="Cannot load local files"):
+            connector.fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}")
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("image_url", [TEST_IMAGE_ASSETS[0]], indirect=True)
+async def test_fetch_image_local_files_with_space_in_name(image_url: str):
+    connector = MediaConnector()
+
+    with TemporaryDirectory() as temp_dir:
+        local_connector = MediaConnector(allowed_local_media_path=temp_dir)
+
+        origin_image = connector.fetch_image(image_url)
+        filename = "file name with space.jpg"
+        origin_image.save(
+            os.path.join(temp_dir, filename),
+            quality=100,
+            icc_profile=origin_image.info.get("icc_profile"),
+        )
+
+        try:
+            image_async = await local_connector.fetch_image_async(
+                f"file://{temp_dir}/{filename}"
+            )
+            image_sync = local_connector.fetch_image(f"file://{temp_dir}/{filename}")
+        except FileNotFoundError as e:
+            pytest.fail("Failed to fetch image with space in name: {}".format(e))
+        # Check that the images are equal
+        assert not ImageChops.difference(image_sync, image_async).getbbox()
+
+
+@pytest.mark.asyncio
+async def test_fetch_image_error_conversion():
+    connector = MediaConnector()
+    broken_img = "data:image/png;base64,aGVsbG9fdmxsbV9jb21tdW5pdHkK"
+
+    # PIL.UnidentifiedImageError should be converted to ValueError
+    with pytest.raises(ValueError):
+        await connector.fetch_image_async(broken_img)
+
+    with pytest.raises(ValueError):
+        connector.fetch_image(broken_img)
+
+
+@pytest.mark.flaky(reruns=3, reruns_delay=5)
+@pytest.mark.asyncio
+@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
+@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
+async def test_fetch_video_http(video_url: str, num_frames: int):
+    connector = MediaConnector(
+        media_io_kwargs={
+            "video": {
+                "num_frames": num_frames,
+            }
+        }
+    )
+
+    try:
+        video_sync, metadata_sync = connector.fetch_video(video_url)
+        video_async, metadata_async = await connector.fetch_video_async(video_url)
+    except (TimeoutError, asyncio.TimeoutError) as e:
+        pytest.skip(f"Timeout fetching video (CI network flakiness): {e}")
+
+    assert np.array_equal(video_sync, video_async)
+    assert metadata_sync == metadata_async
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
+@pytest.mark.parametrize("max_duration", [1, 60, 1800])
+@pytest.mark.parametrize("requested_fps", [2, 24])
+async def test_fetch_video_http_with_dynamic_loader(
+    video_url: str,
+    max_duration: int,
+    requested_fps: int,
+    monkeypatch: pytest.MonkeyPatch,
+):
+    with monkeypatch.context() as m:
+        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")
+        connector = MediaConnector(
+            media_io_kwargs={
+                "video": {
+                    "max_duration": max_duration,
+                    "requested_fps": requested_fps,
+                }
+            }
+        )
+
+        video_sync, metadata_sync = connector.fetch_video(video_url)
+        video_async, metadata_async = await connector.fetch_video_async(video_url)
+
+        assert np.array_equal(video_sync, video_async)
+        assert metadata_sync == metadata_async
+        assert metadata_sync["video_backend"] == "opencv_dynamic"
+
+
+@pytest.mark.parametrize(
+    "is_embed,start_idx,end_idx,expected",
+    [
+        (None, 2, 4, (2, 4)),
+        (
+            torch.tensor([False, True, False, True, True]),
+            3,
+            5,
+            (1, 3),
+        ),
+        (
+            torch.tensor([False, True, False, True, True]),
+            0,
+            2,
+            (0, 1),
+        ),
+        (
+            torch.tensor([True, False, True, False]),
+            2,
+            2,
+            (1, 1),
+        ),
+    ],
+)
+def test_placeholder_range_get_embeds_indices_in_range(
+    is_embed, start_idx, end_idx, expected
+):
+    length = len(is_embed) if is_embed is not None else 5
+    pr = PlaceholderRange(offset=0, length=length, is_embed=is_embed)
+    assert pr.get_embeds_indices_in_range(start_idx, end_idx) == expected
+
+
+@pytest.mark.parametrize(
+    "offset,is_embed,expected",
+    [
+        (0, None, [(0, 4)]),
+        (
+            2,
+            torch.tensor([False, True, False, True, True]),
+            [(3, 3), (5, 6)],
+        ),
+        (0, torch.tensor([True, True, True, True]), [(0, 3)]),
+        (0, torch.tensor([False, False, False, False]), []),
+    ],
+)
+def test_placeholder_range_extract_embeds_range(offset, is_embed, expected):
+    length = len(is_embed) if is_embed is not None else 5
+    pr = PlaceholderRange(offset=offset, length=length, is_embed=is_embed)
+    assert pr.extract_embeds_range() == expected
+
+
+@pytest.mark.asyncio
+@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
+@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
+async def test_allowed_media_domains(video_url: str, num_frames: int):
+    connector = MediaConnector(
+        media_io_kwargs={
+            "video": {
+                "num_frames": num_frames,
+            }
+        },
+        allowed_media_domains=[
+            "www.bogotobogo.com",
+            "github.com",
+        ],
+    )
+
+    video_sync, metadata_sync = connector.fetch_video(video_url)
+    video_async, metadata_async = await connector.fetch_video_async(video_url)
+    assert np.array_equal(video_sync, video_async)
+    assert metadata_sync == metadata_async
+
+    disallowed_url = "https://upload.wikimedia.org/wikipedia/commons/4/47/PNG_transparency_demonstration_1.png"
+    with pytest.raises(ValueError):
+        _, _ = connector.fetch_video(disallowed_url)
+
+    with pytest.raises(ValueError):
+        _, _ = await connector.fetch_video_async(disallowed_url)
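The `get_embeds_indices_in_range` parametrizations above are dense, so here is one case worked out (my reading of the expected values, not an authoritative spec):

```python
import torch

from vllm.multimodal.inputs import PlaceholderRange

# is_embed marks which positions inside the placeholder carry real embeddings.
pr = PlaceholderRange(
    offset=0,
    length=5,
    is_embed=torch.tensor([False, True, False, True, True]),
)

# Embedded positions are 1, 3 and 4, i.e. embedding ordinals 0, 1 and 2.
# The token range [3, 5) covers positions 3 and 4 (ordinals 1 and 2),
# so the result is the half-open ordinal range (1, 3).
assert pr.get_embeds_indices_in_range(3, 5) == (1, 3)
```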
diff --git a/tests/multimodal/test_hasher.py b/tests/multimodal/test_hasher.py
index 29064f273..fdedcaea2 100644
--- a/tests/multimodal/test_hasher.py
+++ b/tests/multimodal/test_hasher.py
@@ -16,6 +16,22 @@ ASSETS_DIR = Path(__file__).parent / "assets"
 assert ASSETS_DIR.exists()
 
 
+def test_hash_single_item_different_shape():
+    x1 = torch.zeros(())
+    x2 = torch.zeros((1,))
+
+    hasher = MultiModalHasher
+    assert hasher.hash_kwargs(x=x1) != hasher.hash_kwargs(x=x2)
+
+
+def test_hash_key_order_invariant():
+    x = torch.zeros((5, 10))
+    y = torch.ones((5, 10))
+
+    hasher = MultiModalHasher
+    assert hasher.hash_kwargs(x=x, y=y) == hasher.hash_kwargs(y=y, x=x)
+
+
 # NOTE: Images that are the same visually are allowed to have the same hash
 @pytest.mark.parametrize("mode_pair", [("1", "L"), ("RGBA", "CMYK")])
 def test_hash_collision_image_mode(mode_pair):
diff --git a/tests/multimodal/test_inputs.py b/tests/multimodal/test_inputs.py
new file mode 100644
index 000000000..7378c1493
--- /dev/null
+++ b/tests/multimodal/test_inputs.py
@@ -0,0 +1,46 @@
+# SPDX-License-Identifier: Apache-2.0
+# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
+import pytest
+import torch
+
+from vllm.multimodal.inputs import PlaceholderRange
+
+
+@pytest.mark.parametrize(
+    "is_embed,expected",
+    [
+        (None, 5),
+        (torch.tensor([True, True, True, True, True]), 5),
+        (torch.tensor([False, False, False, False, False]), 0),
+        (torch.tensor([True, False, True, False, True]), 3),
+        (torch.tensor([True]), 1),
+    ],
+)
+def test_placeholder_range_get_num_embeds(is_embed, expected):
+    length = len(is_embed) if is_embed is not None else 5
+    pr = PlaceholderRange(offset=0, length=length, is_embed=is_embed)
+    assert pr.get_num_embeds == expected
+
+
+@pytest.mark.parametrize(
+    "is_embed,expected",
+    [
+        (None, None),
+        (
+            torch.tensor([False, True, False, True, True]),
+            torch.tensor([0, 1, 1, 2, 3]),
+        ),
+        (torch.tensor([True, True, True]), torch.tensor([1, 2, 3])),
+    ],
+)
+def test_placeholder_range_embeds_cumsum(is_embed, expected):
+    length = len(is_embed) if is_embed is not None else 5
+    pr = PlaceholderRange(offset=0, length=length, is_embed=is_embed)
+
+    if expected is None:
+        assert pr.embeds_cumsum is None
+        return
+
+    assert torch.equal(pr.embeds_cumsum, expected)
+    # cached_property should return the same object on repeated access
+    assert pr.embeds_cumsum is pr.embeds_cumsum
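A quick gloss on the `embeds_cumsum` expectations above: the property behaves like a plain inclusive cumulative sum over the boolean mask (inferred from the two test cases, not an authoritative spec):

```python
import torch

mask = torch.tensor([False, True, False, True, True])
# Position i holds the number of embedded tokens in mask[: i + 1].
assert torch.equal(torch.cumsum(mask, dim=0), torch.tensor([0, 1, 1, 2, 3]))

# An all-True mask therefore starts counting at 1, matching [1, 2, 3] above.
assert torch.equal(
    torch.cumsum(torch.tensor([True, True, True]), dim=0), torch.tensor([1, 2, 3])
)
```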
diff --git a/tests/multimodal/test_utils.py b/tests/multimodal/test_utils.py
index 0a10bc1bb..4e765ab1b 100644
--- a/tests/multimodal/test_utils.py
+++ b/tests/multimodal/test_utils.py
@@ -1,244 +1,16 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
-
-import asyncio
-import base64
-import mimetypes
-import os
-from tempfile import NamedTemporaryFile, TemporaryDirectory
-
-import numpy as np
 import pytest
 import torch
-from PIL import Image, ImageChops
 
-from vllm.multimodal.image import convert_image_mode
-from vllm.multimodal.inputs import PlaceholderRange
-from vllm.multimodal.media import MediaConnector
-from vllm.multimodal.utils import argsort_mm_positions
-
-# Test different image extensions (JPG/PNG) and formats (gray/RGB/RGBA)
-TEST_IMAGE_ASSETS = [
-    "2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg",  # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/2560px-Gfp-wisconsin-madison-the-nature-boardwalk.jpg"
-    "Grayscale_8bits_palette_sample_image.png",  # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/Grayscale_8bits_palette_sample_image.png",
-    "1280px-Venn_diagram_rgb.svg.png",  # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/1280px-Venn_diagram_rgb.svg.png",
-    "RGBA_comp.png",  # "https://vllm-public-assets.s3.us-west-2.amazonaws.com/vision_model_images/RGBA_comp.png",
-]
-
-TEST_VIDEO_URLS = [
-    "https://www.bogotobogo.com/python/OpenCV_Python/images/mean_shift_tracking/slow_traffic_small.mp4",
-    "https://github.com/opencv/opencv/raw/refs/tags/4.12.0/samples/data/vtest.avi",
-]
-
-
-@pytest.fixture(scope="module")
-def url_images(local_asset_server) -> dict[str, Image.Image]:
-    return {
-        image_url: local_asset_server.get_image_asset(image_url)
-        for image_url in TEST_IMAGE_ASSETS
-    }
-
-
-def get_supported_suffixes() -> tuple[str, ...]:
-    # We should at least test the file types mentioned in GPT-4 with Vision
-    OPENAI_SUPPORTED_SUFFIXES = (".png", ".jpeg", ".jpg", ".webp", ".gif")
-
-    # Additional file types that are supported by us
-    EXTRA_SUPPORTED_SUFFIXES = (".bmp", ".tiff")
-
-    return OPENAI_SUPPORTED_SUFFIXES + EXTRA_SUPPORTED_SUFFIXES
-
-
-def _image_equals(a: Image.Image, b: Image.Image) -> bool:
-    return (np.asarray(a) == np.asarray(convert_image_mode(b, a.mode))).all()
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
-async def test_fetch_image_http(image_url: str):
-    connector = MediaConnector()
-
-    image_sync = connector.fetch_image(image_url)
-    image_async = await connector.fetch_image_async(image_url)
-    assert _image_equals(image_sync, image_async)
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("raw_image_url", TEST_IMAGE_ASSETS)
-@pytest.mark.parametrize("suffix", get_supported_suffixes())
-async def test_fetch_image_base64(
-    url_images: dict[str, Image.Image], raw_image_url: str, suffix: str
-):
-    connector = MediaConnector(
-        # Domain restriction should not apply to data URLs.
-        allowed_media_domains=[
-            "www.bogotobogo.com",
-            "github.com",
-        ]
-    )
-    url_image = url_images[raw_image_url]
-
-    try:
-        mime_type = Image.MIME[Image.registered_extensions()[suffix]]
-    except KeyError:
-        try:
-            mime_type = mimetypes.types_map[suffix]
-        except KeyError:
-            pytest.skip("No MIME type")
-
-    with NamedTemporaryFile(suffix=suffix) as f:
-        try:
-            url_image.save(f.name)
-        except Exception as e:
-            if e.args[0] == "cannot write mode RGBA as JPEG":
-                pytest.skip("Conversion not supported")
-
-            raise
-
-        base64_image = base64.b64encode(f.read()).decode("utf-8")
-        data_url = f"data:{mime_type};base64,{base64_image}"
-
-        data_image_sync = connector.fetch_image(data_url)
-        if _image_equals(url_image, Image.open(f)):
-            assert _image_equals(url_image, data_image_sync)
-        else:
-            pass  # Lossy format; only check that image can be opened
-
-        data_image_async = await connector.fetch_image_async(data_url)
-        assert _image_equals(data_image_sync, data_image_async)
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("image_url", TEST_IMAGE_ASSETS, indirect=True)
-async def test_fetch_image_local_files(image_url: str):
-    connector = MediaConnector()
-
-    with TemporaryDirectory() as temp_dir:
-        local_connector = MediaConnector(allowed_local_media_path=temp_dir)
-
-        origin_image = connector.fetch_image(image_url)
-        origin_image.save(
-            os.path.join(temp_dir, os.path.basename(image_url)),
-            quality=100,
-            icc_profile=origin_image.info.get("icc_profile"),
-        )
-
-        image_async = await local_connector.fetch_image_async(
-            f"file://{temp_dir}/{os.path.basename(image_url)}"
-        )
-        image_sync = local_connector.fetch_image(
-            f"file://{temp_dir}/{os.path.basename(image_url)}"
-        )
-        # Check that the images are equal
-        assert not ImageChops.difference(image_sync, image_async).getbbox()
-
-        with pytest.raises(ValueError, match="must be a subpath"):
-            await local_connector.fetch_image_async(
-                f"file://{temp_dir}/../{os.path.basename(image_url)}"
-            )
-        with pytest.raises(RuntimeError, match="Cannot load local files"):
-            await connector.fetch_image_async(
-                f"file://{temp_dir}/../{os.path.basename(image_url)}"
-            )
-
-        with pytest.raises(ValueError, match="must be a subpath"):
-            local_connector.fetch_image(
-                f"file://{temp_dir}/../{os.path.basename(image_url)}"
-            )
-        with pytest.raises(RuntimeError, match="Cannot load local files"):
-            connector.fetch_image(f"file://{temp_dir}/../{os.path.basename(image_url)}")
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("image_url", [TEST_IMAGE_ASSETS[0]], indirect=True)
-async def test_fetch_image_local_files_with_space_in_name(image_url: str):
-    connector = MediaConnector()
-
-    with TemporaryDirectory() as temp_dir:
-        local_connector = MediaConnector(allowed_local_media_path=temp_dir)
-
-        origin_image = connector.fetch_image(image_url)
-        filename = "file name with space.jpg"
-        origin_image.save(
-            os.path.join(temp_dir, filename),
-            quality=100,
-            icc_profile=origin_image.info.get("icc_profile"),
-        )
-
-        try:
-            image_async = await local_connector.fetch_image_async(
-                f"file://{temp_dir}/{filename}"
-            )
-            image_sync = local_connector.fetch_image(f"file://{temp_dir}/{filename}")
-        except FileNotFoundError as e:
-            pytest.fail("Failed to fetch image with space in name: {}".format(e))
-        # Check that the images are equal
-        assert not ImageChops.difference(image_sync, image_async).getbbox()
-
-
-@pytest.mark.asyncio
-async def test_fetch_image_error_conversion():
-    connector = MediaConnector()
-    broken_img = "data:image/png;base64,aGVsbG9fdmxsbV9jb21tdW5pdHkK"
-
-    # PIL.UnidentifiedImageError should be converted to ValueError
-    with pytest.raises(ValueError):
-        await connector.fetch_image_async(broken_img)
-
-    with pytest.raises(ValueError):
-        connector.fetch_image(broken_img)
-
-
-@pytest.mark.flaky(reruns=3, reruns_delay=5)
-@pytest.mark.asyncio
-@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
-@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
-async def test_fetch_video_http(video_url: str, num_frames: int):
-    connector = MediaConnector(
-        media_io_kwargs={
-            "video": {
-                "num_frames": num_frames,
-            }
-        }
-    )
-
-    try:
-        video_sync, metadata_sync = connector.fetch_video(video_url)
-        video_async, metadata_async = await connector.fetch_video_async(video_url)
-    except (TimeoutError, asyncio.TimeoutError) as e:
-        pytest.skip(f"Timeout fetching video (CI network flakiness): {e}")
-
-    assert np.array_equal(video_sync, video_async)
-    assert metadata_sync == metadata_async
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
-@pytest.mark.parametrize("max_duration", [1, 60, 1800])
-@pytest.mark.parametrize("requested_fps", [2, 24])
-async def test_fetch_video_http_with_dynamic_loader(
-    video_url: str,
-    max_duration: int,
-    requested_fps: int,
-    monkeypatch: pytest.MonkeyPatch,
-):
-    with monkeypatch.context() as m:
-        m.setenv("VLLM_VIDEO_LOADER_BACKEND", "opencv_dynamic")
-        connector = MediaConnector(
-            media_io_kwargs={
-                "video": {
-                    "max_duration": max_duration,
-                    "requested_fps": requested_fps,
-                }
-            }
-        )
-
-        video_sync, metadata_sync = connector.fetch_video(video_url)
-        video_async, metadata_async = await connector.fetch_video_async(video_url)
-
-        assert np.array_equal(video_sync, video_async)
-        assert metadata_sync == metadata_async
-        assert metadata_sync["video_backend"] == "opencv_dynamic"
+from vllm.multimodal.inputs import (
+    MultiModalBatchedField,
+    MultiModalFieldElem,
+    MultiModalKwargsItem,
+    MultiModalSharedField,
+    PlaceholderRange,
+)
+from vllm.multimodal.utils import argsort_mm_positions, group_and_batch_mm_items
 
 
@@ -412,121 +184,35 @@ def test_argsort_mm_positions(case):
     assert modality_idxs == expected_modality_idxs
 
 
-@pytest.mark.parametrize(
-    "is_embed,expected",
-    [
-        (None, 5),
-        (torch.tensor([True, True, True, True, True]), 5),
-        (torch.tensor([False, False, False, False, False]), 0),
-        (torch.tensor([True, False, True, False, True]), 3),
-        (torch.tensor([True]), 1),
-    ],
-)
-def test_placeholder_range_get_num_embeds(is_embed, expected):
-    length = len(is_embed) if is_embed is not None else 5
-    pr = PlaceholderRange(offset=0, length=length, is_embed=is_embed)
-    assert pr.get_num_embeds == expected
-
-
-@pytest.mark.parametrize(
-    "is_embed,expected",
-    [
-        (None, None),
-        (
-            torch.tensor([False, True, False, True, True]),
-            torch.tensor([0, 1, 1, 2, 3]),
-        ),
-        (torch.tensor([True, True, True]), torch.tensor([1, 2, 3])),
-    ],
-)
-def test_placeholder_range_embeds_cumsum(is_embed, expected):
-    length = len(is_embed) if is_embed is not None else 5
-    pr = PlaceholderRange(offset=0, length=length, is_embed=is_embed)
-
-    if expected is None:
-        assert pr.embeds_cumsum is None
-        return
-
-    assert torch.equal(pr.embeds_cumsum, expected)
-    # cached_property should return the same object on repeated access
-    assert pr.embeds_cumsum is pr.embeds_cumsum
-
-
-@pytest.mark.parametrize(
-    "is_embed,start_idx,end_idx,expected",
-    [
-        (None, 2, 4, (2, 4)),
-        (
-            torch.tensor([False, True, False, True, True]),
-            3,
-            5,
-            (1, 3),
-        ),
-        (
-            torch.tensor([False, True, False, True, True]),
-            0,
-            2,
-            (0, 1),
-        ),
-        (
-            torch.tensor([True, False, True, False]),
-            2,
-            2,
-            (1, 1),
-        ),
-    ],
-)
-def test_placeholder_range_get_embeds_indices_in_range(
-    is_embed, start_idx, end_idx, expected
-):
-    length = len(is_embed) if is_embed is not None else 5
-    pr = PlaceholderRange(offset=0, length=length, is_embed=is_embed)
-    assert pr.get_embeds_indices_in_range(start_idx, end_idx) == expected
-
-
-@pytest.mark.parametrize(
-    "offset,is_embed,expected",
-    [
-        (0, None, [(0, 4)]),
-        (
-            2,
-            torch.tensor([False, True, False, True, True]),
-            [(3, 3), (5, 6)],
-        ),
-        (0, torch.tensor([True, True, True, True]), [(0, 3)]),
-        (0, torch.tensor([False, False, False, False]), []),
-    ],
-)
-def test_placeholder_range_extract_embeds_range(offset, is_embed, expected):
-    length = len(is_embed) if is_embed is not None else 5
-    pr = PlaceholderRange(offset=offset, length=length, is_embed=is_embed)
-    assert pr.extract_embeds_range() == expected
-
-
-@pytest.mark.asyncio
-@pytest.mark.parametrize("video_url", TEST_VIDEO_URLS)
-@pytest.mark.parametrize("num_frames", [-1, 32, 1800])
-async def test_allowed_media_domains(video_url: str, num_frames: int):
-    connector = MediaConnector(
-        media_io_kwargs={
-            "video": {
-                "num_frames": num_frames,
-            }
-        },
-        allowed_media_domains=[
-            "www.bogotobogo.com",
-            "github.com",
-        ],
+def test_group_and_batch_mm_items_split_by_fieldset():
+    elem = MultiModalFieldElem(
+        data=torch.empty(1, dtype=torch.uint8),
+        field=MultiModalBatchedField(),
     )
+    item1 = MultiModalKwargsItem({"x": elem, "y": elem})
+    item2 = MultiModalKwargsItem({"y": elem, "x": elem})
+    item3 = MultiModalKwargsItem({"x": elem, "y": elem, "z": elem})
+    item4 = MultiModalKwargsItem({"x": elem})
+    item5 = MultiModalKwargsItem({"x": elem, "y": elem})
 
-    video_sync, metadata_sync = connector.fetch_video(video_url)
-    video_async, metadata_async = await connector.fetch_video_async(video_url)
-    assert np.array_equal(video_sync, video_async)
-    assert metadata_sync == metadata_async
+    res = group_and_batch_mm_items([item1, item2, item3, item4, item5])
+    assert [num_items for num_items, _ in res] == [2, 1, 1, 1]
 
-    disallowed_url = "https://upload.wikimedia.org/wikipedia/commons/4/47/PNG_transparency_demonstration_1.png"
-    with pytest.raises(ValueError):
-        _, _ = connector.fetch_video(disallowed_url)
 
-    with pytest.raises(ValueError):
-        _, _ = await connector.fetch_video_async(disallowed_url)
+def test_group_and_batch_mm_items_split_by_shared_data():
+    elem1 = MultiModalFieldElem(
+        data=torch.zeros(1, dtype=torch.uint8),
+        field=MultiModalSharedField(batch_size=1),
+    )
+    elem2 = MultiModalFieldElem(
+        data=torch.zeros(2, dtype=torch.uint8),
+        field=MultiModalSharedField(batch_size=1),
+    )
+    item1 = MultiModalKwargsItem({"x": elem1})
+    item2 = MultiModalKwargsItem({"x": elem1})
+    item3 = MultiModalKwargsItem({"x": elem2})
+    item4 = MultiModalKwargsItem({"x": elem1})
+    item5 = MultiModalKwargsItem({"x": elem2})
+
+    res = group_and_batch_mm_items([item1, item2, item3, item4, item5])
+    assert [num_items for num_items, _ in res] == [2, 1, 1, 1]
"wp") - ] + patch_pixel_values: Annotated[torch.Tensor, TensorShape("bnp", 3, "hp", "wp")] num_patches: Annotated[torch.Tensor, TensorShape("bn")] @@ -91,7 +89,7 @@ class Step3VLImageEmbeddingInputs(TensorSchema): Step3VLImageInputs: TypeAlias = Step3VLImagePixelInputs | Step3VLImageEmbeddingInputs -ImageWithPatches = tuple[Image.Image, list[Image.Image], list[int] | None] +ImageWithPatches = tuple[Image.Image, list[Image.Image], list[bool] | None] MAX_IMAGE_SIZE: int = 3024 @@ -432,7 +430,7 @@ class Step3VLProcessor: if len(parts) - 1 != len(repls): raise ValueError( - "The number of placeholders does not match the number of replacements." # noqa: E501 + "The number of placeholders does not match the number of replacements." ) result = [parts[0]] @@ -468,7 +466,7 @@ class Step3VLProcessor: image_repl_str_lst = [] image_repl_ids_lst = [] num_patches = [] - for raw_img, img_patches, patch_newline_mask in splitted_images_data: # noqa: E501 + for raw_img, img_patches, patch_newline_mask in splitted_images_data: pixel_values_lst.extend(self._convert_images_to_pixel_values([raw_img])) if len(img_patches) > 0: @@ -486,16 +484,20 @@ class Step3VLProcessor: if patch_newline_mask is not None: patch_newline_mask_lst.extend(patch_newline_mask) + pixel_values = torch.cat(pixel_values_lst) + patch_size = self.patch_size image_inputs = { - "pixel_values": torch.cat(pixel_values_lst), + "pixel_values": pixel_values, "num_patches": num_patches, - } - if patch_pixel_values_lst: - image_inputs["patch_pixel_values"] = torch.cat(patch_pixel_values_lst) - if patch_newline_mask_lst: - image_inputs["patch_newline_mask"] = torch.tensor( + "patch_pixel_values": ( + torch.cat(patch_pixel_values_lst) + if patch_pixel_values_lst + else pixel_values.new_empty((0, 3, patch_size, patch_size)) + ), + "patch_newline_mask": torch.tensor( patch_newline_mask_lst, dtype=torch.bool - ) + ), + } text = [ self.replace_placeholder(t, self.image_token, image_repl_str_lst) @@ -998,13 +1000,11 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP) if pixel_values is None and image_embeds is None: return None - if pixel_values is not None: + if pixel_values is not None and patch_pixel_values is not None: return Step3VLImagePixelInputs( type="pixel_values", pixel_values=pixel_values.to(self.dtype), - patch_pixel_values=patch_pixel_values.to(self.dtype) - if patch_pixel_values is not None - else None, + patch_pixel_values=patch_pixel_values.to(self.dtype), num_patches=num_patches, ) @@ -1039,7 +1039,7 @@ class Step3VLForConditionalGeneration(nn.Module, SupportsMultiModal, SupportsPP) image_features = self._get_vision_model_output(image_input["pixel_values"]) patch_image_features = ( self._get_vision_model_output(image_input["patch_pixel_values"]) - if image_input["patch_pixel_values"] is not None + if len(image_input["patch_pixel_values"]) > 0 else None ) num_patches = image_input["num_patches"] diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py index a4fc3a10b..b817383ab 100644 --- a/vllm/model_executor/models/terratorch.py +++ b/vllm/model_executor/models/terratorch.py @@ -62,7 +62,6 @@ from vllm.multimodal.processing import ( PromptUpdate, ) from vllm.sequence import IntermediateTensors -from vllm.utils import length_from_prompt_token_ids_or_embeds from .interfaces import IsAttentionFree, MultiModalEmbeddings, SupportsMultiModal from .interfaces_base import attn_type @@ -74,7 +73,11 @@ def _terratorch_field_names(input_definition: InputDefinition): return 
diff --git a/vllm/model_executor/models/terratorch.py b/vllm/model_executor/models/terratorch.py
index a4fc3a10b..b817383ab 100644
--- a/vllm/model_executor/models/terratorch.py
+++ b/vllm/model_executor/models/terratorch.py
@@ -62,7 +62,6 @@ from vllm.multimodal.processing import (
     PromptUpdate,
 )
 from vllm.sequence import IntermediateTensors
-from vllm.utils import length_from_prompt_token_ids_or_embeds
 
 from .interfaces import IsAttentionFree, MultiModalEmbeddings, SupportsMultiModal
 from .interfaces_base import attn_type
@@ -74,7 +73,11 @@ def _terratorch_field_names(input_definition: InputDefinition):
     return set(input_definition.data.keys())
 
 
-def _terratorch_field_factory(input_definition: InputDefinition):
+def _terratorch_field_factory(
+    input_definition: InputDefinition,
+    *,
+    is_shared: bool = True,  # True for unprocessed data, False for processed data
+):
     def _terratorch_field_config(
         hf_inputs: Mapping[str, torch.Tensor],
     ) -> Mapping[str, MultiModalFieldConfig]:
@@ -82,7 +85,11 @@ def _terratorch_field_factory(input_definition: InputDefinition):
         for name, input in input_definition.data.items():
             modality = "image"
             if input.type == InputTypeEnum.tensor:
-                fields[name] = MultiModalFieldConfig.shared(modality, batch_size=1)
+                fields[name] = (
+                    MultiModalFieldConfig.shared(modality, batch_size=1)
+                    if is_shared
+                    else MultiModalFieldConfig.batched(modality)
+                )
 
         return fields
@@ -166,8 +173,14 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor[TerratorchProcessing
         self,
         hf_inputs: BatchFeature,
         hf_processor_mm_kwargs: Mapping[str, object],
+        *,
+        is_shared: bool = True,
     ) -> Mapping[str, MultiModalFieldConfig]:
-        return _terratorch_field_factory(self.info.input_definition)(hf_inputs)
+        factory = _terratorch_field_factory(
+            self.info.input_definition,
+            is_shared=is_shared,
+        )
+        return factory(hf_inputs)
 
     def _get_prompt_updates(
         self,
@@ -193,12 +206,19 @@ class TerratorchMultiModalProcessor(BaseMultiModalProcessor[TerratorchProcessing
         )
 
         _, passthrough_data = self._get_hf_mm_data(mm_items)
-        mm_processed_data = BatchFeature(dict(passthrough_data), tensor_type="pt")
+        mm_processed_data = BatchFeature(
+            {k: torch.tensor(v).unsqueeze(0) for k, v in passthrough_data.items()},
+            tensor_type="pt",
+        )
 
         mm_placeholders = {"image": [PlaceholderRange(offset=0, length=0)]}
 
         mm_kwargs = MultiModalKwargsItems.from_hf_inputs(
             mm_processed_data,
-            self._get_mm_fields_config(mm_processed_data, hf_processor_mm_kwargs),
+            self._get_mm_fields_config(
+                mm_processed_data,
+                hf_processor_mm_kwargs,
+                is_shared=False,
+            ),
         )
 
         return MultiModalInputs(
@@ -235,9 +255,6 @@ class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal):
         self.inference_runner = InferenceRunner(config)
         self.model = self.inference_runner.model
 
-        pooler_config = vllm_config.model_config.pooler_config
-        assert pooler_config is not None
-
         self.pooler = IdentityPooler()
 
     def embed_input_ids(
@@ -262,15 +279,8 @@ class Terratorch(nn.Module, IsAttentionFree, SupportsMultiModal):
         inputs_embeds: torch.Tensor | None = None,
         **kwargs: object,
     ):
-        input_len = length_from_prompt_token_ids_or_embeds(input_ids, inputs_embeds)
-
-        batched_kwargs = {k: v.unsqueeze(0) for k, v in kwargs.items()}
-        model_output = self.inference_runner.forward(**batched_kwargs).output
-
-        # The leading dimension of hidden states needs to equal input length
-        return model_output.expand(
-            input_len, *(-1 for _ in range(model_output.ndim - 1))
-        )
+        model_output = self.inference_runner.forward(**kwargs)
+        return model_output.output
 
     def load_weights(self, weights: Iterable[tuple[str, torch.Tensor]]) -> set[str]:
         params_list = []
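The `is_shared` switch above selects between the two field configs below, sketched in isolation for clarity:

```python
from vllm.multimodal.inputs import MultiModalFieldConfig

# is_shared=True (unprocessed data): one tensor is shared across the batch.
shared = MultiModalFieldConfig.shared("image", batch_size=1)

# is_shared=False (already-processed passthrough data): each item owns its own
# slice along the leading batch dimension, matching the added unsqueeze(0).
batched = MultiModalFieldConfig.batched("image")
```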
diff --git a/vllm/multimodal/hasher.py b/vllm/multimodal/hasher.py
index 009d0bc44..6caf9c114 100644
--- a/vllm/multimodal/hasher.py
+++ b/vllm/multimodal/hasher.py
@@ -102,12 +102,19 @@ class MultiModalHasher:
                         "data": tensor_obj.numpy(),
                     },
                 )
+            return cls.iter_item_to_bytes("tensor", tensor_obj.numpy())
+
         if isinstance(obj, np.ndarray):
-            # If the array is non-contiguous, we need to copy it first
-            arr_data = (
-                obj.view(np.uint8).data if obj.flags.c_contiguous else obj.tobytes()
-            )
+            if obj.ndim == 0:
+                arr_data = obj.item()
+            elif obj.flags.c_contiguous:
+                # obj.view(np.uint8) is not valid for 0-D arrays (handled above)
+                arr_data = obj.view(np.uint8).data
+            else:
+                # If the array is non-contiguous, we need to copy it first
+                arr_data = obj.tobytes()
+
             return cls.iter_item_to_bytes(
                 "ndarray",
                 {
@@ -116,6 +123,7 @@ class MultiModalHasher:
                     "data": arr_data,
                 },
             )
+
         logger.warning(
             "No serialization method found for %s. Falling back to pickle.", type(obj)
         )
@@ -147,7 +155,7 @@ class MultiModalHasher:
 
         hasher_factory = _get_hasher_factory(envs.VLLM_MM_HASHER_ALGORITHM)
         hasher = hasher_factory()
 
-        for k, v in kwargs.items():
+        for k, v in sorted(kwargs.items(), key=lambda kv: kv[0]):
             for bytes_ in cls.iter_item_to_bytes(k, v):
                 hasher.update(bytes_)
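Together with the tests added in `test_hasher.py` above, the hasher hunks amount to two guarantees, restated as assertions:

```python
import torch

from vllm.multimodal.hasher import MultiModalHasher

# Keys are sorted before hashing, so keyword order no longer affects the digest.
assert MultiModalHasher.hash_kwargs(
    x=torch.zeros(5), y=torch.ones(5)
) == MultiModalHasher.hash_kwargs(y=torch.ones(5), x=torch.zeros(5))

# 0-D arrays are hashable at all now (obj.view(np.uint8) raises for them),
# and they hash differently from a 1-element 1-D tensor of the same bytes.
assert MultiModalHasher.hash_kwargs(x=torch.zeros(())) != MultiModalHasher.hash_kwargs(
    x=torch.zeros((1,))
)
```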
diff --git a/vllm/multimodal/inputs.py b/vllm/multimodal/inputs.py
index 262def712..2cc7900eb 100644
--- a/vllm/multimodal/inputs.py
+++ b/vllm/multimodal/inputs.py
@@ -424,8 +424,9 @@ class BaseMultiModalField(ABC):
 
     keep_on_cpu: bool = False
     """
-    If `True`, then this field is excluded from being moved to the accelerator
-    when `MultiModalKwargsItems.get_data()` is called to batch the data.
+    If `True`, then this field is excluded from being moved to the accelerator when
+    [`group_and_batch_mm_items`][vllm.multimodal.utils.group_and_batch_mm_items]
+    is called to batch the data.
     """
 
     def _field_factory(self):
@@ -1006,27 +1007,38 @@ class MultiModalKwargsItems(UserDict[str, Sequence[_I]]):
         pin_memory: bool = False,
     ) -> BatchedTensorInputs:
         """Construct a dictionary of keyword arguments to pass to the model."""
-        elems_by_key = defaultdict[str, list[MultiModalFieldElem]](list)
-        for modality, items in self.items():
-            for i, item in enumerate(items):
-                if item is None:
-                    raise RuntimeError(
-                        f"Cannot build data from empty mm_items[{modality}][{i}]"
-                    )
+        from .utils import group_and_batch_mm_items
 
-                for key, elem in item.items():
-                    elems_by_key[key].append(elem)
-
-        data = {
-            key: elems[0].field.reduce_data(
-                elems,
-                device=device,
-                pin_memory=pin_memory,
-            )
-            for key, elems in elems_by_key.items()
+        items_by_modality = self.require_data()
+        batches_by_modality = {
+            modality: [
+                data
+                for _, data in group_and_batch_mm_items(
+                    items,
+                    device=device,
+                    pin_memory=pin_memory,
+                )
+            ]
+            for modality, items in items_by_modality.items()
+            if len(items) > 0
         }
 
-        return data
+        out_data: BatchedTensorInputs = {}
+        for _, batches in batches_by_modality.items():
+            if len(batches) != 1:
+                num_batches_by_modality = {
+                    modality: len(batches)
+                    for modality, batches in batches_by_modality.items()
+                }
+
+                raise RuntimeError(
+                    f"Some modalities cannot be merged into a single batch "
+                    f"({num_batches_by_modality=})"
+                )
+
+            out_data.update(batches[0])
+
+        return out_data
 
 
 MultiModalKwargsOptionalItems: TypeAlias = (
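Before the `utils.py` diff below, a usage sketch of the new helper it introduces, distilled from `test_group_and_batch_mm_items_split_by_fieldset` above:

```python
import torch

from vllm.multimodal.inputs import (
    MultiModalBatchedField,
    MultiModalFieldElem,
    MultiModalKwargsItem,
)
from vllm.multimodal.utils import group_and_batch_mm_items

elem = MultiModalFieldElem(
    data=torch.empty(1, dtype=torch.uint8),
    field=MultiModalBatchedField(),
)

items = [
    MultiModalKwargsItem({"x": elem, "y": elem}),  # same field set as the next item,
    MultiModalKwargsItem({"y": elem, "x": elem}),  # so these two batch together
    MultiModalKwargsItem({"x": elem}),  # different field set -> new group
]

# Consecutive items with identical field sets form a single batch each.
assert [num_items for num_items, _ in group_and_batch_mm_items(items)] == [2, 1]
```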
""" for modality, group in groupby(mm_kwargs, key=lambda x: x[0]): items_lst = [item for _, item in group] - mm_kwargs_items = MultiModalKwargsItems({modality: items_lst}) - mm_kwargs_data = mm_kwargs_items.get_data( + + for num_items, mm_kwargs_batch in group_and_batch_mm_items( + items_lst, device=device, pin_memory=pin_memory, - ) - - yield modality, len(items_lst), mm_kwargs_data + ): + yield modality, num_items, mm_kwargs_batch def fetch_audio(