Migrate FuyuImagePatchInputs to TensorSchema (#21662)

Signed-off-by: Benji Beck <benjibeck@meta.com>
2025-07-26 19:34:14 -07:00
parent 0b8caf9095
commit 3339cba3ff
3 changed files with 56 additions and 44 deletions
--- a/vllm/model_executor/models/fuyu.py
+++ b/vllm/model_executor/models/fuyu.py
@@ -19,7 +19,7 @@
 """ PyTorch Fuyu model."""
 import math
 from collections.abc import Iterable, Mapping, Sequence
-from typing import Literal, Optional, TypedDict
+from typing import Annotated, Literal, Optional

 import torch
 import torch.nn as nn
@@ -40,6 +40,7 @@ from vllm.multimodal.processing import (BaseMultiModalProcessor,
                                        PromptUpdate, PromptUpdateDetails)
 from vllm.multimodal.profiling import BaseDummyInputsBuilder
 from vllm.sequence import IntermediateTensors
+from vllm.utils.tensor_schema import TensorSchema, TensorShape

 from .interfaces import MultiModalEmbeddings, SupportsMultiModal, SupportsPP
 from .utils import (AutoWeightsLoader, WeightsMapper, flatten_bn, maybe_prefix,
@@ -50,18 +51,24 @@ _IMAGE_TOKEN_ID = 71011
 _NEWLINE_TOKEN_ID = 71019


-class FuyuImagePatchInputs(TypedDict):
-    type: Literal["image_patches"]
-    flat_data: torch.Tensor
+class FuyuImagePatchInputs(TensorSchema):
    """
-    Shape: 
-    `(batch_size * num_patches, patch_size_x * patch_size_y * num_channels)`
+    Dimensions:
+        - bn: Batch size * number of images
+        - fn: Num channels * patch_size_x * patch_size_y
    """

-    patches_per_image: list[int]
+    type: Literal["image_patches"] = "image_patches"
+
+    flat_data: Annotated[
+        torch.Tensor,
+        TensorShape("bn", "fn"),
+    ]
+
+    patches_per_image: Annotated[list[int], TensorShape("bn")]
    """
    The number of total patches for each image in the batch.
-
+    
    This is used to split the embeddings which has the first two dimensions
    flattened just like `flat_data`.
    """
@@ -297,42 +304,18 @@ class FuyuForCausalLM(nn.Module, SupportsMultiModal, SupportsPP):
        self.make_empty_intermediate_tensors = (
            self.language_model.make_empty_intermediate_tensors)

-    def _validate_pixel_values(self, data: torch.Tensor) -> torch.Tensor:
-
-        h = w = self.config.patch_size
-        num_channels = self.config.num_channels
-        expected_dims = num_channels * h * w
-
-        def _validate_shape(d: torch.Tensor):
-            actual_dims = d.size(-1)
-
-            if actual_dims != expected_dims:
-                expected_expr = str(expected_dims)
-                raise ValueError(
-                    "The expected shape of pixel values per image per batch "
-                    f"per patch is {expected_expr}. "
-                    f"You supplied {tuple(d.shape)}.")
-
-        for d in data:
-            _validate_shape(d)
-
-        return data.to(self.vision_embed_tokens.weight.dtype)
-
    def _parse_and_validate_image_input(
            self, **kwargs: object) -> Optional[FuyuImagePatchInputs]:
        image_patches = kwargs.pop("image_patches", None)
        if image_patches is not None:
-            if not isinstance(image_patches, (torch.Tensor, list)):
-                raise ValueError("Incorrect type of image patches. "
-                                 f"Got type: {type(image_patches)}")
-
            image_patches_flat = flatten_bn(image_patches)
-
+            flat_data = flatten_bn(image_patches, concat=True).data.to(
+                self.vision_embed_tokens.weight.dtype)
            return FuyuImagePatchInputs(
                type="image_patches",
-                flat_data=self._validate_pixel_values(
-                    flatten_bn(image_patches_flat, concat=True)),
+                flat_data=flat_data,
                patches_per_image=[x.size(0) for x in image_patches_flat],
+                resolve_bindings={"fn": self.image_feature_size},
            )

        return None