[VLM] Remove image_input_type from VLM config (#5852)

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com> Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com> Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-02 00:57:09 -07:00
parent 2c37540aa6
commit 98d6682cd1
35 changed files with 329 additions and 751 deletions
--- a/vllm/model_executor/models/phi3v.py
+++ b/vllm/model_executor/models/phi3v.py
@@ -35,10 +35,9 @@ from vllm.model_executor.models.clip import CLIPVisionModel
 from vllm.model_executor.models.llama import LlamaModel
 from vllm.model_executor.sampling_metadata import SamplingMetadata
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import ImagePixelData
 from vllm.sequence import SamplerOutput

-from .clip import dummy_pixel_data_for_clip, dummy_seq_data_for_clip
+from .clip import dummy_image_for_clip, dummy_seq_data_for_clip
 from .interfaces import SupportsVision

 logger = init_logger(__name__)
@@ -286,7 +285,7 @@ def dummy_data_for_phi3v(ctx: InputContext, seq_len: int):
        image_token_id=32044,
        image_feature_size_override=image_feature_size,
    )
-    mm_data = dummy_pixel_data_for_clip(
+    mm_data = dummy_image_for_clip(
        CLIP_VIT_LARGE_PATCH14_336_CONFIG,
        image_width_override=dummy_width,
        image_height_override=dummy_height,
@@ -331,8 +330,7 @@ def _calc_hd_transform_size(*, width: int, height: int, hd_num: int = 16):


 def _image_processor(ctx: InputContext,
-                     data: ImagePixelData) -> Dict[str, torch.Tensor]:
-    image = data.image
+                     image: object) -> Dict[str, torch.Tensor]:

    if isinstance(image, Image.Image):
        # Temporary patch before dynamic number of image tokens is supported
@@ -343,13 +341,14 @@ def _image_processor(ctx: InputContext,
                "Dynamic image shape is currently not supported. "
                "Resizing input image to (%d, %d).", w, h)

-            data.image = image.resize((w, h))
+            image = image.resize((w, h))

-    return MULTIMODAL_REGISTRY._get_plugin_for_data_type(ImagePixelData) \
-            ._default_input_mapper(ctx, data)
+        return MULTIMODAL_REGISTRY._get_plugin("image") \
+                ._default_input_mapper(ctx, image)
+    raise TypeError(f"Invalid type for 'image': {type(image)}")


-@MULTIMODAL_REGISTRY.register_image_pixel_input_mapper(_image_processor)
+@MULTIMODAL_REGISTRY.register_image_input_mapper(_image_processor)
@INPUT_REGISTRY.register_dummy_data(dummy_data_for_phi3v)
 class Phi3VForCausalLM(nn.Module, SupportsVision):

@@ -375,14 +374,6 @@ class Phi3VForCausalLM(nn.Module, SupportsVision):
        pixel_values = kwargs.pop("pixel_values", None)
        image_sizes = kwargs.pop("image_sizes", None)

-        expected_input_type = self.vlm_config.image_input_type
-        ImageInputType = VisionLanguageConfig.ImageInputType
-
-        if expected_input_type != ImageInputType.PIXEL_VALUES:
-            raise ValueError(
-                f"Unexpected image input type: {expected_input_type}."
-                "Phi3v only support pixel_values input currently.")
-
        if pixel_values is not None and image_sizes is not None:
            return Phi3VImagePixelInputs(type="pixel_values",
                                         data=pixel_values,