[VLM] Remove image_input_type from VLM config (#5852)

Signed-off-by: Xiaowei Jiang <xwjiang2010@gmail.com>
Co-authored-by: Cyrus Leung <cyrus.tl.leung@gmail.com>
Co-authored-by: Roger Wang <ywang@roblox.com>
2024-07-02 00:57:09 -07:00
parent 2c37540aa6
commit 98d6682cd1
35 changed files with 329 additions and 751 deletions
--- a/tests/multimodal/test_mapper.py
+++ b/tests/multimodal/test_mapper.py
@@ -2,9 +2,8 @@ import numpy as np
 import pytest
 from transformers import CLIPImageProcessor, LlavaNextImageProcessor

-from vllm.config import ModelConfig, VisionLanguageConfig
+from vllm.config import ModelConfig
 from vllm.multimodal import MULTIMODAL_REGISTRY
-from vllm.multimodal.image import ImagePixelData

 from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE

@@ -12,7 +11,6 @@ from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE
@pytest.mark.parametrize("dtype", ["half", "float"])
 def test_clip_image_processor(image_assets, dtype):
    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
-    IMAGE_HEIGHT = IMAGE_WIDTH = 560

    hf_processor = CLIPImageProcessor.from_pretrained(MODEL_NAME)
    assert isinstance(hf_processor, CLIPImageProcessor)
@@ -25,14 +23,6 @@ def test_clip_image_processor(image_assets, dtype):
        seed=0,
        dtype=dtype,
        revision=None,
-        multimodal_config=VisionLanguageConfig(
-            image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
-            image_token_id=32000,
-            image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
-            image_feature_size=576,
-            image_processor=MODEL_NAME,
-            image_processor_revision=None,
-        ),
    )

    for asset in image_assets:
@@ -42,7 +32,7 @@ def test_clip_image_processor(image_assets, dtype):
        ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
        vllm_result = MULTIMODAL_REGISTRY.map_input(
            model_config,
-            ImagePixelData(asset.pil_image),
+            {"image": asset.pil_image},
        )

        assert hf_result.keys() == vllm_result.keys()
@@ -60,7 +50,6 @@ def test_clip_image_processor(image_assets, dtype):
@pytest.mark.parametrize("dtype", ["half", "float"])
 def test_llava_next_image_processor(image_assets, dtype):
    MODEL_NAME = "llava-hf/llava-v1.6-34b-hf"
-    IMAGE_HEIGHT = IMAGE_WIDTH = 560

    hf_processor = LlavaNextImageProcessor.from_pretrained(MODEL_NAME)
    assert isinstance(hf_processor, LlavaNextImageProcessor)
@@ -73,14 +62,6 @@ def test_llava_next_image_processor(image_assets, dtype):
        seed=0,
        dtype=dtype,
        revision=None,
-        multimodal_config=VisionLanguageConfig(
-            image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
-            image_token_id=64000,
-            image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
-            image_feature_size=2928,
-            image_processor=MODEL_NAME,
-            image_processor_revision=None,
-        ),
    )

    for asset in image_assets:
@@ -90,7 +71,7 @@ def test_llava_next_image_processor(image_assets, dtype):
        ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
        vllm_result = MULTIMODAL_REGISTRY.map_input(
            model_config,
-            ImagePixelData(asset.pil_image),
+            {"image": asset.pil_image},
        )

        assert hf_result.keys() == vllm_result.keys()
@@ -107,7 +88,6 @@ def test_llava_next_image_processor(image_assets, dtype):
@pytest.mark.parametrize("dtype", ["float"])
 def test_image_pixel_types(image_assets, dtype):
    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
-    IMAGE_HEIGHT = IMAGE_WIDTH = 560

    model_config = ModelConfig(
        model=MODEL_NAME,
@@ -117,23 +97,15 @@ def test_image_pixel_types(image_assets, dtype):
        seed=0,
        dtype=dtype,
        revision=None,
-        multimodal_config=VisionLanguageConfig(
-            image_input_type=VisionLanguageConfig.ImageInputType.PIXEL_VALUES,
-            image_token_id=32000,
-            image_input_shape=(1, 3, IMAGE_HEIGHT, IMAGE_WIDTH),
-            image_feature_size=576,
-            image_processor=MODEL_NAME,
-            image_processor_revision=None,
-        ))
-
+    )
    for asset in image_assets:
        image_result = MULTIMODAL_REGISTRY.map_input(
            model_config,
-            ImagePixelData(asset.pil_image),
+            {"image": asset.pil_image},
        )
        tensor_result = MULTIMODAL_REGISTRY.map_input(
            model_config,
-            ImagePixelData(asset.pixel_values),
+            {"image": asset.pil_image},
        )

        assert image_result.keys() == tensor_result.keys()