[CI/Build] Refactor image test assets (#5821)

2024-06-26 16:02:34 +08:00
parent 3439c5a8e3
commit 6984c02a27
5 changed files with 127 additions and 92 deletions
--- a/tests/multimodal/test_processor.py
+++ b/tests/multimodal/test_processor.py
@@ -10,7 +10,7 @@ from ..conftest import _STR_DTYPE_TO_TORCH_DTYPE


@pytest.mark.parametrize("dtype", ["half", "float"])
-def test_clip_image_processor(hf_images, dtype):
+def test_clip_image_processor(image_assets, dtype):
    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
    IMAGE_HEIGHT = IMAGE_WIDTH = 560

@@ -35,13 +35,13 @@ def test_clip_image_processor(hf_images, dtype):
        image_processor_revision=None,
    )

-    for image in hf_images:
+    for asset in image_assets:
        hf_result = hf_processor.preprocess(
-            image,
+            asset.pil_image,
            return_tensors="pt",
        ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
        vllm_result = MULTIMODAL_REGISTRY.process_input(
-            ImagePixelData(image),
+            ImagePixelData(asset.pil_image),
            model_config=model_config,
            vlm_config=vlm_config,
        )
@@ -59,7 +59,7 @@ def test_clip_image_processor(hf_images, dtype):
    reason="Inconsistent image processor being used due to lack "
    "of support for dynamic image token replacement")
@pytest.mark.parametrize("dtype", ["half", "float"])
-def test_llava_next_image_processor(hf_images, dtype):
+def test_llava_next_image_processor(image_assets, dtype):
    MODEL_NAME = "llava-hf/llava-v1.6-34b-hf"
    IMAGE_HEIGHT = IMAGE_WIDTH = 560

@@ -84,13 +84,13 @@ def test_llava_next_image_processor(hf_images, dtype):
        image_processor_revision=None,
    )

-    for image in hf_images:
+    for asset in image_assets:
        hf_result = hf_processor.preprocess(
-            image,
+            asset.pil_image,
            return_tensors="pt",
        ).to(dtype=_STR_DTYPE_TO_TORCH_DTYPE[dtype])
        vllm_result = MULTIMODAL_REGISTRY.process_input(
-            ImagePixelData(image),
+            ImagePixelData(asset.pil_image),
            model_config=model_config,
            vlm_config=vlm_config,
        )
@@ -107,7 +107,7 @@ def test_llava_next_image_processor(hf_images, dtype):
@pytest.mark.xfail(
    reason="Example image pixels were not processed using HuggingFace")
@pytest.mark.parametrize("dtype", ["float"])
-def test_image_pixel_types(hf_images, vllm_image_tensors, dtype):
+def test_image_pixel_types(image_assets, dtype):
    MODEL_NAME = "llava-hf/llava-1.5-7b-hf"
    IMAGE_HEIGHT = IMAGE_WIDTH = 560

@@ -129,14 +129,14 @@ def test_image_pixel_types(hf_images, vllm_image_tensors, dtype):
        image_processor_revision=None,
    )

-    for image, tensor in zip(hf_images, vllm_image_tensors):
+    for asset in image_assets:
        image_result = MULTIMODAL_REGISTRY.process_input(
-            ImagePixelData(image),
+            ImagePixelData(asset.pil_image),
            model_config=model_config,
            vlm_config=vlm_config,
        )
        tensor_result = MULTIMODAL_REGISTRY.process_input(
-            ImagePixelData(tensor),
+            ImagePixelData(asset.pixel_values),
            model_config=model_config,
            vlm_config=vlm_config,
        )