[VLM] Merged multi-modal processor for InternVL-based models (#12553)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
Signed-off-by: Isotr0py <2037008807@qq.com>
Co-authored-by: Isotr0py <2037008807@qq.com>
2025-02-04 16:44:52 +08:00
parent 96b23621c1
commit d1ca7df84d
34 changed files with 1469 additions and 1021 deletions
--- a/tests/models/decoder_only/vision_language/test_h2ovl.py
+++ b/tests/models/decoder_only/vision_language/test_h2ovl.py
@@ -1,131 +0,0 @@
-# SPDX-License-Identifier: Apache-2.0
-
-from typing import Optional, Tuple
-
-import pytest
-import torch
-from PIL.Image import Image
-from transformers import AutoConfig
-
-# Import the functions to test
-from vllm.model_executor.models.h2ovl import (calculate_num_blocks,
-                                              image_to_pixel_values_wrapper)
-from vllm.multimodal.image import rescale_image_size
-
-models = [
-    "h2oai/h2ovl-mississippi-800m",  # Replace with your actual model names
-    "h2oai/h2ovl-mississippi-2b",
-]
-
-
-def run_preprocessing_test(
-    image: Image,
-    config,
-    max_dynamic_patch: Optional[int] = None,
-) -> Tuple[torch.Tensor, int]:
-    """Test the image preprocessing and calculate expected blocks."""
-
-    if max_dynamic_patch is None:
-        max_dynamic_patch = config.max_dynamic_patch
-
-    width, height = image.size
-    use_MSAC = config.use_msac
-
-    # Create the mapper function with the provided configuration
-    mapper = image_to_pixel_values_wrapper(config, max_dynamic_patch, use_MSAC)
-    pixel_values = mapper(image)
-
-    # Calculate the expected number of blocks
-    if use_MSAC:
-        # First pass
-        blocks1, _, _, aspect_ratio = calculate_num_blocks(
-            width,
-            height,
-            config.min_dynamic_patch,
-            max_dynamic_patch,
-            config.vision_config.image_size,
-            use_thumbnail=False,  # Thumbnail is handled separately
-            prior_aspect_ratio=None,
-        )
-
-        # Second pass
-        blocks2, _, _, _ = calculate_num_blocks(
-            width,
-            height,
-            config.min_dynamic_patch,
-            max_dynamic_patch,
-            config.vision_config.image_size,
-            use_thumbnail=False,
-            prior_aspect_ratio=aspect_ratio,
-        )
-
-        # Add thumbnail if use_thumbnail is True and total_blocks > 1
-        if config.use_thumbnail:
-            blocks1 += 1 if blocks1 > 1 else 0
-            blocks2 += 1 if blocks2 > 1 else 0
-
-        # Total blocks is the sum of blocks from both passes minus overlapping
-        total_blocks = blocks1 + blocks2 - 1
-
-        expected_blocks = total_blocks
-
-    else:
-        blocks, _, _, _ = calculate_num_blocks(
-            width,
-            height,
-            config.min_dynamic_patch,
-            max_dynamic_patch,
-            config.vision_config.image_size,
-            use_thumbnail=False,
-            prior_aspect_ratio=None,
-        )
-        expected_blocks = blocks
-
-        if config.use_thumbnail and expected_blocks > 1:
-            expected_blocks += 1
-
-    return pixel_values, expected_blocks
-
-
-@pytest.mark.parametrize("model_name", models)
-@pytest.mark.parametrize(
-    "size_factors",
-    [
-        # Single-scale
-        [1.0],
-        # Single-scale, batched
-        [1.0, 1.0, 1.0],
-        # Multi-scale
-        [0.25, 0.5, 1.0],
-    ],
-)
-@pytest.mark.parametrize("max_dynamic_patch", [None, 2, 4, 8])
-def test_image_preprocessing(image_assets, model_name, size_factors,
-                             max_dynamic_patch):
-    """Test image preprocessing pipeline with different configurations."""
-    # Load the configuration from the model
-    config = AutoConfig.from_pretrained(model_name, trust_remote_code=True)
-
-    for asset in image_assets:
-        image = asset.pil_image
-        for factor in size_factors:
-            scaled_image = rescale_image_size(image, factor)
-
-            # Test preprocessing and get expected number of blocks
-            pixel_values, expected_blocks = run_preprocessing_test(
-                scaled_image, config, max_dynamic_patch)
-
-            # Verify output shapes and properties
-            actual_blocks = pixel_values.shape[0]
-            assert actual_blocks == expected_blocks, (
-                f"Expected {expected_blocks} blocks, got {actual_blocks}")
-
-            # Check image dimensions
-            expected_size = (
-                3,  # Number of channels (C, H, W)
-                config.vision_config.image_size,
-                config.vision_config.image_size,
-            )
-            for img in pixel_values:
-                assert img.shape == expected_size, (
-                    f"Expected image size {expected_size}, got {img.shape}")
--- a/tests/models/decoder_only/vision_language/test_models.py
+++ b/tests/models/decoder_only/vision_language/test_models.py
@@ -250,6 +250,7 @@ VLM_TEST_SETTINGS = {
        max_model_len=8192,
        dtype="bfloat16",
        use_tokenizer_eos=True,
+        num_logprobs=10,
        patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
    ),
    "idefics3": VLMTestInfo(
@@ -282,7 +283,6 @@ VLM_TEST_SETTINGS = {
        dtype="bfloat16",
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
-        marks=[large_gpu_mark(min_gb=32)],
    ),
    "llava_next": VLMTestInfo(
        models=["llava-hf/llava-v1.6-mistral-7b-hf"],
--- a/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
+++ b/tests/models/decoder_only/vision_language/vlm_utils/model_utils.py
@@ -334,12 +334,12 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
        def __init__(self, hf_runner: HfRunner):
            self.num_image_token = hf_runner.model.num_image_token
            self.tokenizer = hf_runner.tokenizer
-            self.dtype = hf_runner.model.dtype

            self.config = AutoConfig.from_pretrained(hf_runner.model_name,
                                                     trust_remote_code=True)
            self.vision_config = self.config.vision_config
            self.use_thumbnail = self.config.use_thumbnail
+            self.use_msac = self.config.use_msac
            self.min_num = self.config.min_dynamic_patch
            self.max_num = self.config.max_dynamic_patch
            self.image_size = self.vision_config.image_size
@@ -348,18 +348,19 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
                     **kwargs):
            # yapf: disable
            from vllm.model_executor.models.h2ovl import (
-                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
+                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl)

            # yapf: enable
            images = [images] if isinstance(images, Image) else images
            pixel_values = [
-                image_to_pixel_values(image,
-                                      self.image_size,
-                                      self.min_num,
-                                      self.max_num,
-                                      self.use_thumbnail,
-                                      use_MSAC=self.config.use_msac).to(
-                                          self.dtype) for image in images
+                image_to_pixel_values_h2ovl(
+                    image,
+                    input_size=self.image_size,
+                    min_num=self.min_num,
+                    max_num=self.max_num,
+                    use_thumbnail=self.use_thumbnail,
+                    use_msac=self.use_msac,
+                ) for image in images
            ]
            num_patches_list = [
                pixel_value.shape[0] for pixel_value in pixel_values
@@ -394,7 +395,6 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
        def __init__(self, hf_runner: HfRunner):
            self.num_image_token = hf_runner.model.num_image_token
            self.tokenizer = hf_runner.tokenizer
-            self.dtype = hf_runner.model.dtype

            self.config = AutoConfig.from_pretrained(hf_runner.model_name,
                                                     trust_remote_code=True)
@@ -407,13 +407,17 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
        def __call__(self, text: str, images: Union[Image, List[Image]],
                     **kwargs):
            from vllm.model_executor.models.internvl import (
-                IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values)
+                IMG_CONTEXT, IMG_END, IMG_START,
+                image_to_pixel_values_internvl)
            images = [images] if isinstance(images, Image) else images
            pixel_values = [
-                image_to_pixel_values(image, self.image_size, self.min_num,
-                                      self.max_num,
-                                      self.use_thumbnail).to(self.dtype)
-                for image in images
+                image_to_pixel_values_internvl(
+                    image,
+                    input_size=self.image_size,
+                    min_num=self.min_num,
+                    max_num=self.max_num,
+                    use_thumbnail=self.use_thumbnail,
+                ) for image in images
            ]
            num_patches_list = [
                pixel_value.shape[0] for pixel_value in pixel_values
@@ -448,7 +452,8 @@ def _internvl_generate(
 ) -> torch.LongTensor:
    """Generate method for InternVL2 model without fixed use_cache."""
    assert self.img_context_token_id is not None
-    vit_embeds = self.extract_feature(pixel_values)
+    target_dtype = next(self.parameters()).dtype
+    vit_embeds = self.extract_feature(pixel_values.to(target_dtype))
    input_embeds = self.language_model.get_input_embeddings()(input_ids)
    B, N, C = input_embeds.shape
    input_embeds = input_embeds.reshape(B * N, C)