[Bugfix] Fix broadcasting logic for multi_modal_kwargs (#6836)
This commit is contained in:
@@ -19,10 +19,10 @@ from vllm.utils import cuda_device_count_stateless
|
||||
|
||||
model = os.environ["TEST_DIST_MODEL"]
|
||||
|
||||
if model.startswith("llava-hf/llava"):
|
||||
if model.startswith("llava-hf/llava-1.5"):
|
||||
from ..models.test_llava import models, run_test
|
||||
elif model.startswith("microsoft/Phi-3-vision"):
|
||||
from ..models.test_phi3v import models, run_test
|
||||
elif model.startswith("llava-hf/llava-v1.6"):
|
||||
from ..models.test_llava_next import models, run_test
|
||||
else:
|
||||
raise NotImplementedError(f"Unsupported model: {model}")
|
||||
|
||||
@@ -45,7 +45,8 @@ def test_models(hf_runner, vllm_runner, image_assets,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model=models[0],
|
||||
size_factors=[1.0],
|
||||
# So that LLaVA-NeXT processor may return nested list
|
||||
size_factors=[0.25, 0.5, 1.0],
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
|
||||
@@ -1,57 +0,0 @@
|
||||
from typing import Any, Dict
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.distributed.parallel_state import (_split_tensor_dict,
|
||||
_update_nested_dict)
|
||||
|
||||
|
||||
def test_split_tensor_dict():
    """_split_tensor_dict flattens a nested dict into (metadata, tensors).

    Tensor leaves are pulled out into ``tensor_list`` in traversal order;
    every leaf (tensor or not) contributes one metadata entry.
    """
    sample = {
        "key_a": "a",
        "key_b": torch.arange(8, dtype=torch.float32),
        "key_c": {
            "key_1": torch.arange(5, dtype=torch.float32),
            "key_2": torch.tensor([], dtype=torch.float32),
            "key_3": 123,
        },
        "key_d": {},
    }

    metadata_list, tensor_list = _split_tensor_dict(sample)

    # 6 leaves: key_a, key_b, key_c%key_1, key_c%key_2, key_c%key_3, key_d
    assert len(metadata_list) == 6

    expected_tensors = [
        sample["key_b"],
        sample["key_c"]["key_1"],
        sample["key_c"]["key_2"],
    ]
    for extracted, expected in zip(tensor_list, expected_tensors):
        assert torch.allclose(extracted, expected)
|
||||
|
||||
def test_split_tensor_dict_invalid_key():
    """Keys containing the '%' delimiter must be rejected.

    '%' is reserved as the separator for flattened nested keys, so a
    top-level key embedding it would be ambiguous on reconstruction.
    """
    with pytest.raises(AssertionError):
        _split_tensor_dict({"a%b": "a"})
|
||||
|
||||
def test_update_nested_dict():
    """_update_nested_dict rebuilds nesting from '%'-delimited flat keys."""
    flattened_items = [
        ("key1%key2%key3", "value1"),
        ("key1%key2%key4", "value2"),
        ("key1%key5", "value3"),
        ("key6%key7", "value4"),
        ("key8", "value5"),
    ]

    result: Dict[str, Any] = {}
    for flat_key, value in flattened_items:
        _update_nested_dict(result, flat_key, value)

    # Sibling flat keys sharing a prefix must merge into one sub-dict.
    assert result == {
        "key1": {
            "key2": {
                "key3": "value1",
                "key4": "value2",
            },
            "key5": "value3",
        },
        "key6": {
            "key7": "value4",
        },
        "key8": "value5",
    }
||||
@@ -1,14 +1,12 @@
|
||||
from typing import List, Optional, Tuple
|
||||
from typing import List, Optional, Tuple, Type
|
||||
|
||||
import pytest
|
||||
from transformers import AutoConfig, AutoTokenizer
|
||||
|
||||
from vllm.model_executor.models.llava_next import (
|
||||
get_llava_next_image_feature_size)
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import IMAGE_ASSETS
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
@@ -27,6 +25,8 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
|
||||
IMAGE_TOKEN_ID = 32000
|
||||
|
||||
models = ["llava-hf/llava-v1.6-vicuna-7b-hf"]
|
||||
|
||||
|
||||
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
||||
Optional[SampleLogprobs]],
|
||||
@@ -50,34 +50,19 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["llava-hf/llava-v1.6-vicuna-7b-hf"])
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# No image
|
||||
[],
|
||||
# Single-scale
|
||||
[1.0],
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
dtype, max_tokens, num_logprobs) -> None:
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding vision language config as input.
|
||||
Note, the text input is also adjusted to abide by vllm contract.
|
||||
The text output is sanitized to be able to compare with hf.
|
||||
"""
|
||||
def run_test(
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
image_assets: _ImageAssets,
|
||||
model: str,
|
||||
*,
|
||||
size_factors: List[float],
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_image = [(
|
||||
@@ -89,6 +74,8 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
max_model_len=4096,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_outputs_per_image = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
@@ -122,9 +109,54 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype, max_tokens, num_logprobs) -> None:
    """Inference result should be the same between hf and vllm.

    All the image fixtures for the test is under tests/images.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding vision language config as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
    # Single-GPU entry point; the shared run_test helper does the actual
    # HF-vs-vLLM comparison.
    run_test(
        hf_runner,
        vllm_runner,
        image_assets,
        model,
        size_factors=size_factors,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("height_and_width_and_result", [(1669, 2560, 2144),
|
||||
(183, 488, 776)])
|
||||
def test_image_feature_size(height_and_width_and_result):
|
||||
# Avoid initializing CUDA too early in distributed tests
|
||||
from vllm.model_executor.models.llava_next import (
|
||||
get_llava_next_image_feature_size)
|
||||
|
||||
height, width, result = height_and_width_and_result
|
||||
config = AutoConfig.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
|
||||
assert get_llava_next_image_feature_size(config,
|
||||
|
||||
Reference in New Issue
Block a user