[Bugfix] Fix broadcasting logic for multi_modal_kwargs (#6836)
This commit is contained in:
@@ -19,10 +19,10 @@ from vllm.utils import cuda_device_count_stateless
|
||||
|
||||
model = os.environ["TEST_DIST_MODEL"]
|
||||
|
||||
if model.startswith("llava-hf/llava"):
|
||||
if model.startswith("llava-hf/llava-1.5"):
|
||||
from ..models.test_llava import models, run_test
|
||||
elif model.startswith("microsoft/Phi-3-vision"):
|
||||
from ..models.test_phi3v import models, run_test
|
||||
elif model.startswith("llava-hf/llava-v1.6"):
|
||||
from ..models.test_llava_next import models, run_test
|
||||
else:
|
||||
raise NotImplementedError(f"Unsupported model: {model}")
|
||||
|
||||
@@ -45,7 +45,8 @@ def test_models(hf_runner, vllm_runner, image_assets,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model=models[0],
|
||||
size_factors=[1.0],
|
||||
# So that LLaVA-NeXT processor may return nested list
|
||||
size_factors=[0.25, 0.5, 1.0],
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
|
||||
@@ -1,57 +0,0 @@
|
||||
from typing import Any, Dict
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.distributed.parallel_state import (_split_tensor_dict,
|
||||
_update_nested_dict)
|
||||
|
||||
|
||||
def test_split_tensor_dict():
    """_split_tensor_dict flattens a nested dict into (metadata, tensors).

    Tensor leaves are pulled out into ``tensor_list`` in traversal order;
    every leaf (tensor or not) contributes one metadata entry.
    """
    sample = {
        "key_a": "a",
        "key_b": torch.arange(8, dtype=torch.float32),
        "key_c": {
            "key_1": torch.arange(5, dtype=torch.float32),
            "key_2": torch.tensor([], dtype=torch.float32),
            "key_3": 123,
        },
        "key_d": {},
    }

    metadata_list, tensor_list = _split_tensor_dict(sample)

    # 6 leaves: key_a, key_b, key_c%key_1, key_c%key_2, key_c%key_3, key_d
    assert len(metadata_list) == 6

    expected_tensors = [
        sample["key_b"],
        sample["key_c"]["key_1"],
        sample["key_c"]["key_2"],
    ]
    for extracted, expected in zip(tensor_list, expected_tensors):
        assert torch.allclose(extracted, expected)
|
||||
|
||||
def test_split_tensor_dict_invalid_key():
    """Keys containing the '%' delimiter must be rejected.

    '%' is reserved as the separator for flattened nested keys, so a
    top-level key embedding it would be ambiguous on reconstruction.
    """
    with pytest.raises(AssertionError):
        _split_tensor_dict({"a%b": "a"})
|
||||
|
||||
def test_update_nested_dict():
    """_update_nested_dict rebuilds nesting from '%'-delimited flat keys."""
    flattened_items = [
        ("key1%key2%key3", "value1"),
        ("key1%key2%key4", "value2"),
        ("key1%key5", "value3"),
        ("key6%key7", "value4"),
        ("key8", "value5"),
    ]

    result: Dict[str, Any] = {}
    for flat_key, value in flattened_items:
        _update_nested_dict(result, flat_key, value)

    # Sibling flat keys sharing a prefix must merge into one sub-dict.
    assert result == {
        "key1": {
            "key2": {
                "key3": "value1",
                "key4": "value2",
            },
            "key5": "value3",
        },
        "key6": {
            "key7": "value4",
        },
        "key8": "value5",
    }
||||
@@ -1,14 +1,12 @@
|
||||
from typing import List, Optional, Tuple
|
||||
from typing import List, Optional, Tuple, Type
|
||||
|
||||
import pytest
|
||||
from transformers import AutoConfig, AutoTokenizer
|
||||
|
||||
from vllm.model_executor.models.llava_next import (
|
||||
get_llava_next_image_feature_size)
|
||||
from vllm.multimodal.utils import rescale_image_size
|
||||
from vllm.sequence import SampleLogprobs
|
||||
|
||||
from ..conftest import IMAGE_ASSETS
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
from .utils import check_logprobs_close
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
@@ -27,6 +25,8 @@ HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
|
||||
IMAGE_TOKEN_ID = 32000
|
||||
|
||||
models = ["llava-hf/llava-v1.6-vicuna-7b-hf"]
|
||||
|
||||
|
||||
def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
||||
Optional[SampleLogprobs]],
|
||||
@@ -50,34 +50,19 @@ def vllm_to_hf_output(vllm_output: Tuple[List[int], str,
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", ["llava-hf/llava-v1.6-vicuna-7b-hf"])
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
# No image
|
||||
[],
|
||||
# Single-scale
|
||||
[1.0],
|
||||
# Single-scale, batched
|
||||
[1.0, 1.0, 1.0],
|
||||
# Multi-scale
|
||||
[0.25, 0.5, 1.0],
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
dtype, max_tokens, num_logprobs) -> None:
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
For huggingface runner, we provide the PIL images as input.
|
||||
For vllm runner, we provide MultiModalDataDict objects
|
||||
and corresponding vision language config as input.
|
||||
Note, the text input is also adjusted to abide by vllm contract.
|
||||
The text output is sanitized to be able to compare with hf.
|
||||
"""
|
||||
def run_test(
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
image_assets: _ImageAssets,
|
||||
model: str,
|
||||
*,
|
||||
size_factors: List[float],
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_image = [(
|
||||
@@ -89,6 +74,8 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
max_model_len=4096,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_outputs_per_image = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
@@ -122,9 +109,54 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
@pytest.mark.parametrize(
    "size_factors",
    [
        # No image
        [],
        # Single-scale
        [1.0],
        # Single-scale, batched
        [1.0, 1.0, 1.0],
        # Multi-scale
        [0.25, 0.5, 1.0],
    ],
)
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
                dtype, max_tokens, num_logprobs) -> None:
    """Inference result should be the same between hf and vllm.

    All the image fixtures for the test is under tests/images.
    For huggingface runner, we provide the PIL images as input.
    For vllm runner, we provide MultiModalDataDict objects
    and corresponding vision language config as input.
    Note, the text input is also adjusted to abide by vllm contract.
    The text output is sanitized to be able to compare with hf.
    """
    # Single-GPU entry point; the shared run_test helper does the actual
    # HF-vs-vLLM comparison.
    run_test(
        hf_runner,
        vllm_runner,
        image_assets,
        model,
        size_factors=size_factors,
        dtype=dtype,
        max_tokens=max_tokens,
        num_logprobs=num_logprobs,
        tensor_parallel_size=1,
    )
|
||||
|
||||
|
||||
@pytest.mark.parametrize("height_and_width_and_result", [(1669, 2560, 2144),
|
||||
(183, 488, 776)])
|
||||
def test_image_feature_size(height_and_width_and_result):
|
||||
# Avoid initializing CUDA too early in distributed tests
|
||||
from vllm.model_executor.models.llava_next import (
|
||||
get_llava_next_image_feature_size)
|
||||
|
||||
height, width, result = height_and_width_and_result
|
||||
config = AutoConfig.from_pretrained("llava-hf/llava-v1.6-mistral-7b-hf")
|
||||
assert get_llava_next_image_feature_size(config,
|
||||
|
||||
Reference in New Issue
Block a user