[CI/Build] Add TP test for vision models (#5892)
This commit is contained in:
@@ -1,4 +1,4 @@
|
||||
from typing import List, Tuple
|
||||
from typing import List, Optional, Tuple, Type
|
||||
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
@@ -6,7 +6,7 @@ from transformers import AutoTokenizer
|
||||
from vllm.config import VisionLanguageConfig
|
||||
from vllm.utils import is_cpu
|
||||
|
||||
from ..conftest import IMAGE_ASSETS
|
||||
from ..conftest import IMAGE_ASSETS, HfRunner, VllmRunner, _ImageAssets
|
||||
|
||||
pytestmark = pytest.mark.vlm
|
||||
|
||||
@@ -73,17 +73,17 @@ if is_cpu():
|
||||
target_dtype = "bfloat16"
|
||||
|
||||
|
||||
# TODO: Add test for `tensor_parallel_size` [ref: PR #3883]
|
||||
# Since we use _attn_implementation="eager" for hf_runner, here is
|
||||
# numeric difference for longer context and test can't pass
|
||||
@pytest.mark.xfail(
|
||||
reason="Inconsistent image processor being used due to lack "
|
||||
"of support for dynamic image token replacement")
|
||||
@pytest.mark.parametrize("model_and_config", model_and_vl_config)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
|
||||
dtype: str, max_tokens: int) -> None:
|
||||
def run_test(
|
||||
hf_runner: Type[HfRunner],
|
||||
vllm_runner: Type[VllmRunner],
|
||||
image_assets: _ImageAssets,
|
||||
model_and_config: Tuple[str, VisionLanguageConfig],
|
||||
*,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
tensor_parallel_size: int,
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
):
|
||||
"""Inference result should be the same between hf and vllm.
|
||||
|
||||
All the image fixtures for the test is under tests/images.
|
||||
@@ -116,7 +116,9 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
|
||||
with vllm_runner(model_id,
|
||||
max_model_len=2048,
|
||||
dtype=dtype,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
enforce_eager=True,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
**vlm_config.as_cli_args_dict()) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy(vllm_image_prompts,
|
||||
max_tokens,
|
||||
@@ -130,3 +132,24 @@ def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
|
||||
f"Test{i}:\nHF: {hf_output_str!r}\nvLLM: {vllm_output_str!r}")
|
||||
assert hf_output_ids == vllm_output_ids, (
|
||||
f"Test{i}:\nHF: {hf_output_ids}\nvLLM: {vllm_output_ids}")
|
||||
|
||||
|
||||
# Since we use _attn_implementation="eager" for hf_runner, here is
|
||||
# numeric difference for longer context and test can't pass
|
||||
@pytest.mark.xfail(
|
||||
reason="Inconsistent image processor being used due to lack "
|
||||
"of support for dynamic image token replacement")
|
||||
@pytest.mark.parametrize("model_and_config", model_and_vl_config)
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
def test_models(hf_runner, vllm_runner, image_assets, model_and_config,
|
||||
dtype: str, max_tokens: int) -> None:
|
||||
run_test(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model_and_config,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
tensor_parallel_size=1,
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user