[V0 Deprecation] Enable the remaining multimodal tests in V1 (#25307)

Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
Cyrus Leung
2025-09-21 01:50:58 +08:00
committed by GitHub
parent d88918e4c2
commit bef180f009
8 changed files with 195 additions and 214 deletions

View File

@@ -32,13 +32,6 @@ from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
if current_platform.is_rocm():
os.environ["VLLM_USE_TRITON_FLASH_ATTN"] = "0"
REQUIRES_V0_MODELS = [
# V1 Test: not enough KV cache space in C1.
"fuyu",
# V1 Test: Deadlock issue when processing mm_inputs
"llava-onevision-transformers",
]
# yapf: disable
COMMON_BROADCAST_SETTINGS = {
"test_type": VLMTestType.IMAGE,
@@ -186,8 +179,11 @@ VLM_TEST_SETTINGS = {
image_size_factors=[(0.25, 0.5, 1.0)],
vllm_runner_kwargs={
"model_impl": "transformers",
"default_torch_num_threads": 1,
},
marks=[pytest.mark.core_model],
# FIXME: Investigate why the test hangs
# when processing the 3rd prompt in vLLM
marks=[pytest.mark.core_model, pytest.mark.skip(reason="Test hangs")],
),
"idefics3-transformers": VLMTestInfo(
models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
@@ -320,6 +316,7 @@ VLM_TEST_SETTINGS = {
vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
num_logprobs=10,
image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
marks=[large_gpu_mark(min_gb=32)],
),
"gemma3": VLMTestInfo(
models=["google/gemma-3-4b-it"],
@@ -861,13 +858,14 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
test_type=VLMTestType.IMAGE,
create_new_process_for_each_test=False,
))
def test_single_image_models(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
def test_single_image_models(
tmp_path: PosixPath,
model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_single_image_test(
tmp_path=tmp_path,
@@ -886,13 +884,14 @@ def test_single_image_models(tmp_path: PosixPath, model_type: str,
test_type=VLMTestType.MULTI_IMAGE,
create_new_process_for_each_test=False,
))
def test_multi_image_models(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
def test_multi_image_models(
tmp_path: PosixPath,
model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_multi_image_test(
tmp_path=tmp_path,
@@ -911,13 +910,13 @@ def test_multi_image_models(tmp_path: PosixPath, model_type: str,
test_type=VLMTestType.EMBEDDING,
create_new_process_for_each_test=False,
))
def test_image_embedding_models(model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
def test_image_embedding_models(
model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_embedding_test(
model_test_info=model_test_info,
@@ -935,11 +934,13 @@ def test_image_embedding_models(model_type: str,
test_type=VLMTestType.VIDEO,
create_new_process_for_each_test=False,
))
def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
video_assets: VideoTestAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
def test_video_models(
model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
video_assets: VideoTestAssets,
):
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_video_test(
model_test_info=model_test_info,
@@ -957,11 +958,13 @@ def test_video_models(model_type: str, test_case: ExpandableVLMTestArgs,
test_type=VLMTestType.AUDIO,
create_new_process_for_each_test=False,
))
def test_audio_models(model_type: str, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner], vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
def test_audio_models(
model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets,
):
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_audio_test(
model_test_info=model_test_info,
@@ -984,10 +987,7 @@ def test_custom_inputs_models(
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
monkeypatch,
):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_custom_inputs_test(
model_test_info=model_test_info,
@@ -1006,13 +1006,14 @@ def test_custom_inputs_models(
create_new_process_for_each_test=True,
))
@create_new_process_for_each_test()
def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
def test_single_image_models_heavy(
tmp_path: PosixPath,
model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_single_image_test(
tmp_path=tmp_path,
@@ -1032,13 +1033,14 @@ def test_single_image_models_heavy(tmp_path: PosixPath, model_type: str,
create_new_process_for_each_test=True,
))
@create_new_process_for_each_test()
def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
def test_multi_image_models_heavy(
tmp_path: PosixPath,
model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_multi_image_test(
tmp_path=tmp_path,
@@ -1058,14 +1060,13 @@ def test_multi_image_models_heavy(tmp_path: PosixPath, model_type: str,
create_new_process_for_each_test=True,
))
@create_new_process_for_each_test()
def test_image_embedding_models_heavy(model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
def test_image_embedding_models_heavy(
model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_embedding_test(
model_test_info=model_test_info,
@@ -1083,12 +1084,13 @@ def test_image_embedding_models_heavy(model_type: str,
test_type=VLMTestType.VIDEO,
create_new_process_for_each_test=True,
))
def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
video_assets: VideoTestAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
def test_video_models_heavy(
model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
video_assets: VideoTestAssets,
):
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_video_test(
model_test_info=model_test_info,
@@ -1106,12 +1108,13 @@ def test_video_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
test_type=VLMTestType.AUDIO,
create_new_process_for_each_test=True,
))
def test_audio_models_heavy(model_type: str, test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets, monkeypatch):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
def test_audio_models_heavy(
model_type: str,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets,
):
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_audio_test(
model_test_info=model_test_info,
@@ -1135,10 +1138,7 @@ def test_custom_inputs_models_heavy(
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
monkeypatch,
):
if model_type in REQUIRES_V0_MODELS:
monkeypatch.setenv("VLLM_USE_V1", "0")
model_test_info = VLM_TEST_SETTINGS[model_type]
runners.run_custom_inputs_test(
model_test_info=model_test_info,

View File

@@ -12,13 +12,12 @@ from mistral_common.tokens.tokenizers.mistral import MistralTokenizer
from mistral_common.tokens.tokenizers.multimodal import image_from_chunk
from transformers import AutoProcessor
from vllm import RequestOutput, SamplingParams, TextPrompt, TokensPrompt
from vllm import SamplingParams, TextPrompt, TokensPrompt
from vllm.multimodal import MultiModalDataBuiltins
from vllm.multimodal.inputs import PlaceholderRange
from vllm.sequence import Logprob, SampleLogprobs
from ....utils import VLLM_PATH, large_gpu_test
from ...utils import check_logprobs_close, dummy_hf_overrides
from ...utils import check_logprobs_close
if TYPE_CHECKING:
from _typeshed import StrPath
@@ -185,47 +184,3 @@ def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
outputs_1_lst=logprobs,
name_0="h100_ref",
name_1="output")
@pytest.mark.parametrize(
"image_urls,expected_ranges",
[(IMG_URLS[:1], [PlaceholderRange(offset=11, length=494)]),
(IMG_URLS[1:4], [
PlaceholderRange(offset=11, length=266),
PlaceholderRange(offset=277, length=1056),
PlaceholderRange(offset=1333, length=418)
])])
def test_multi_modal_placeholders(vllm_runner, image_urls: list[str],
expected_ranges: list[PlaceholderRange],
local_asset_server, monkeypatch) -> None:
local_image_urls = [local_asset_server.url_for(u) for u in image_urls]
prompt = _create_engine_inputs_hf(local_image_urls)
# This placeholder checking test only works with V0 engine
# where `multi_modal_placeholders` is returned with `RequestOutput`
monkeypatch.setenv("VLLM_USE_V1", "0")
with vllm_runner(
"mistral-community/pixtral-12b",
max_model_len=8192,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
load_format="dummy",
hf_overrides=dummy_hf_overrides,
) as vllm_model:
outputs = vllm_model.llm.generate(prompt)
assert len(outputs) == 1, f"{len(outputs)=}"
output: RequestOutput = outputs[0]
assert hasattr(output,
"multi_modal_placeholders"), f"{output.__dict__=}"
assert "image" in output.multi_modal_placeholders, \
f"{output.multi_modal_placeholders.keys()=}"
image_placeholder_ranges: list[
PlaceholderRange] = output.multi_modal_placeholders["image"]
assert len(image_placeholder_ranges) == len(
expected_ranges), f"{image_placeholder_ranges=}"
for real_range, expected_range in zip(image_placeholder_ranges,
expected_ranges):
assert real_range.offset == expected_range.offset, \
f"{real_range=} {expected_range=}"
assert real_range.length == expected_range.length, \
f"{real_range=} {expected_range=}"

View File

@@ -10,7 +10,6 @@ from PIL import Image
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
from vllm.utils import set_default_torch_num_threads
from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
PromptVideoInput, VllmRunner)
@@ -264,8 +263,7 @@ def run_embedding_input_test(
processor = AutoProcessor.from_pretrained(model)
# max_model_len should be greater than image_feature_size
with set_default_torch_num_threads(1):
vllm_model = vllm_runner(
with vllm_runner(
model,
runner="generate",
max_model_len=4000,
@@ -277,9 +275,8 @@ def run_embedding_input_test(
},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
)
with vllm_model:
default_torch_num_threads=1,
) as vllm_model:
outputs_per_case_for_original_input = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,