Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2025-10-05 15:06:22 +01:00
committed by GitHub
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Helpers for building inputs that can be leveraged for different test types.
"""
"""Helpers for building inputs that can be leveraged for different test types."""
from collections.abc import Iterable
from pathlib import PosixPath
from typing import Callable, Optional, Union
@@ -10,20 +10,30 @@ import torch
from vllm.multimodal.audio import AudioResampler
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (rescale_video_size, resize_video,
sample_frames_from_video)
from vllm.multimodal.video import (
rescale_video_size,
resize_video,
sample_frames_from_video,
)
from .....conftest import AudioTestAssets, ImageTestAssets, VideoTestAssets
from .types import (SINGLE_AUDIO_BASE_PROMPT, SINGLE_IMAGE_BASE_PROMPTS,
TEST_AUDIO_PLACEHOLDER, TEST_IMG_PLACEHOLDER,
TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT,
ImageSizeWrapper, PromptWithMultiModalInput, SizeType,
VLMTestInfo)
from .types import (
SINGLE_AUDIO_BASE_PROMPT,
SINGLE_IMAGE_BASE_PROMPTS,
TEST_AUDIO_PLACEHOLDER,
TEST_IMG_PLACEHOLDER,
TEST_VIDEO_PLACEHOLDER,
VIDEO_BASE_PROMPT,
ImageSizeWrapper,
PromptWithMultiModalInput,
SizeType,
VLMTestInfo,
)
def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int],
str],
test_placeholder: str) -> str:
def replace_test_placeholder(
prompt: str, mm_idx_to_prompt: Callable[[int], str], test_placeholder: str
) -> str:
"""Given a prompt, replaces each test placeholder with the
model-specific tag.
"""
@@ -35,11 +45,13 @@ def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int],
return img_prompt
def get_model_prompts(base_prompts: Iterable[str],
img_idx_to_prompt: Optional[Callable[[int], str]],
video_idx_to_prompt: Optional[Callable[[int], str]],
audio_idx_to_prompt: Optional[Callable[[int], str]],
prompt_formatter: Callable[[str], str]) -> list[str]:
def get_model_prompts(
base_prompts: Iterable[str],
img_idx_to_prompt: Optional[Callable[[int], str]],
video_idx_to_prompt: Optional[Callable[[int], str]],
audio_idx_to_prompt: Optional[Callable[[int], str]],
prompt_formatter: Callable[[str], str],
) -> list[str]:
"""Given a model-agnostic base prompt and test configuration for a model(s)
to be tested, update the media placeholders and apply the prompt formatting
to get the test prompt string for this model.
@@ -56,19 +68,19 @@ def get_model_prompts(base_prompts: Iterable[str],
# Replace the multimodal placeholders in the base prompt with
# the correct ones for the model that we are testing
if img_idx_to_prompt:
base_prompt = replace_test_placeholder(base_prompt,
img_idx_to_prompt,
TEST_IMG_PLACEHOLDER)
base_prompt = replace_test_placeholder(
base_prompt, img_idx_to_prompt, TEST_IMG_PLACEHOLDER
)
if video_idx_to_prompt:
base_prompt = replace_test_placeholder(base_prompt,
video_idx_to_prompt,
TEST_VIDEO_PLACEHOLDER)
base_prompt = replace_test_placeholder(
base_prompt, video_idx_to_prompt, TEST_VIDEO_PLACEHOLDER
)
if audio_idx_to_prompt:
base_prompt = replace_test_placeholder(base_prompt,
audio_idx_to_prompt,
TEST_AUDIO_PLACEHOLDER)
base_prompt = replace_test_placeholder(
base_prompt, audio_idx_to_prompt, TEST_AUDIO_PLACEHOLDER
)
# Apply the prompt formatter to wrap the base prompt with
# the correct media placeholders to get the model test prompt
@@ -84,14 +96,15 @@ def build_single_image_inputs_from_test_info(
tmp_path: Optional[PosixPath] = None,
) -> list[PromptWithMultiModalInput]:
if test_info.prompt_formatter is None:
raise ValueError(
"Prompt formatter must be set to build single image inputs")
raise ValueError("Prompt formatter must be set to build single image inputs")
model_prompts = get_model_prompts(test_info.single_image_prompts,
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter)
model_prompts = get_model_prompts(
test_info.single_image_prompts,
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
# For models that require a local path / URL encoded in the image; export
# assets and encode into tmp_path for this test. This should be avoided
@@ -110,8 +123,8 @@ def build_single_image_inputs_from_test_info(
def build_single_image_inputs(
images, model_prompts,
size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
images, model_prompts, size_wrapper: ImageSizeWrapper
) -> list[PromptWithMultiModalInput]:
# For every image / prompt pair, get a pair containing two lists of
# length size_factors, where the first contains duplicates of the model
# prompt [str], and the second contains copies of the image after being
@@ -125,7 +138,8 @@ def build_single_image_inputs(
apply_image_size_scaling(image, size, size_wrapper.type)
for size in size_wrapper.data
],
) for image, prompt in zip(images, model_prompts)
)
for image, prompt in zip(images, model_prompts)
]
@@ -136,14 +150,15 @@ def build_multi_image_inputs_from_test_info(
tmp_path: Optional[PosixPath] = None,
) -> list[PromptWithMultiModalInput]:
if test_info.prompt_formatter is None:
raise ValueError(
"Prompt formatter must be set to build multi image inputs")
raise ValueError("Prompt formatter must be set to build multi image inputs")
model_prompts = get_model_prompts([test_info.multi_image_prompt],
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter)
model_prompts = get_model_prompts(
[test_info.multi_image_prompt],
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
if test_info.prompt_path_encoder is not None:
if tmp_path is None:
@@ -164,16 +179,20 @@ def build_multi_image_inputs_from_test_info(
def build_multi_image_inputs(
image_lists, model_prompts,
size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
image_lists, model_prompts, size_wrapper: ImageSizeWrapper
) -> list[PromptWithMultiModalInput]:
return [
PromptWithMultiModalInput(
prompts=[prompt for _ in size_wrapper.data],
image_data=[[
apply_image_size_scaling(image, size, size_wrapper.type)
for image in images
] for size in size_wrapper.data],
) for images, prompt in zip(image_lists, model_prompts)
image_data=[
[
apply_image_size_scaling(image, size, size_wrapper.type)
for image in images
]
for size in size_wrapper.data
],
)
for images, prompt in zip(image_lists, model_prompts)
]
@@ -185,10 +204,10 @@ def build_embedding_inputs_from_test_info(
# These conditions will always be true if invoked through filtering,
# but we still check them in case this is ever called directly
if test_info.prompt_formatter is None:
raise ValueError(
"Prompt formatter must be set to build image embedding inputs")
if size_wrapper.type != SizeType.SIZE_FACTOR or not \
all(factor == 1.0 for factor in size_wrapper.data):
raise ValueError("Prompt formatter must be set to build image embedding inputs")
if size_wrapper.type != SizeType.SIZE_FACTOR or not all(
factor == 1.0 for factor in size_wrapper.data
):
raise ValueError("Embedding tests require constant (1.0) size factors")
if test_info.convert_assets_to_embeddings is None:
raise ValueError("No conversion func for getting embeddings found")
@@ -209,8 +228,7 @@ def build_embedding_inputs_from_test_info(
assert len(images) == len(model_prompts)
inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
vllm_embeddings = build_single_image_inputs(embeds, model_prompts,
size_wrapper)
vllm_embeddings = build_single_image_inputs(embeds, model_prompts, size_wrapper)
return inputs, vllm_embeddings
@@ -235,21 +253,22 @@ def build_video_inputs_from_test_info(
for asset in video_assets
]
video_scaler = (resize_video if size_wrapper.type == SizeType.FIXED_SIZE
else rescale_video_size)
video_scaler = (
resize_video if size_wrapper.type == SizeType.FIXED_SIZE else rescale_video_size
)
return [
PromptWithMultiModalInput(
prompts=[prompt for _ in size_wrapper.data],
video_data=[
video_scaler(video, size) for size in size_wrapper.data
],
) for video, prompt in zip(sampled_vids, model_prompts)
video_data=[video_scaler(video, size) for size in size_wrapper.data],
)
for video, prompt in zip(sampled_vids, model_prompts)
]
def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
size_type: SizeType):
def apply_image_size_scaling(
image, size: Union[float, tuple[int, int]], size_type: SizeType
):
"""Applies a size scaler to one image; this can be an image size factor,
which scales the image while maintaining the aspect ratio"""
# Special case for embeddings; if it's a tensor, it's only valid if we
@@ -285,13 +304,16 @@ def build_audio_inputs_from_test_info(
method="librosa",
)
audios = [asset.audio_and_sample_rate for asset in audio_assets]
resampled_audios = [(
resampler.resample(
audio,
orig_sr=sr,
),
int(resampler.target_sr),
) for audio, sr in audios]
resampled_audios = [
(
resampler.resample(
audio,
orig_sr=sr,
),
int(resampler.target_sr),
)
for audio, sr in audios
]
return [
PromptWithMultiModalInput(

View File

@@ -4,19 +4,28 @@
modality, getting all combinations (similar to pytest's parametrization),
handling multimodal placeholder substitution, and so on.
"""
import itertools
from collections import OrderedDict
from collections.abc import Iterable
import pytest
from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
from .types import (
EMBEDDING_SIZE_FACTORS,
ExpandableVLMTestArgs,
ImageSizeWrapper,
SizeType,
VLMTestInfo,
VLMTestType,
)
def get_filtered_test_settings(
test_settings: dict[str, VLMTestInfo], test_type: VLMTestType,
new_proc_per_test: bool) -> dict[str, VLMTestInfo]:
test_settings: dict[str, VLMTestInfo],
test_type: VLMTestType,
new_proc_per_test: bool,
) -> dict[str, VLMTestInfo]:
"""Given the dict of potential test settings to run, return a subdict
of tests who have the current test type enabled with the matching val for
fork_per_test.
@@ -25,7 +34,8 @@ def get_filtered_test_settings(
def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType):
return test_info.test_type == test_type or (
isinstance(test_info.test_type, Iterable)
and test_type in test_info.test_type)
and test_type in test_info.test_type
)
matching_tests = {}
for test_name, test_info in test_settings.items():
@@ -36,62 +46,69 @@ def get_filtered_test_settings(
assert test_info.convert_assets_to_embeddings is not None
# Custom test inputs need to explicitly define the mm limit/inputs
if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS):
assert (test_info.custom_test_opts is not None
and isinstance(test_info.custom_test_opts, Iterable))
assert test_info.custom_test_opts is not None and isinstance(
test_info.custom_test_opts, Iterable
)
# For all types besides custom inputs, we need a prompt formatter
else:
assert test_info.prompt_formatter is not None
# Everything looks okay; keep if this is correct proc handling
if (test_info.distributed_executor_backend
is not None) == new_proc_per_test:
if (
test_info.distributed_executor_backend is not None
) == new_proc_per_test:
matching_tests[test_name] = test_info
return matching_tests
def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
test_type: VLMTestType,
create_new_process_for_each_test: bool):
def get_parametrized_options(
test_settings: dict[str, VLMTestInfo],
test_type: VLMTestType,
create_new_process_for_each_test: bool,
):
"""Converts all of our VLMTestInfo into an expanded list of parameters.
This is similar to nesting pytest parametrize calls, but done directly
through an itertools product so that each test can set things like
size factors etc, while still running in isolated test cases.
"""
matching_tests = get_filtered_test_settings(
test_settings, test_type, create_new_process_for_each_test)
test_settings, test_type, create_new_process_for_each_test
)
# Ensure that something is wrapped as an iterable it's not already
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,)
def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
# This is essentially the same as nesting a bunch of mark.parametrize
# decorators, but we do it programmatically to allow overrides for on
# a per-model basis, while still being able to execute each of these
# as individual test cases in pytest.
iter_kwargs = OrderedDict([
("model", ensure_wrapped(test_info.models)),
("max_tokens", ensure_wrapped(test_info.max_tokens)),
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
("dtype", ensure_wrapped(test_info.dtype)),
("distributed_executor_backend",
ensure_wrapped(test_info.distributed_executor_backend)),
])
iter_kwargs = OrderedDict(
[
("model", ensure_wrapped(test_info.models)),
("max_tokens", ensure_wrapped(test_info.max_tokens)),
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
("dtype", ensure_wrapped(test_info.dtype)),
(
"distributed_executor_backend",
ensure_wrapped(test_info.distributed_executor_backend),
),
]
)
# num_frames is video only
if test_type == VLMTestType.VIDEO:
iter_kwargs["num_video_frames"] = ensure_wrapped(
test_info.num_video_frames)
iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
# No sizes passed for custom inputs, since inputs are directly provided
if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO):
wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
if wrapped_sizes is None:
raise ValueError(
f"Sizes must be set for test type {test_type}")
raise ValueError(f"Sizes must be set for test type {test_type}")
iter_kwargs["size_wrapper"] = wrapped_sizes
#Otherwise expand the custom test options instead
# Otherwise expand the custom test options instead
elif test_type == VLMTestType.CUSTOM_INPUTS:
if test_info.custom_test_opts is None:
raise ValueError("Test has type CUSTOM_INPUTS, but none given")
@@ -121,8 +138,8 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
def get_wrapped_test_sizes(
test_info: VLMTestInfo,
test_type: VLMTestType) -> tuple[ImageSizeWrapper, ...]:
test_info: VLMTestInfo, test_type: VLMTestType
) -> tuple[ImageSizeWrapper, ...]:
"""Given a test info which may have size factors or fixed sizes, wrap them
and combine them into an iterable, each of which will be used in parameter
expansion.
@@ -133,18 +150,18 @@ def get_wrapped_test_sizes(
"""
# If it is an embedding test, we always use the EMBEDDING_SIZE_FACTORS
if test_type == VLMTestType.EMBEDDING:
return tuple([
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
for factor in EMBEDDING_SIZE_FACTORS
])
return tuple(
[
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
for factor in EMBEDDING_SIZE_FACTORS
]
)
# Audio and Custom inputs have preprocessed inputs
elif test_type in (VLMTestType.AUDIO, VLMTestType.CUSTOM_INPUTS):
return tuple()
size_factors = test_info.image_size_factors \
if test_info.image_size_factors else []
fixed_sizes = test_info.image_sizes \
if test_info.image_sizes else []
size_factors = test_info.image_size_factors if test_info.image_size_factors else []
fixed_sizes = test_info.image_sizes if test_info.image_sizes else []
wrapped_factors = [
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
@@ -152,8 +169,7 @@ def get_wrapped_test_sizes(
]
wrapped_sizes = [
ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size)
for size in fixed_sizes
ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size) for size in fixed_sizes
]
return tuple(wrapped_factors + wrapped_sizes)

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Core test implementation to be shared across modalities."""
from typing import Any, Callable, Optional
import torch
@@ -70,22 +71,23 @@ def run_test(
if model_info.hf_overrides:
vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
if model_info.skip_tokenizer_init:
vllm_runner_kwargs_[
"skip_tokenizer_init"] = model_info.skip_tokenizer_init
vllm_runner_kwargs_["skip_tokenizer_init"] = model_info.skip_tokenizer_init
if vllm_runner_kwargs:
vllm_runner_kwargs_.update(vllm_runner_kwargs)
with vllm_runner(model,
max_model_len=max_model_len,
max_num_seqs=max_num_seqs,
dtype=dtype,
limit_mm_per_prompt=limit_mm_per_prompt,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=enforce_eager,
runner=runner,
**vllm_runner_kwargs_) as vllm_model:
with vllm_runner(
model,
max_model_len=max_model_len,
max_num_seqs=max_num_seqs,
dtype=dtype,
limit_mm_per_prompt=limit_mm_per_prompt,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=enforce_eager,
runner=runner,
**vllm_runner_kwargs_,
) as vllm_model:
tokenizer = vllm_model.llm.get_tokenizer()
vllm_kwargs: dict[str, Any] = {}
@@ -95,21 +97,19 @@ def run_test(
vllm_kwargs["stop"] = stop_str
for prompts, image_data, video_data, audio_data in vllm_inputs:
mm_data = dict(images=image_data,
videos=video_data,
audios=audio_data)
mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
vllm_kwargs_with_mm_data = vllm_kwargs | mm_data
vllm_output = vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
**vllm_kwargs_with_mm_data)
**vllm_kwargs_with_mm_data,
)
vllm_outputs_per_mm.append(vllm_output)
hf_model = hf_runner(model,
dtype=dtype,
auto_cls=auto_cls,
model_kwargs=hf_model_kwargs)
hf_model = hf_runner(
model, dtype=dtype, auto_cls=auto_cls, model_kwargs=hf_model_kwargs
)
# Some models need to patch things like the model processor, e.g., internvl
if patch_hf_runner is not None:
@@ -129,16 +129,15 @@ def run_test(
hf_kwargs["stop_strings"] = stop_str
for prompts, image_data, video_data, audio_data in inputs:
mm_data = dict(images=image_data,
videos=video_data,
audios=audio_data)
mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
hf_kwargs_with_mm_data = hf_kwargs | mm_data
hf_output = hf_model.generate_greedy_logprobs_limit(
prompts,
max_tokens,
num_logprobs=num_logprobs,
tokenizer=tokenizer,
**hf_kwargs_with_mm_data)
**hf_kwargs_with_mm_data,
)
hf_outputs_per_mm.append(hf_output)
# Apply output processing / sanitation to the vLLM and HF runner results
@@ -150,8 +149,7 @@ def run_test(
second_runner_processor=vllm_output_post_proc,
)
for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm,
vllm_outputs_per_mm):
for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm, vllm_outputs_per_mm):
# This is usually check_logprobs_close, but it's passed through to
# allow things like check_outputs_equal where needed
comparator(
@@ -171,15 +169,19 @@ def process_runner_outputs(
):
"""Applies the runner processor(s) to the runner outputs, if any."""
if first_runner_processor is not None:
first_runner_outputs = process_outputs(first_runner_processor, model,
first_runner_outputs)
first_runner_outputs = process_outputs(
first_runner_processor, model, first_runner_outputs
)
if second_runner_processor is not None:
second_runner_outputs = process_outputs(second_runner_processor, model,
second_runner_outputs)
second_runner_outputs = process_outputs(
second_runner_processor, model, second_runner_outputs
)
return first_runner_outputs, second_runner_outputs
def process_outputs(output_processor, model, outputs_per_image):
"""Applies a model specific post-processor function to a runner's output"""
return [[output_processor(res, model) for res in outputs]
for outputs in outputs_per_image]
return [
[output_processor(res, model) for res in outputs]
for outputs in outputs_per_image
]

View File

@@ -1,12 +1,16 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Custom input builders for edge-cases in different models."""
from typing import Callable
from vllm.assets.image import ImageAsset
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (rescale_video_size, resize_video,
sample_frames_from_video)
from vllm.multimodal.video import (
rescale_video_size,
resize_video,
sample_frames_from_video,
)
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
from .builders import build_multi_image_inputs, build_single_image_inputs
@@ -15,7 +19,7 @@ from .types import ImageSizeWrapper, PromptWithMultiModalInput, SizeType
def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
"""Builds inputs for multi-image (varied sizes/aspect ratio) testing.
Args:
formatter: model-specific prompt formatter.
"""
@@ -41,7 +45,7 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
stop_sign,
rescale_image_size(stop_sign, 0.25),
cherry_blossom.resize((183, 488)),
cherry_blossom.resize((488, 183))
cherry_blossom.resize((488, 183)),
],
cherry_blossom,
]
@@ -54,10 +58,11 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
]
def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
num_frames: int = 16):
def multi_video_multi_aspect_ratio_inputs(
formatter: Callable[[str], str], num_frames: int = 16
):
"""Builds inputs for multi-video (varied sizes/aspect ratio) testing.
Args:
formatter: model-specific prompt formatter.
"""
@@ -81,7 +86,7 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
video,
rescale_video_size(video, 0.25),
resize_video(video, (183, 488)),
resize_video(video, (488, 183))
resize_video(video, (488, 183)),
],
video,
]
@@ -96,7 +101,9 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
def different_patch_input_cases_internvl():
images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
formatter = lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
formatter = (
lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n"
) # noqa: E501
single_img_prompts = [
"<image>\nWhat's the content in the center of the image?",
"<image>\nWhat is the season?",
@@ -115,14 +122,14 @@ def different_patch_input_cases_internvl():
def windows_attention_image_qwen2_5_vl():
# image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501
image = ImageAsset("hato").pil_image
question = "Describe the image."
img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
prompt = (f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n")
prompt = (
f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n<|im_start|>assistant\n"
)
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
return build_single_image_inputs([image], [prompt], wrapped_sf)
@@ -136,8 +143,9 @@ def video_with_metadata_glm4_1v():
formatted_prompt = f"<|user|>\n{video_prompt}{question}<|assistant|>\n"
scales = [0.1, 0.2, 0.25]
video_input = [[(rescale_video_size(video_array, scale), metadata)]
for scale in scales]
video_input = [
[(rescale_video_size(video_array, scale), metadata)] for scale in scales
]
prompts = [formatted_prompt] * len(video_input)
return [

View File

@@ -4,6 +4,7 @@
for manipulating the input / output of HF & vLLM test runners, which are
typically specific to a small subset of models.
"""
import types
from pathlib import PosixPath
from typing import Optional, Union
@@ -15,8 +16,13 @@ import pytest
import regex as re
import torch
from PIL.Image import Image
from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
GenerationConfig, GenerationMixin)
from transformers import (
AutoConfig,
AutoTokenizer,
BatchFeature,
GenerationConfig,
GenerationMixin,
)
from transformers.video_utils import VideoMetadata
from vllm.logprobs import SampleLogprobs
@@ -27,8 +33,7 @@ from .types import RunnerOutput
####### vLLM output processors functions
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
"""Sanitize vllm output [blip2 models] to be comparable with hf output."""
_, output_str, out_logprobs = vllm_output
@@ -42,8 +47,7 @@ def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
return hf_output_ids, hf_output_str, out_logprobs
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
"""Sanitize vllm output [fuyu models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@@ -53,8 +57,8 @@ def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
def qwen_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
vllm_output: RunnerOutput, model: str
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
"""Sanitize vllm output [qwen models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@@ -64,8 +68,8 @@ def qwen_vllm_to_hf_output(
def qwen2_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
vllm_output: RunnerOutput, model: str
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
"""Sanitize vllm output [qwen2 models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@@ -75,8 +79,8 @@ def qwen2_vllm_to_hf_output(
def kimiv_vl_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
vllm_output: RunnerOutput, model: str
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
"""Sanitize vllm output [kimi_vl models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@@ -85,23 +89,25 @@ def kimiv_vl_vllm_to_hf_output(
return output_ids, hf_output_str, out_logprobs
def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def llava_image_vllm_to_hf_output(
vllm_output: RunnerOutput, model: str
) -> RunnerOutput:
config = AutoConfig.from_pretrained(model)
mm_token_id = config.image_token_index
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
def llava_video_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
vllm_output: RunnerOutput, model: str
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
config = AutoConfig.from_pretrained(model)
mm_token_id = config.video_token_index
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
mm_token_id: int) -> RunnerOutput:
def _llava_vllm_to_hf_output(
vllm_output: RunnerOutput, model: str, mm_token_id: int
) -> RunnerOutput:
"""Sanitize vllm output [Llava models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@@ -109,7 +115,8 @@ def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
eos_token_id = tokenizer.eos_token_id
hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
token_id
for idx, token_id in enumerate(output_ids)
if token_id != mm_token_id or output_ids[idx - 1] != mm_token_id
]
@@ -128,8 +135,9 @@ def llava_onevision_hf_model_kwargs(model: str) -> dict:
return config.to_dict()
def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def llava_onevision_vllm_to_hf_output(
vllm_output: RunnerOutput, model: str
) -> RunnerOutput:
"""Sanitize vllm output [llava-onevision] to compare with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@@ -140,7 +148,8 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
eos_token_id = tokenizer.eos_token_id
hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
token_id
for idx, token_id in enumerate(output_ids)
if token_id != video_token_id or output_ids[idx - 1] != video_token_id
]
@@ -151,8 +160,7 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
return hf_output_ids, hf_output_str, out_logprobs
def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def mantis_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
"""Sanitize vllm output [mantis] to compare with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@@ -161,8 +169,7 @@ def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
return output_ids, hf_output_str, out_logprobs
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
"""Sanitize vllm output [phi3v] to be comparable with hf output."""
_, output_str, out_logprobs = vllm_output
@@ -180,8 +187,7 @@ def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
return hf_output_ids, hf_output_str, out_logprobs
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
"""Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@@ -192,7 +198,8 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
eos_token_id = tokenizer.eos_token_id
hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
token_id
for idx, token_id in enumerate(output_ids)
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
]
@@ -205,46 +212,40 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
####### Post-processors for HF outputs
def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
if output_str.endswith("<end▁of▁sentence>"):
output_str = output_str.split("<end▁of▁sentence>")[0]
return output_ids, output_str, out_logprobs
def idefics3_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def idefics3_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
if output_str.endswith("<end_of_utterance>"):
output_str = output_str.split("<end_of_utterance>")[0]
return output_ids, output_str, out_logprobs
def smolvlm_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def smolvlm_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
# Based on Idefics3
return idefics3_trunc_hf_output(hf_output, model)
def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def minicpmv_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
if output_str.endswith("<|eot_id|>"):
output_str = output_str.split("<|eot_id|>")[0]
return output_ids, output_str, out_logprobs
def minimax_vl_01_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def minimax_vl_01_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
if output_str.endswith("<end_of_sentence>"):
output_str = output_str.split("<end_of_sentence>")[0]
return output_ids, output_str, out_logprobs
def ultravox_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def ultravox_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
tokenizer = AutoTokenizer.from_pretrained(model)
@@ -262,8 +263,8 @@ def get_llava_embeddings(image_assets: ImageTestAssets):
####### Prompt path encoders for models that need models on disk
def qwen_prompt_path_encoder(
tmp_path: PosixPath, prompt: str,
assets: Union[list[ImageAsset], ImageTestAssets]) -> str:
tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset], ImageTestAssets]
) -> str:
"""Given a temporary dir path, export one or more image assets into the
tempdir & replace its contents with the local path to the string so that
the HF version of Qwen-VL can resolve the path and load the image in its
@@ -313,8 +314,9 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return BatchFeature(data=inputs, tensor_type="pt")
hf_model.processor = processor
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language.model.embed_tokens
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.language.model.embed_tokens
)
return hf_model
@@ -357,11 +359,10 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
assert len(contents) == len(images)
return hf_processor.apply_chat_template(
[{
"role": "user",
"image": image,
"content": content
} for image, content in zip(images, contents)],
[
{"role": "user", "image": image, "content": content}
for image, content in zip(images, contents)
],
add_generation_prompt=True,
tokenize=True,
return_dict=True,
@@ -369,8 +370,9 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
)
hf_model.processor = processor
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.transformer.output_layer
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.transformer.output_layer
)
return hf_model
@@ -387,10 +389,9 @@ def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
else:
video_metadata = None
return hf_processor(*args,
videos=videos,
video_metadata=video_metadata,
**kwargs)
return hf_processor(
*args, videos=videos, video_metadata=video_metadata, **kwargs
)
hf_model.processor = processor
return hf_model
@@ -406,8 +407,9 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.num_image_token = hf_runner.model.num_image_token
self.tokenizer = hf_runner.tokenizer
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
trust_remote_code=True)
self.config = AutoConfig.from_pretrained(
hf_runner.model_name, trust_remote_code=True
)
self.vision_config = self.config.vision_config
self.use_thumbnail = self.config.use_thumbnail
self.use_msac = self.config.use_msac
@@ -415,11 +417,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.max_num = self.config.max_dynamic_patch
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Union[Image, list[Image]],
**kwargs):
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
# yapf: disable
from vllm.model_executor.models.h2ovl import (
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl)
IMG_CONTEXT,
IMG_END,
IMG_START,
image_to_pixel_values_h2ovl,
)
# yapf: enable
images = [images] if isinstance(images, Image) else images
@@ -431,29 +436,26 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
max_num=self.max_num,
use_thumbnail=self.use_thumbnail,
use_msac=self.use_msac,
) for image in images
]
num_patches_list = [
pixel_value.shape[0] for pixel_value in pixel_values
)
for image in images
]
num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
pixel_values = torch.cat(pixel_values, dim=0)
for num_patches in num_patches_list:
context_tokens = IMG_CONTEXT * self.num_image_token \
* num_patches
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
image_tokens = IMG_START + context_tokens + IMG_END
text = text.replace('<image>', image_tokens, 1)
text = text.replace("<image>", image_tokens, 1)
prompt = self.tokenizer(text, return_tensors="pt")
prompt.update({"pixel_values": pixel_values})
return prompt
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
"<IMG_CONTEXT>")
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
hf_model.model.img_context_token_id = img_context_token_id
hf_model.processor = H2OVLProcessor(hf_model)
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language_model.get_output_embeddings()
hf_model.model.generate = types.MethodType(_internvl_generate,
hf_model.model)
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.language_model.get_output_embeddings()
)
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
return hf_model
@@ -467,19 +469,23 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.num_image_token = hf_runner.model.num_image_token
self.tokenizer = hf_runner.tokenizer
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
trust_remote_code=True)
self.config = AutoConfig.from_pretrained(
hf_runner.model_name, trust_remote_code=True
)
self.vision_config = self.config.vision_config
self.use_thumbnail = self.config.use_thumbnail
self.min_num = self.config.min_dynamic_patch
self.max_num = self.config.max_dynamic_patch
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Union[Image, list[Image]],
**kwargs):
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
from vllm.model_executor.models.skyworkr1v import (
IMG_CONTEXT, IMG_END, IMG_START,
image_to_pixel_values_skyworkr1v)
IMG_CONTEXT,
IMG_END,
IMG_START,
image_to_pixel_values_skyworkr1v,
)
images = [images] if isinstance(images, Image) else images
pixel_values = [
image_to_pixel_values_skyworkr1v(
@@ -488,29 +494,26 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
min_num=self.min_num,
max_num=self.max_num,
use_thumbnail=self.use_thumbnail,
) for image in images
]
num_patches_list = [
pixel_value.shape[0] for pixel_value in pixel_values
)
for image in images
]
num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
pixel_values = torch.cat(pixel_values, dim=0)
for num_patches in num_patches_list:
context_tokens = IMG_CONTEXT * self.num_image_token \
* num_patches
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
image_tokens = IMG_START + context_tokens + IMG_END
text = text.replace('<image>', image_tokens, 1)
text = text.replace("<image>", image_tokens, 1)
prompt = self.tokenizer(text, return_tensors="pt")
prompt.update({"pixel_values": pixel_values})
return prompt
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
"<IMG_CONTEXT>")
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
hf_model.model.img_context_token_id = img_context_token_id
hf_model.processor = SkyworkR1VProcessor(hf_model)
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language_model.get_output_embeddings()
hf_model.model.generate = types.MethodType(_internvl_generate,
hf_model.model)
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.language_model.get_output_embeddings()
)
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
return hf_model
@@ -524,8 +527,9 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.num_image_token = hf_runner.model.num_image_token
self.tokenizer = hf_runner.tokenizer
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
trust_remote_code=True)
self.config = AutoConfig.from_pretrained(
hf_runner.model_name, trust_remote_code=True
)
self.vision_config = self.config.vision_config
self.use_thumbnail = self.config.use_thumbnail
self.min_num = self.config.min_dynamic_patch
@@ -540,8 +544,13 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
**kwargs,
):
from vllm.model_executor.models.internvl import (
IMG_CONTEXT, IMG_END, IMG_START,
image_to_pixel_values_internvl, video_to_pixel_values_internvl)
IMG_CONTEXT,
IMG_END,
IMG_START,
image_to_pixel_values_internvl,
video_to_pixel_values_internvl,
)
images = [images] if isinstance(images, Image) else images
videos = [videos] if isinstance(videos, np.ndarray) else videos
if images is not None:
@@ -552,7 +561,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
min_num=self.min_num,
max_num=self.max_num,
use_thumbnail=self.use_thumbnail,
) for image in images
)
for image in images
]
num_patches_images = [
pixel_value.shape[0] for pixel_value in pixel_values_images
@@ -568,7 +578,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
min_num=1,
max_num=1,
use_thumbnail=False,
) for video in videos
)
for video in videos
]
num_patches_videos = [
pixel_value.shape[0] for pixel_value in pixel_values_videos
@@ -580,38 +591,37 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
while ("<image>" in text) or ("<video>" in text):
image_index = text.find("<image>")
video_index = text.find("<video>")
if image_index == -1 or (video_index > -1
and video_index < image_index):
if image_index == -1 or (
video_index > -1 and video_index < image_index
):
num_patches = num_patches_videos.pop(0)
pixel_values.append(pixel_values_videos.pop(0))
context_tokens = IMG_START + \
IMG_CONTEXT * self.num_image_token + IMG_END
video_tokens = ''.join([
f'Frame{i+1}: {context_tokens}'
for i in range(num_patches)
])
text = text.replace('<video>', video_tokens, 1)
context_tokens = (
IMG_START + IMG_CONTEXT * self.num_image_token + IMG_END
)
video_tokens = "".join(
[f"Frame{i + 1}: {context_tokens}" for i in range(num_patches)]
)
text = text.replace("<video>", video_tokens, 1)
else:
num_patches = num_patches_images.pop(0)
pixel_values.append(pixel_values_images.pop(0))
context_tokens = IMG_CONTEXT * self.num_image_token \
* num_patches
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
image_tokens = IMG_START + context_tokens + IMG_END
text = text.replace('<image>', image_tokens, 1)
text = text.replace("<image>", image_tokens, 1)
pixel_values = torch.cat(pixel_values, dim=0)
prompt = self.tokenizer(text, return_tensors="pt")
prompt.update({"pixel_values": pixel_values})
return prompt
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
"<IMG_CONTEXT>")
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
hf_model.model.img_context_token_id = img_context_token_id
hf_model.processor = InternVLProcessor(hf_model)
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language_model.get_output_embeddings()
hf_model.model.generate = types.MethodType(_internvl_generate,
hf_model.model)
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.language_model.get_output_embeddings()
)
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
return hf_model
@@ -631,7 +641,7 @@ def _internvl_generate(
input_embeds = input_embeds.reshape(B * N, C)
input_ids = input_ids.reshape(B * N)
selected = (input_ids == self.img_context_token_id)
selected = input_ids == self.img_context_token_id
assert selected.sum() != 0
input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
@@ -778,8 +788,9 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for Ovis2."""
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.llm.get_output_embeddings()
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.llm.get_output_embeddings()
)
def processor(*args, text="", images=None, **kwargs):
text_tokenizer = hf_model.model.get_text_tokenizer()
@@ -787,8 +798,7 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
prompt_start_and_end = {
"qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
"llama":
("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
"llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
"gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
}
for start, end in prompt_start_and_end.values():
@@ -797,7 +807,8 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
break
prompt, input_ids, pixel_values = hf_model.model.preprocess_inputs(
text_or_conversations=text, images=images)
text_or_conversations=text, images=images
)
attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)
inputs = {
@@ -813,8 +824,9 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for Ovis2."""
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.llm.get_output_embeddings()
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.llm.get_output_embeddings()
)
def processor(*args, text="", images=None, videos=None, **kwargs):
if images is None:
@@ -825,13 +837,11 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
videos = []
else:
videos = [videos] if isinstance(videos, np.ndarray) else videos
videos = [[PIL.Image.fromarray(frame) for frame in vid]
for vid in videos]
videos = [[PIL.Image.fromarray(frame) for frame in vid] for vid in videos]
prompt_start_and_end = {
"qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
"llama":
("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
"llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
"gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
}
for start, end in prompt_start_and_end.values():
@@ -842,21 +852,20 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
images_message = [{"type": "image", "image": img} for img in images]
videos_message = [{"type": "video", "video": vid} for vid in videos]
messages = [{
"role":
"user",
"content": [
*images_message,
*videos_message,
{
"type": "text",
"text": text
},
],
}]
messages = [
{
"role": "user",
"content": [
*images_message,
*videos_message,
{"type": "text", "text": text},
],
}
]
input_ids, pixel_values, grid_thws = hf_model.model.preprocess_inputs(
messages=messages, enable_thinking=True)
messages=messages, enable_thinking=True
)
inputs = {
"inputs": input_ids,
"pixel_values": pixel_values,

View File

@@ -3,23 +3,34 @@
"""Entrypoints for wrapping the core run_test implementation for specific test
types / modalities.
"""
from pathlib import PosixPath
from .....conftest import (AudioTestAssets, HfRunner, ImageTestAssets,
VideoTestAssets, VllmRunner)
from .....conftest import (
AudioTestAssets,
HfRunner,
ImageTestAssets,
VideoTestAssets,
VllmRunner,
)
from . import builders, core
from .types import ExpandableVLMTestArgs, VLMTestInfo
####### Entrypoints for running different test types
def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets):
def run_single_image_test(
*,
tmp_path: PosixPath,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
assert test_case.size_wrapper is not None
inputs = builders.build_single_image_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper, tmp_path)
model_test_info, image_assets, test_case.size_wrapper, tmp_path
)
core.run_test(
hf_runner=hf_runner,
@@ -31,17 +42,23 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": 1},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets):
def run_multi_image_test(
*,
tmp_path: PosixPath,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
assert test_case.size_wrapper is not None
inputs = builders.build_multi_image_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper, tmp_path)
model_test_info, image_assets, test_case.size_wrapper, tmp_path
)
core.run_test(
hf_runner=hf_runner,
@@ -53,17 +70,22 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": len(image_assets)},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_embedding_test(*, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets):
def run_embedding_test(
*,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
assert test_case.size_wrapper is not None
inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper)
model_test_info, image_assets, test_case.size_wrapper
)
core.run_test(
hf_runner=hf_runner,
@@ -76,7 +98,8 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
limit_mm_per_prompt={"image": 1},
vllm_embeddings=vllm_embeddings,
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_video_test(
@@ -90,8 +113,11 @@ def run_video_test(
assert test_case.size_wrapper is not None
assert test_case.num_video_frames is not None
inputs = builders.build_video_inputs_from_test_info(
model_test_info, video_assets, test_case.size_wrapper,
test_case.num_video_frames)
model_test_info,
video_assets,
test_case.size_wrapper,
test_case.num_video_frames,
)
core.run_test(
hf_runner=hf_runner,
@@ -103,7 +129,8 @@ def run_video_test(
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"video": len(video_assets)},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_audio_test(
@@ -114,8 +141,7 @@ def run_audio_test(
vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets,
):
inputs = builders.build_audio_inputs_from_test_info(
model_test_info, audio_assets)
inputs = builders.build_audio_inputs_from_test_info(model_test_info, audio_assets)
core.run_test(
hf_runner=hf_runner,
@@ -127,13 +153,17 @@ def run_audio_test(
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"audio": 1},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner]):
def run_custom_inputs_test(
*,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
):
# Custom test cases can provide inputs directly, but they need to
# explicitly provided a CustomTestConfig, which wraps the inputs and
# the limit_mm_per_prompt
@@ -155,4 +185,5 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt=limit_mm_per_prompt,
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Types for writing multimodal model tests."""
from collections.abc import Iterable
from enum import Enum
from pathlib import PosixPath
@@ -15,9 +16,16 @@ from vllm.config import RunnerOption
from vllm.logprobs import SampleLogprobs
from vllm.transformers_utils.tokenizer import AnyTokenizer
from .....conftest import (AUDIO_ASSETS, IMAGE_ASSETS, HfRunner, ImageAsset,
ImageTestAssets, PromptAudioInput, PromptImageInput,
PromptVideoInput)
from .....conftest import (
AUDIO_ASSETS,
IMAGE_ASSETS,
HfRunner,
ImageAsset,
ImageTestAssets,
PromptAudioInput,
PromptImageInput,
PromptVideoInput,
)
from ....utils import check_logprobs_close
# meta image tag; will be replaced by the appropriate tag for the model
@@ -47,6 +55,7 @@ RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
class PromptWithMultiModalInput(NamedTuple):
"""Holds the multimodal input for a single test case."""
prompts: list[str]
image_data: Optional[PromptImageInput] = None
video_data: Optional[PromptVideoInput] = None
@@ -100,8 +109,9 @@ class VLMTestInfo(NamedTuple):
# Function for converting ImageAssets to image embeddings;
# We need to define this explicitly for embedding tests
convert_assets_to_embeddings: Optional[Callable[[ImageTestAssets],
list[torch.Tensor]]] = None
convert_assets_to_embeddings: Optional[
Callable[[ImageTestAssets], list[torch.Tensor]]
] = None
# Exposed options for vLLM runner; we change these in a several tests,
# but the defaults are derived from VllmRunner & the engine defaults
@@ -156,8 +166,8 @@ class VLMTestInfo(NamedTuple):
# for Qwen-VL, which requires encoding the image path / url into the prompt
# for HF runner
prompt_path_encoder: Optional[
Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]],
str]] = None # noqa: E501
Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]], str]
] = None # noqa: E501
# Allows configuring a test to run with custom inputs
custom_test_opts: Optional[list[CustomTestOptions]] = None
@@ -190,6 +200,7 @@ class VLMTestInfo(NamedTuple):
class ExpandableVLMTestArgs(NamedTuple):
"""The expanded kwargs which correspond to a single test case."""
model: str
max_tokens: int
num_logprobs: int