Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Helpers for building inputs that can be leveraged for different test types.
|
||||
"""
|
||||
"""Helpers for building inputs that can be leveraged for different test types."""
|
||||
|
||||
from collections.abc import Iterable
|
||||
from pathlib import PosixPath
|
||||
from typing import Callable, Optional, Union
|
||||
@@ -10,20 +10,30 @@ import torch
|
||||
|
||||
from vllm.multimodal.audio import AudioResampler
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.video import (rescale_video_size, resize_video,
|
||||
sample_frames_from_video)
|
||||
from vllm.multimodal.video import (
|
||||
rescale_video_size,
|
||||
resize_video,
|
||||
sample_frames_from_video,
|
||||
)
|
||||
|
||||
from .....conftest import AudioTestAssets, ImageTestAssets, VideoTestAssets
|
||||
from .types import (SINGLE_AUDIO_BASE_PROMPT, SINGLE_IMAGE_BASE_PROMPTS,
|
||||
TEST_AUDIO_PLACEHOLDER, TEST_IMG_PLACEHOLDER,
|
||||
TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT,
|
||||
ImageSizeWrapper, PromptWithMultiModalInput, SizeType,
|
||||
VLMTestInfo)
|
||||
from .types import (
|
||||
SINGLE_AUDIO_BASE_PROMPT,
|
||||
SINGLE_IMAGE_BASE_PROMPTS,
|
||||
TEST_AUDIO_PLACEHOLDER,
|
||||
TEST_IMG_PLACEHOLDER,
|
||||
TEST_VIDEO_PLACEHOLDER,
|
||||
VIDEO_BASE_PROMPT,
|
||||
ImageSizeWrapper,
|
||||
PromptWithMultiModalInput,
|
||||
SizeType,
|
||||
VLMTestInfo,
|
||||
)
|
||||
|
||||
|
||||
def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int],
|
||||
str],
|
||||
test_placeholder: str) -> str:
|
||||
def replace_test_placeholder(
|
||||
prompt: str, mm_idx_to_prompt: Callable[[int], str], test_placeholder: str
|
||||
) -> str:
|
||||
"""Given a prompt, replaces each test placeholder with the
|
||||
model-specific tag.
|
||||
"""
|
||||
@@ -35,11 +45,13 @@ def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int],
|
||||
return img_prompt
|
||||
|
||||
|
||||
def get_model_prompts(base_prompts: Iterable[str],
|
||||
img_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
video_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
audio_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
prompt_formatter: Callable[[str], str]) -> list[str]:
|
||||
def get_model_prompts(
|
||||
base_prompts: Iterable[str],
|
||||
img_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
video_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
audio_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
prompt_formatter: Callable[[str], str],
|
||||
) -> list[str]:
|
||||
"""Given a model-agnostic base prompt and test configuration for a model(s)
|
||||
to be tested, update the media placeholders and apply the prompt formatting
|
||||
to get the test prompt string for this model.
|
||||
@@ -56,19 +68,19 @@ def get_model_prompts(base_prompts: Iterable[str],
|
||||
# Replace the multimodal placeholders in the base prompt with
|
||||
# the correct ones for the model that we are testing
|
||||
if img_idx_to_prompt:
|
||||
base_prompt = replace_test_placeholder(base_prompt,
|
||||
img_idx_to_prompt,
|
||||
TEST_IMG_PLACEHOLDER)
|
||||
base_prompt = replace_test_placeholder(
|
||||
base_prompt, img_idx_to_prompt, TEST_IMG_PLACEHOLDER
|
||||
)
|
||||
|
||||
if video_idx_to_prompt:
|
||||
base_prompt = replace_test_placeholder(base_prompt,
|
||||
video_idx_to_prompt,
|
||||
TEST_VIDEO_PLACEHOLDER)
|
||||
base_prompt = replace_test_placeholder(
|
||||
base_prompt, video_idx_to_prompt, TEST_VIDEO_PLACEHOLDER
|
||||
)
|
||||
|
||||
if audio_idx_to_prompt:
|
||||
base_prompt = replace_test_placeholder(base_prompt,
|
||||
audio_idx_to_prompt,
|
||||
TEST_AUDIO_PLACEHOLDER)
|
||||
base_prompt = replace_test_placeholder(
|
||||
base_prompt, audio_idx_to_prompt, TEST_AUDIO_PLACEHOLDER
|
||||
)
|
||||
|
||||
# Apply the prompt formatter to wrap the base prompt with
|
||||
# the correct media placeholders to get the model test prompt
|
||||
@@ -84,14 +96,15 @@ def build_single_image_inputs_from_test_info(
|
||||
tmp_path: Optional[PosixPath] = None,
|
||||
) -> list[PromptWithMultiModalInput]:
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError(
|
||||
"Prompt formatter must be set to build single image inputs")
|
||||
raise ValueError("Prompt formatter must be set to build single image inputs")
|
||||
|
||||
model_prompts = get_model_prompts(test_info.single_image_prompts,
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.audio_idx_to_prompt,
|
||||
test_info.prompt_formatter)
|
||||
model_prompts = get_model_prompts(
|
||||
test_info.single_image_prompts,
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.audio_idx_to_prompt,
|
||||
test_info.prompt_formatter,
|
||||
)
|
||||
|
||||
# For models that require a local path / URL encoded in the image; export
|
||||
# assets and encode into tmp_path for this test. This should be avoided
|
||||
@@ -110,8 +123,8 @@ def build_single_image_inputs_from_test_info(
|
||||
|
||||
|
||||
def build_single_image_inputs(
|
||||
images, model_prompts,
|
||||
size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
|
||||
images, model_prompts, size_wrapper: ImageSizeWrapper
|
||||
) -> list[PromptWithMultiModalInput]:
|
||||
# For every image / prompt pair, get a pair containing two lists of
|
||||
# length size_factors, where the first contains duplicates of the model
|
||||
# prompt [str], and the second contains copies of the image after being
|
||||
@@ -125,7 +138,8 @@ def build_single_image_inputs(
|
||||
apply_image_size_scaling(image, size, size_wrapper.type)
|
||||
for size in size_wrapper.data
|
||||
],
|
||||
) for image, prompt in zip(images, model_prompts)
|
||||
)
|
||||
for image, prompt in zip(images, model_prompts)
|
||||
]
|
||||
|
||||
|
||||
@@ -136,14 +150,15 @@ def build_multi_image_inputs_from_test_info(
|
||||
tmp_path: Optional[PosixPath] = None,
|
||||
) -> list[PromptWithMultiModalInput]:
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError(
|
||||
"Prompt formatter must be set to build multi image inputs")
|
||||
raise ValueError("Prompt formatter must be set to build multi image inputs")
|
||||
|
||||
model_prompts = get_model_prompts([test_info.multi_image_prompt],
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.audio_idx_to_prompt,
|
||||
test_info.prompt_formatter)
|
||||
model_prompts = get_model_prompts(
|
||||
[test_info.multi_image_prompt],
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.audio_idx_to_prompt,
|
||||
test_info.prompt_formatter,
|
||||
)
|
||||
|
||||
if test_info.prompt_path_encoder is not None:
|
||||
if tmp_path is None:
|
||||
@@ -164,16 +179,20 @@ def build_multi_image_inputs_from_test_info(
|
||||
|
||||
|
||||
def build_multi_image_inputs(
|
||||
image_lists, model_prompts,
|
||||
size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
|
||||
image_lists, model_prompts, size_wrapper: ImageSizeWrapper
|
||||
) -> list[PromptWithMultiModalInput]:
|
||||
return [
|
||||
PromptWithMultiModalInput(
|
||||
prompts=[prompt for _ in size_wrapper.data],
|
||||
image_data=[[
|
||||
apply_image_size_scaling(image, size, size_wrapper.type)
|
||||
for image in images
|
||||
] for size in size_wrapper.data],
|
||||
) for images, prompt in zip(image_lists, model_prompts)
|
||||
image_data=[
|
||||
[
|
||||
apply_image_size_scaling(image, size, size_wrapper.type)
|
||||
for image in images
|
||||
]
|
||||
for size in size_wrapper.data
|
||||
],
|
||||
)
|
||||
for images, prompt in zip(image_lists, model_prompts)
|
||||
]
|
||||
|
||||
|
||||
@@ -185,10 +204,10 @@ def build_embedding_inputs_from_test_info(
|
||||
# These conditions will always be true if invoked through filtering,
|
||||
# but we still check them in case this is ever called directly
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError(
|
||||
"Prompt formatter must be set to build image embedding inputs")
|
||||
if size_wrapper.type != SizeType.SIZE_FACTOR or not \
|
||||
all(factor == 1.0 for factor in size_wrapper.data):
|
||||
raise ValueError("Prompt formatter must be set to build image embedding inputs")
|
||||
if size_wrapper.type != SizeType.SIZE_FACTOR or not all(
|
||||
factor == 1.0 for factor in size_wrapper.data
|
||||
):
|
||||
raise ValueError("Embedding tests require constant (1.0) size factors")
|
||||
if test_info.convert_assets_to_embeddings is None:
|
||||
raise ValueError("No conversion func for getting embeddings found")
|
||||
@@ -209,8 +228,7 @@ def build_embedding_inputs_from_test_info(
|
||||
assert len(images) == len(model_prompts)
|
||||
|
||||
inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
|
||||
vllm_embeddings = build_single_image_inputs(embeds, model_prompts,
|
||||
size_wrapper)
|
||||
vllm_embeddings = build_single_image_inputs(embeds, model_prompts, size_wrapper)
|
||||
return inputs, vllm_embeddings
|
||||
|
||||
|
||||
@@ -235,21 +253,22 @@ def build_video_inputs_from_test_info(
|
||||
for asset in video_assets
|
||||
]
|
||||
|
||||
video_scaler = (resize_video if size_wrapper.type == SizeType.FIXED_SIZE
|
||||
else rescale_video_size)
|
||||
video_scaler = (
|
||||
resize_video if size_wrapper.type == SizeType.FIXED_SIZE else rescale_video_size
|
||||
)
|
||||
|
||||
return [
|
||||
PromptWithMultiModalInput(
|
||||
prompts=[prompt for _ in size_wrapper.data],
|
||||
video_data=[
|
||||
video_scaler(video, size) for size in size_wrapper.data
|
||||
],
|
||||
) for video, prompt in zip(sampled_vids, model_prompts)
|
||||
video_data=[video_scaler(video, size) for size in size_wrapper.data],
|
||||
)
|
||||
for video, prompt in zip(sampled_vids, model_prompts)
|
||||
]
|
||||
|
||||
|
||||
def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
|
||||
size_type: SizeType):
|
||||
def apply_image_size_scaling(
|
||||
image, size: Union[float, tuple[int, int]], size_type: SizeType
|
||||
):
|
||||
"""Applies a size scaler to one image; this can be an image size factor,
|
||||
which scales the image while maintaining the aspect ratio"""
|
||||
# Special case for embeddings; if it's a tensor, it's only valid if we
|
||||
@@ -285,13 +304,16 @@ def build_audio_inputs_from_test_info(
|
||||
method="librosa",
|
||||
)
|
||||
audios = [asset.audio_and_sample_rate for asset in audio_assets]
|
||||
resampled_audios = [(
|
||||
resampler.resample(
|
||||
audio,
|
||||
orig_sr=sr,
|
||||
),
|
||||
int(resampler.target_sr),
|
||||
) for audio, sr in audios]
|
||||
resampled_audios = [
|
||||
(
|
||||
resampler.resample(
|
||||
audio,
|
||||
orig_sr=sr,
|
||||
),
|
||||
int(resampler.target_sr),
|
||||
)
|
||||
for audio, sr in audios
|
||||
]
|
||||
|
||||
return [
|
||||
PromptWithMultiModalInput(
|
||||
|
||||
@@ -4,19 +4,28 @@
|
||||
modality, getting all combinations (similar to pytest's parametrization),
|
||||
handling multimodal placeholder substitution, and so on.
|
||||
"""
|
||||
|
||||
import itertools
|
||||
from collections import OrderedDict
|
||||
from collections.abc import Iterable
|
||||
|
||||
import pytest
|
||||
|
||||
from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
|
||||
ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
|
||||
from .types import (
|
||||
EMBEDDING_SIZE_FACTORS,
|
||||
ExpandableVLMTestArgs,
|
||||
ImageSizeWrapper,
|
||||
SizeType,
|
||||
VLMTestInfo,
|
||||
VLMTestType,
|
||||
)
|
||||
|
||||
|
||||
def get_filtered_test_settings(
|
||||
test_settings: dict[str, VLMTestInfo], test_type: VLMTestType,
|
||||
new_proc_per_test: bool) -> dict[str, VLMTestInfo]:
|
||||
test_settings: dict[str, VLMTestInfo],
|
||||
test_type: VLMTestType,
|
||||
new_proc_per_test: bool,
|
||||
) -> dict[str, VLMTestInfo]:
|
||||
"""Given the dict of potential test settings to run, return a subdict
|
||||
of tests who have the current test type enabled with the matching val for
|
||||
fork_per_test.
|
||||
@@ -25,7 +34,8 @@ def get_filtered_test_settings(
|
||||
def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType):
|
||||
return test_info.test_type == test_type or (
|
||||
isinstance(test_info.test_type, Iterable)
|
||||
and test_type in test_info.test_type)
|
||||
and test_type in test_info.test_type
|
||||
)
|
||||
|
||||
matching_tests = {}
|
||||
for test_name, test_info in test_settings.items():
|
||||
@@ -36,62 +46,69 @@ def get_filtered_test_settings(
|
||||
assert test_info.convert_assets_to_embeddings is not None
|
||||
# Custom test inputs need to explicitly define the mm limit/inputs
|
||||
if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS):
|
||||
assert (test_info.custom_test_opts is not None
|
||||
and isinstance(test_info.custom_test_opts, Iterable))
|
||||
assert test_info.custom_test_opts is not None and isinstance(
|
||||
test_info.custom_test_opts, Iterable
|
||||
)
|
||||
# For all types besides custom inputs, we need a prompt formatter
|
||||
else:
|
||||
assert test_info.prompt_formatter is not None
|
||||
|
||||
# Everything looks okay; keep if this is correct proc handling
|
||||
if (test_info.distributed_executor_backend
|
||||
is not None) == new_proc_per_test:
|
||||
if (
|
||||
test_info.distributed_executor_backend is not None
|
||||
) == new_proc_per_test:
|
||||
matching_tests[test_name] = test_info
|
||||
|
||||
return matching_tests
|
||||
|
||||
|
||||
def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
|
||||
test_type: VLMTestType,
|
||||
create_new_process_for_each_test: bool):
|
||||
def get_parametrized_options(
|
||||
test_settings: dict[str, VLMTestInfo],
|
||||
test_type: VLMTestType,
|
||||
create_new_process_for_each_test: bool,
|
||||
):
|
||||
"""Converts all of our VLMTestInfo into an expanded list of parameters.
|
||||
This is similar to nesting pytest parametrize calls, but done directly
|
||||
through an itertools product so that each test can set things like
|
||||
size factors etc, while still running in isolated test cases.
|
||||
"""
|
||||
matching_tests = get_filtered_test_settings(
|
||||
test_settings, test_type, create_new_process_for_each_test)
|
||||
test_settings, test_type, create_new_process_for_each_test
|
||||
)
|
||||
|
||||
# Ensure that something is wrapped as an iterable it's not already
|
||||
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )
|
||||
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,)
|
||||
|
||||
def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
|
||||
# This is essentially the same as nesting a bunch of mark.parametrize
|
||||
# decorators, but we do it programmatically to allow overrides for on
|
||||
# a per-model basis, while still being able to execute each of these
|
||||
# as individual test cases in pytest.
|
||||
iter_kwargs = OrderedDict([
|
||||
("model", ensure_wrapped(test_info.models)),
|
||||
("max_tokens", ensure_wrapped(test_info.max_tokens)),
|
||||
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
|
||||
("dtype", ensure_wrapped(test_info.dtype)),
|
||||
("distributed_executor_backend",
|
||||
ensure_wrapped(test_info.distributed_executor_backend)),
|
||||
])
|
||||
iter_kwargs = OrderedDict(
|
||||
[
|
||||
("model", ensure_wrapped(test_info.models)),
|
||||
("max_tokens", ensure_wrapped(test_info.max_tokens)),
|
||||
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
|
||||
("dtype", ensure_wrapped(test_info.dtype)),
|
||||
(
|
||||
"distributed_executor_backend",
|
||||
ensure_wrapped(test_info.distributed_executor_backend),
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
# num_frames is video only
|
||||
if test_type == VLMTestType.VIDEO:
|
||||
iter_kwargs["num_video_frames"] = ensure_wrapped(
|
||||
test_info.num_video_frames)
|
||||
iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
|
||||
|
||||
# No sizes passed for custom inputs, since inputs are directly provided
|
||||
if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO):
|
||||
wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
|
||||
if wrapped_sizes is None:
|
||||
raise ValueError(
|
||||
f"Sizes must be set for test type {test_type}")
|
||||
raise ValueError(f"Sizes must be set for test type {test_type}")
|
||||
iter_kwargs["size_wrapper"] = wrapped_sizes
|
||||
|
||||
#Otherwise expand the custom test options instead
|
||||
# Otherwise expand the custom test options instead
|
||||
elif test_type == VLMTestType.CUSTOM_INPUTS:
|
||||
if test_info.custom_test_opts is None:
|
||||
raise ValueError("Test has type CUSTOM_INPUTS, but none given")
|
||||
@@ -121,8 +138,8 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
|
||||
|
||||
|
||||
def get_wrapped_test_sizes(
|
||||
test_info: VLMTestInfo,
|
||||
test_type: VLMTestType) -> tuple[ImageSizeWrapper, ...]:
|
||||
test_info: VLMTestInfo, test_type: VLMTestType
|
||||
) -> tuple[ImageSizeWrapper, ...]:
|
||||
"""Given a test info which may have size factors or fixed sizes, wrap them
|
||||
and combine them into an iterable, each of which will be used in parameter
|
||||
expansion.
|
||||
@@ -133,18 +150,18 @@ def get_wrapped_test_sizes(
|
||||
"""
|
||||
# If it is an embedding test, we always use the EMBEDDING_SIZE_FACTORS
|
||||
if test_type == VLMTestType.EMBEDDING:
|
||||
return tuple([
|
||||
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
|
||||
for factor in EMBEDDING_SIZE_FACTORS
|
||||
])
|
||||
return tuple(
|
||||
[
|
||||
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
|
||||
for factor in EMBEDDING_SIZE_FACTORS
|
||||
]
|
||||
)
|
||||
# Audio and Custom inputs have preprocessed inputs
|
||||
elif test_type in (VLMTestType.AUDIO, VLMTestType.CUSTOM_INPUTS):
|
||||
return tuple()
|
||||
|
||||
size_factors = test_info.image_size_factors \
|
||||
if test_info.image_size_factors else []
|
||||
fixed_sizes = test_info.image_sizes \
|
||||
if test_info.image_sizes else []
|
||||
size_factors = test_info.image_size_factors if test_info.image_size_factors else []
|
||||
fixed_sizes = test_info.image_sizes if test_info.image_sizes else []
|
||||
|
||||
wrapped_factors = [
|
||||
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
|
||||
@@ -152,8 +169,7 @@ def get_wrapped_test_sizes(
|
||||
]
|
||||
|
||||
wrapped_sizes = [
|
||||
ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size)
|
||||
for size in fixed_sizes
|
||||
ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size) for size in fixed_sizes
|
||||
]
|
||||
|
||||
return tuple(wrapped_factors + wrapped_sizes)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Core test implementation to be shared across modalities."""
|
||||
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
import torch
|
||||
@@ -70,22 +71,23 @@ def run_test(
|
||||
if model_info.hf_overrides:
|
||||
vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
|
||||
if model_info.skip_tokenizer_init:
|
||||
vllm_runner_kwargs_[
|
||||
"skip_tokenizer_init"] = model_info.skip_tokenizer_init
|
||||
vllm_runner_kwargs_["skip_tokenizer_init"] = model_info.skip_tokenizer_init
|
||||
|
||||
if vllm_runner_kwargs:
|
||||
vllm_runner_kwargs_.update(vllm_runner_kwargs)
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=max_num_seqs,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=enforce_eager,
|
||||
runner=runner,
|
||||
**vllm_runner_kwargs_) as vllm_model:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=max_num_seqs,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=enforce_eager,
|
||||
runner=runner,
|
||||
**vllm_runner_kwargs_,
|
||||
) as vllm_model:
|
||||
tokenizer = vllm_model.llm.get_tokenizer()
|
||||
|
||||
vllm_kwargs: dict[str, Any] = {}
|
||||
@@ -95,21 +97,19 @@ def run_test(
|
||||
vllm_kwargs["stop"] = stop_str
|
||||
|
||||
for prompts, image_data, video_data, audio_data in vllm_inputs:
|
||||
mm_data = dict(images=image_data,
|
||||
videos=video_data,
|
||||
audios=audio_data)
|
||||
mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
|
||||
vllm_kwargs_with_mm_data = vllm_kwargs | mm_data
|
||||
vllm_output = vllm_model.generate_greedy_logprobs(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
**vllm_kwargs_with_mm_data)
|
||||
**vllm_kwargs_with_mm_data,
|
||||
)
|
||||
vllm_outputs_per_mm.append(vllm_output)
|
||||
|
||||
hf_model = hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=auto_cls,
|
||||
model_kwargs=hf_model_kwargs)
|
||||
hf_model = hf_runner(
|
||||
model, dtype=dtype, auto_cls=auto_cls, model_kwargs=hf_model_kwargs
|
||||
)
|
||||
|
||||
# Some models need to patch things like the model processor, e.g., internvl
|
||||
if patch_hf_runner is not None:
|
||||
@@ -129,16 +129,15 @@ def run_test(
|
||||
hf_kwargs["stop_strings"] = stop_str
|
||||
|
||||
for prompts, image_data, video_data, audio_data in inputs:
|
||||
mm_data = dict(images=image_data,
|
||||
videos=video_data,
|
||||
audios=audio_data)
|
||||
mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
|
||||
hf_kwargs_with_mm_data = hf_kwargs | mm_data
|
||||
hf_output = hf_model.generate_greedy_logprobs_limit(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tokenizer=tokenizer,
|
||||
**hf_kwargs_with_mm_data)
|
||||
**hf_kwargs_with_mm_data,
|
||||
)
|
||||
hf_outputs_per_mm.append(hf_output)
|
||||
|
||||
# Apply output processing / sanitation to the vLLM and HF runner results
|
||||
@@ -150,8 +149,7 @@ def run_test(
|
||||
second_runner_processor=vllm_output_post_proc,
|
||||
)
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm,
|
||||
vllm_outputs_per_mm):
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm, vllm_outputs_per_mm):
|
||||
# This is usually check_logprobs_close, but it's passed through to
|
||||
# allow things like check_outputs_equal where needed
|
||||
comparator(
|
||||
@@ -171,15 +169,19 @@ def process_runner_outputs(
|
||||
):
|
||||
"""Applies the runner processor(s) to the runner outputs, if any."""
|
||||
if first_runner_processor is not None:
|
||||
first_runner_outputs = process_outputs(first_runner_processor, model,
|
||||
first_runner_outputs)
|
||||
first_runner_outputs = process_outputs(
|
||||
first_runner_processor, model, first_runner_outputs
|
||||
)
|
||||
if second_runner_processor is not None:
|
||||
second_runner_outputs = process_outputs(second_runner_processor, model,
|
||||
second_runner_outputs)
|
||||
second_runner_outputs = process_outputs(
|
||||
second_runner_processor, model, second_runner_outputs
|
||||
)
|
||||
return first_runner_outputs, second_runner_outputs
|
||||
|
||||
|
||||
def process_outputs(output_processor, model, outputs_per_image):
|
||||
"""Applies a model specific post-processor function to a runner's output"""
|
||||
return [[output_processor(res, model) for res in outputs]
|
||||
for outputs in outputs_per_image]
|
||||
return [
|
||||
[output_processor(res, model) for res in outputs]
|
||||
for outputs in outputs_per_image
|
||||
]
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Custom input builders for edge-cases in different models."""
|
||||
|
||||
from typing import Callable
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.video import (rescale_video_size, resize_video,
|
||||
sample_frames_from_video)
|
||||
from vllm.multimodal.video import (
|
||||
rescale_video_size,
|
||||
resize_video,
|
||||
sample_frames_from_video,
|
||||
)
|
||||
|
||||
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
|
||||
from .builders import build_multi_image_inputs, build_single_image_inputs
|
||||
@@ -15,7 +19,7 @@ from .types import ImageSizeWrapper, PromptWithMultiModalInput, SizeType
|
||||
|
||||
def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
|
||||
"""Builds inputs for multi-image (varied sizes/aspect ratio) testing.
|
||||
|
||||
|
||||
Args:
|
||||
formatter: model-specific prompt formatter.
|
||||
"""
|
||||
@@ -41,7 +45,7 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
|
||||
stop_sign,
|
||||
rescale_image_size(stop_sign, 0.25),
|
||||
cherry_blossom.resize((183, 488)),
|
||||
cherry_blossom.resize((488, 183))
|
||||
cherry_blossom.resize((488, 183)),
|
||||
],
|
||||
cherry_blossom,
|
||||
]
|
||||
@@ -54,10 +58,11 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
|
||||
]
|
||||
|
||||
|
||||
def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
|
||||
num_frames: int = 16):
|
||||
def multi_video_multi_aspect_ratio_inputs(
|
||||
formatter: Callable[[str], str], num_frames: int = 16
|
||||
):
|
||||
"""Builds inputs for multi-video (varied sizes/aspect ratio) testing.
|
||||
|
||||
|
||||
Args:
|
||||
formatter: model-specific prompt formatter.
|
||||
"""
|
||||
@@ -81,7 +86,7 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
|
||||
video,
|
||||
rescale_video_size(video, 0.25),
|
||||
resize_video(video, (183, 488)),
|
||||
resize_video(video, (488, 183))
|
||||
resize_video(video, (488, 183)),
|
||||
],
|
||||
video,
|
||||
]
|
||||
@@ -96,7 +101,9 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
|
||||
|
||||
def different_patch_input_cases_internvl():
|
||||
images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
|
||||
formatter = lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
|
||||
formatter = (
|
||||
lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n"
|
||||
) # noqa: E501
|
||||
single_img_prompts = [
|
||||
"<image>\nWhat's the content in the center of the image?",
|
||||
"<image>\nWhat is the season?",
|
||||
@@ -115,14 +122,14 @@ def different_patch_input_cases_internvl():
|
||||
|
||||
|
||||
def windows_attention_image_qwen2_5_vl():
|
||||
|
||||
# image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501
|
||||
image = ImageAsset("hato").pil_image
|
||||
|
||||
question = "Describe the image."
|
||||
img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
|
||||
prompt = (f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n")
|
||||
prompt = (
|
||||
f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n<|im_start|>assistant\n"
|
||||
)
|
||||
|
||||
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
|
||||
return build_single_image_inputs([image], [prompt], wrapped_sf)
|
||||
@@ -136,8 +143,9 @@ def video_with_metadata_glm4_1v():
|
||||
formatted_prompt = f"<|user|>\n{video_prompt}{question}<|assistant|>\n"
|
||||
|
||||
scales = [0.1, 0.2, 0.25]
|
||||
video_input = [[(rescale_video_size(video_array, scale), metadata)]
|
||||
for scale in scales]
|
||||
video_input = [
|
||||
[(rescale_video_size(video_array, scale), metadata)] for scale in scales
|
||||
]
|
||||
prompts = [formatted_prompt] * len(video_input)
|
||||
|
||||
return [
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
for manipulating the input / output of HF & vLLM test runners, which are
|
||||
typically specific to a small subset of models.
|
||||
"""
|
||||
|
||||
import types
|
||||
from pathlib import PosixPath
|
||||
from typing import Optional, Union
|
||||
@@ -15,8 +16,13 @@ import pytest
|
||||
import regex as re
|
||||
import torch
|
||||
from PIL.Image import Image
|
||||
from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
|
||||
GenerationConfig, GenerationMixin)
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
BatchFeature,
|
||||
GenerationConfig,
|
||||
GenerationMixin,
|
||||
)
|
||||
from transformers.video_utils import VideoMetadata
|
||||
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
@@ -27,8 +33,7 @@ from .types import RunnerOutput
|
||||
|
||||
|
||||
####### vLLM output processors functions
|
||||
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [blip2 models] to be comparable with hf output."""
|
||||
_, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -42,8 +47,7 @@ def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [fuyu models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -53,8 +57,8 @@ def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
|
||||
|
||||
def qwen_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
"""Sanitize vllm output [qwen models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -64,8 +68,8 @@ def qwen_vllm_to_hf_output(
|
||||
|
||||
|
||||
def qwen2_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
"""Sanitize vllm output [qwen2 models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -75,8 +79,8 @@ def qwen2_vllm_to_hf_output(
|
||||
|
||||
|
||||
def kimiv_vl_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
"""Sanitize vllm output [kimi_vl models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -85,23 +89,25 @@ def kimiv_vl_vllm_to_hf_output(
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def llava_image_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> RunnerOutput:
|
||||
config = AutoConfig.from_pretrained(model)
|
||||
mm_token_id = config.image_token_index
|
||||
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
|
||||
|
||||
|
||||
def llava_video_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
config = AutoConfig.from_pretrained(model)
|
||||
mm_token_id = config.video_token_index
|
||||
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
|
||||
|
||||
|
||||
def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
|
||||
mm_token_id: int) -> RunnerOutput:
|
||||
def _llava_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput, model: str, mm_token_id: int
|
||||
) -> RunnerOutput:
|
||||
"""Sanitize vllm output [Llava models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -109,7 +115,8 @@ def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
|
||||
eos_token_id = tokenizer.eos_token_id
|
||||
|
||||
hf_output_ids = [
|
||||
token_id for idx, token_id in enumerate(output_ids)
|
||||
token_id
|
||||
for idx, token_id in enumerate(output_ids)
|
||||
if token_id != mm_token_id or output_ids[idx - 1] != mm_token_id
|
||||
]
|
||||
|
||||
@@ -128,8 +135,9 @@ def llava_onevision_hf_model_kwargs(model: str) -> dict:
|
||||
return config.to_dict()
|
||||
|
||||
|
||||
def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def llava_onevision_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> RunnerOutput:
|
||||
"""Sanitize vllm output [llava-onevision] to compare with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -140,7 +148,8 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
eos_token_id = tokenizer.eos_token_id
|
||||
|
||||
hf_output_ids = [
|
||||
token_id for idx, token_id in enumerate(output_ids)
|
||||
token_id
|
||||
for idx, token_id in enumerate(output_ids)
|
||||
if token_id != video_token_id or output_ids[idx - 1] != video_token_id
|
||||
]
|
||||
|
||||
@@ -151,8 +160,7 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def mantis_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [mantis] to compare with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -161,8 +169,7 @@ def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [phi3v] to be comparable with hf output."""
|
||||
_, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -180,8 +187,7 @@ def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -192,7 +198,8 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
eos_token_id = tokenizer.eos_token_id
|
||||
|
||||
hf_output_ids = [
|
||||
token_id for idx, token_id in enumerate(output_ids)
|
||||
token_id
|
||||
for idx, token_id in enumerate(output_ids)
|
||||
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
|
||||
]
|
||||
|
||||
@@ -205,46 +212,40 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
|
||||
|
||||
####### Post-processors for HF outputs
|
||||
def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<|end▁of▁sentence|>"):
|
||||
output_str = output_str.split("<|end▁of▁sentence|>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def idefics3_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def idefics3_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<end_of_utterance>"):
|
||||
output_str = output_str.split("<end_of_utterance>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def smolvlm_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def smolvlm_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
# Based on Idefics3
|
||||
return idefics3_trunc_hf_output(hf_output, model)
|
||||
|
||||
|
||||
def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def minicpmv_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<|eot_id|>"):
|
||||
output_str = output_str.split("<|eot_id|>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def minimax_vl_01_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def minimax_vl_01_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<end_of_sentence>"):
|
||||
output_str = output_str.split("<end_of_sentence>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def ultravox_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def ultravox_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
@@ -262,8 +263,8 @@ def get_llava_embeddings(image_assets: ImageTestAssets):
|
||||
|
||||
####### Prompt path encoders for models that need models on disk
|
||||
def qwen_prompt_path_encoder(
|
||||
tmp_path: PosixPath, prompt: str,
|
||||
assets: Union[list[ImageAsset], ImageTestAssets]) -> str:
|
||||
tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset], ImageTestAssets]
|
||||
) -> str:
|
||||
"""Given a temporary dir path, export one or more image assets into the
|
||||
tempdir & replace its contents with the local path to the string so that
|
||||
the HF version of Qwen-VL can resolve the path and load the image in its
|
||||
@@ -313,8 +314,9 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
return BatchFeature(data=inputs, tensor_type="pt")
|
||||
|
||||
hf_model.processor = processor
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language.model.embed_tokens
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.language.model.embed_tokens
|
||||
)
|
||||
return hf_model
|
||||
|
||||
|
||||
@@ -357,11 +359,10 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
assert len(contents) == len(images)
|
||||
|
||||
return hf_processor.apply_chat_template(
|
||||
[{
|
||||
"role": "user",
|
||||
"image": image,
|
||||
"content": content
|
||||
} for image, content in zip(images, contents)],
|
||||
[
|
||||
{"role": "user", "image": image, "content": content}
|
||||
for image, content in zip(images, contents)
|
||||
],
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
@@ -369,8 +370,9 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
)
|
||||
|
||||
hf_model.processor = processor
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.transformer.output_layer
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.transformer.output_layer
|
||||
)
|
||||
return hf_model
|
||||
|
||||
|
||||
@@ -387,10 +389,9 @@ def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
else:
|
||||
video_metadata = None
|
||||
|
||||
return hf_processor(*args,
|
||||
videos=videos,
|
||||
video_metadata=video_metadata,
|
||||
**kwargs)
|
||||
return hf_processor(
|
||||
*args, videos=videos, video_metadata=video_metadata, **kwargs
|
||||
)
|
||||
|
||||
hf_model.processor = processor
|
||||
return hf_model
|
||||
@@ -406,8 +407,9 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
self.num_image_token = hf_runner.model.num_image_token
|
||||
self.tokenizer = hf_runner.tokenizer
|
||||
|
||||
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
|
||||
trust_remote_code=True)
|
||||
self.config = AutoConfig.from_pretrained(
|
||||
hf_runner.model_name, trust_remote_code=True
|
||||
)
|
||||
self.vision_config = self.config.vision_config
|
||||
self.use_thumbnail = self.config.use_thumbnail
|
||||
self.use_msac = self.config.use_msac
|
||||
@@ -415,11 +417,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
self.max_num = self.config.max_dynamic_patch
|
||||
self.image_size = self.vision_config.image_size
|
||||
|
||||
def __call__(self, text: str, images: Union[Image, list[Image]],
|
||||
**kwargs):
|
||||
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
|
||||
# yapf: disable
|
||||
from vllm.model_executor.models.h2ovl import (
|
||||
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl)
|
||||
IMG_CONTEXT,
|
||||
IMG_END,
|
||||
IMG_START,
|
||||
image_to_pixel_values_h2ovl,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
@@ -431,29 +436,26 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
max_num=self.max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
use_msac=self.use_msac,
|
||||
) for image in images
|
||||
]
|
||||
num_patches_list = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
|
||||
pixel_values = torch.cat(pixel_values, dim=0)
|
||||
for num_patches in num_patches_list:
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token \
|
||||
* num_patches
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
|
||||
image_tokens = IMG_START + context_tokens + IMG_END
|
||||
text = text.replace('<image>', image_tokens, 1)
|
||||
text = text.replace("<image>", image_tokens, 1)
|
||||
prompt = self.tokenizer(text, return_tensors="pt")
|
||||
prompt.update({"pixel_values": pixel_values})
|
||||
return prompt
|
||||
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
|
||||
"<IMG_CONTEXT>")
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
|
||||
hf_model.model.img_context_token_id = img_context_token_id
|
||||
hf_model.processor = H2OVLProcessor(hf_model)
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language_model.get_output_embeddings()
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate,
|
||||
hf_model.model)
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.language_model.get_output_embeddings()
|
||||
)
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
|
||||
return hf_model
|
||||
|
||||
|
||||
@@ -467,19 +469,23 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
self.num_image_token = hf_runner.model.num_image_token
|
||||
self.tokenizer = hf_runner.tokenizer
|
||||
|
||||
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
|
||||
trust_remote_code=True)
|
||||
self.config = AutoConfig.from_pretrained(
|
||||
hf_runner.model_name, trust_remote_code=True
|
||||
)
|
||||
self.vision_config = self.config.vision_config
|
||||
self.use_thumbnail = self.config.use_thumbnail
|
||||
self.min_num = self.config.min_dynamic_patch
|
||||
self.max_num = self.config.max_dynamic_patch
|
||||
self.image_size = self.vision_config.image_size
|
||||
|
||||
def __call__(self, text: str, images: Union[Image, list[Image]],
|
||||
**kwargs):
|
||||
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
|
||||
from vllm.model_executor.models.skyworkr1v import (
|
||||
IMG_CONTEXT, IMG_END, IMG_START,
|
||||
image_to_pixel_values_skyworkr1v)
|
||||
IMG_CONTEXT,
|
||||
IMG_END,
|
||||
IMG_START,
|
||||
image_to_pixel_values_skyworkr1v,
|
||||
)
|
||||
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
pixel_values = [
|
||||
image_to_pixel_values_skyworkr1v(
|
||||
@@ -488,29 +494,26 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
min_num=self.min_num,
|
||||
max_num=self.max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
) for image in images
|
||||
]
|
||||
num_patches_list = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
|
||||
pixel_values = torch.cat(pixel_values, dim=0)
|
||||
for num_patches in num_patches_list:
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token \
|
||||
* num_patches
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
|
||||
image_tokens = IMG_START + context_tokens + IMG_END
|
||||
text = text.replace('<image>', image_tokens, 1)
|
||||
text = text.replace("<image>", image_tokens, 1)
|
||||
prompt = self.tokenizer(text, return_tensors="pt")
|
||||
prompt.update({"pixel_values": pixel_values})
|
||||
return prompt
|
||||
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
|
||||
"<IMG_CONTEXT>")
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
|
||||
hf_model.model.img_context_token_id = img_context_token_id
|
||||
hf_model.processor = SkyworkR1VProcessor(hf_model)
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language_model.get_output_embeddings()
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate,
|
||||
hf_model.model)
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.language_model.get_output_embeddings()
|
||||
)
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
|
||||
return hf_model
|
||||
|
||||
|
||||
@@ -524,8 +527,9 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
self.num_image_token = hf_runner.model.num_image_token
|
||||
self.tokenizer = hf_runner.tokenizer
|
||||
|
||||
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
|
||||
trust_remote_code=True)
|
||||
self.config = AutoConfig.from_pretrained(
|
||||
hf_runner.model_name, trust_remote_code=True
|
||||
)
|
||||
self.vision_config = self.config.vision_config
|
||||
self.use_thumbnail = self.config.use_thumbnail
|
||||
self.min_num = self.config.min_dynamic_patch
|
||||
@@ -540,8 +544,13 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
**kwargs,
|
||||
):
|
||||
from vllm.model_executor.models.internvl import (
|
||||
IMG_CONTEXT, IMG_END, IMG_START,
|
||||
image_to_pixel_values_internvl, video_to_pixel_values_internvl)
|
||||
IMG_CONTEXT,
|
||||
IMG_END,
|
||||
IMG_START,
|
||||
image_to_pixel_values_internvl,
|
||||
video_to_pixel_values_internvl,
|
||||
)
|
||||
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
videos = [videos] if isinstance(videos, np.ndarray) else videos
|
||||
if images is not None:
|
||||
@@ -552,7 +561,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
min_num=self.min_num,
|
||||
max_num=self.max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
) for image in images
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
num_patches_images = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values_images
|
||||
@@ -568,7 +578,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
min_num=1,
|
||||
max_num=1,
|
||||
use_thumbnail=False,
|
||||
) for video in videos
|
||||
)
|
||||
for video in videos
|
||||
]
|
||||
num_patches_videos = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values_videos
|
||||
@@ -580,38 +591,37 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
while ("<image>" in text) or ("<video>" in text):
|
||||
image_index = text.find("<image>")
|
||||
video_index = text.find("<video>")
|
||||
if image_index == -1 or (video_index > -1
|
||||
and video_index < image_index):
|
||||
if image_index == -1 or (
|
||||
video_index > -1 and video_index < image_index
|
||||
):
|
||||
num_patches = num_patches_videos.pop(0)
|
||||
pixel_values.append(pixel_values_videos.pop(0))
|
||||
context_tokens = IMG_START + \
|
||||
IMG_CONTEXT * self.num_image_token + IMG_END
|
||||
video_tokens = ''.join([
|
||||
f'Frame{i+1}: {context_tokens}'
|
||||
for i in range(num_patches)
|
||||
])
|
||||
text = text.replace('<video>', video_tokens, 1)
|
||||
context_tokens = (
|
||||
IMG_START + IMG_CONTEXT * self.num_image_token + IMG_END
|
||||
)
|
||||
video_tokens = "".join(
|
||||
[f"Frame{i + 1}: {context_tokens}" for i in range(num_patches)]
|
||||
)
|
||||
text = text.replace("<video>", video_tokens, 1)
|
||||
else:
|
||||
num_patches = num_patches_images.pop(0)
|
||||
pixel_values.append(pixel_values_images.pop(0))
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token \
|
||||
* num_patches
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
|
||||
image_tokens = IMG_START + context_tokens + IMG_END
|
||||
text = text.replace('<image>', image_tokens, 1)
|
||||
text = text.replace("<image>", image_tokens, 1)
|
||||
pixel_values = torch.cat(pixel_values, dim=0)
|
||||
|
||||
prompt = self.tokenizer(text, return_tensors="pt")
|
||||
prompt.update({"pixel_values": pixel_values})
|
||||
return prompt
|
||||
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
|
||||
"<IMG_CONTEXT>")
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
|
||||
hf_model.model.img_context_token_id = img_context_token_id
|
||||
hf_model.processor = InternVLProcessor(hf_model)
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language_model.get_output_embeddings()
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate,
|
||||
hf_model.model)
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.language_model.get_output_embeddings()
|
||||
)
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
|
||||
return hf_model
|
||||
|
||||
|
||||
@@ -631,7 +641,7 @@ def _internvl_generate(
|
||||
input_embeds = input_embeds.reshape(B * N, C)
|
||||
|
||||
input_ids = input_ids.reshape(B * N)
|
||||
selected = (input_ids == self.img_context_token_id)
|
||||
selected = input_ids == self.img_context_token_id
|
||||
assert selected.sum() != 0
|
||||
input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
|
||||
|
||||
@@ -778,8 +788,9 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
|
||||
def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for Ovis2."""
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.llm.get_output_embeddings()
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.llm.get_output_embeddings()
|
||||
)
|
||||
|
||||
def processor(*args, text="", images=None, **kwargs):
|
||||
text_tokenizer = hf_model.model.get_text_tokenizer()
|
||||
@@ -787,8 +798,7 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
|
||||
prompt_start_and_end = {
|
||||
"qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
|
||||
"llama":
|
||||
("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
|
||||
"llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
|
||||
"gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
|
||||
}
|
||||
for start, end in prompt_start_and_end.values():
|
||||
@@ -797,7 +807,8 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
break
|
||||
|
||||
prompt, input_ids, pixel_values = hf_model.model.preprocess_inputs(
|
||||
text_or_conversations=text, images=images)
|
||||
text_or_conversations=text, images=images
|
||||
)
|
||||
attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)
|
||||
|
||||
inputs = {
|
||||
@@ -813,8 +824,9 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
|
||||
def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for Ovis2."""
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.llm.get_output_embeddings()
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.llm.get_output_embeddings()
|
||||
)
|
||||
|
||||
def processor(*args, text="", images=None, videos=None, **kwargs):
|
||||
if images is None:
|
||||
@@ -825,13 +837,11 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
videos = []
|
||||
else:
|
||||
videos = [videos] if isinstance(videos, np.ndarray) else videos
|
||||
videos = [[PIL.Image.fromarray(frame) for frame in vid]
|
||||
for vid in videos]
|
||||
videos = [[PIL.Image.fromarray(frame) for frame in vid] for vid in videos]
|
||||
|
||||
prompt_start_and_end = {
|
||||
"qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
|
||||
"llama":
|
||||
("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
|
||||
"llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
|
||||
"gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
|
||||
}
|
||||
for start, end in prompt_start_and_end.values():
|
||||
@@ -842,21 +852,20 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
images_message = [{"type": "image", "image": img} for img in images]
|
||||
videos_message = [{"type": "video", "video": vid} for vid in videos]
|
||||
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
*images_message,
|
||||
*videos_message,
|
||||
{
|
||||
"type": "text",
|
||||
"text": text
|
||||
},
|
||||
],
|
||||
}]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*images_message,
|
||||
*videos_message,
|
||||
{"type": "text", "text": text},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
input_ids, pixel_values, grid_thws = hf_model.model.preprocess_inputs(
|
||||
messages=messages, enable_thinking=True)
|
||||
messages=messages, enable_thinking=True
|
||||
)
|
||||
inputs = {
|
||||
"inputs": input_ids,
|
||||
"pixel_values": pixel_values,
|
||||
|
||||
@@ -3,23 +3,34 @@
|
||||
"""Entrypoints for wrapping the core run_test implementation for specific test
|
||||
types / modalities.
|
||||
"""
|
||||
|
||||
from pathlib import PosixPath
|
||||
|
||||
from .....conftest import (AudioTestAssets, HfRunner, ImageTestAssets,
|
||||
VideoTestAssets, VllmRunner)
|
||||
from .....conftest import (
|
||||
AudioTestAssets,
|
||||
HfRunner,
|
||||
ImageTestAssets,
|
||||
VideoTestAssets,
|
||||
VllmRunner,
|
||||
)
|
||||
from . import builders, core
|
||||
from .types import ExpandableVLMTestArgs, VLMTestInfo
|
||||
|
||||
|
||||
####### Entrypoints for running different test types
|
||||
def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets):
|
||||
def run_single_image_test(
|
||||
*,
|
||||
tmp_path: PosixPath,
|
||||
model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
assert test_case.size_wrapper is not None
|
||||
inputs = builders.build_single_image_inputs_from_test_info(
|
||||
model_test_info, image_assets, test_case.size_wrapper, tmp_path)
|
||||
model_test_info, image_assets, test_case.size_wrapper, tmp_path
|
||||
)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
@@ -31,17 +42,23 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets):
|
||||
def run_multi_image_test(
|
||||
*,
|
||||
tmp_path: PosixPath,
|
||||
model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
assert test_case.size_wrapper is not None
|
||||
inputs = builders.build_multi_image_inputs_from_test_info(
|
||||
model_test_info, image_assets, test_case.size_wrapper, tmp_path)
|
||||
model_test_info, image_assets, test_case.size_wrapper, tmp_path
|
||||
)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
@@ -53,17 +70,22 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"image": len(image_assets)},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
def run_embedding_test(*, model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets):
|
||||
def run_embedding_test(
|
||||
*,
|
||||
model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
assert test_case.size_wrapper is not None
|
||||
inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
|
||||
model_test_info, image_assets, test_case.size_wrapper)
|
||||
model_test_info, image_assets, test_case.size_wrapper
|
||||
)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
@@ -76,7 +98,8 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
vllm_embeddings=vllm_embeddings,
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
def run_video_test(
|
||||
@@ -90,8 +113,11 @@ def run_video_test(
|
||||
assert test_case.size_wrapper is not None
|
||||
assert test_case.num_video_frames is not None
|
||||
inputs = builders.build_video_inputs_from_test_info(
|
||||
model_test_info, video_assets, test_case.size_wrapper,
|
||||
test_case.num_video_frames)
|
||||
model_test_info,
|
||||
video_assets,
|
||||
test_case.size_wrapper,
|
||||
test_case.num_video_frames,
|
||||
)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
@@ -103,7 +129,8 @@ def run_video_test(
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"video": len(video_assets)},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
def run_audio_test(
|
||||
@@ -114,8 +141,7 @@ def run_audio_test(
|
||||
vllm_runner: type[VllmRunner],
|
||||
audio_assets: AudioTestAssets,
|
||||
):
|
||||
inputs = builders.build_audio_inputs_from_test_info(
|
||||
model_test_info, audio_assets)
|
||||
inputs = builders.build_audio_inputs_from_test_info(model_test_info, audio_assets)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
@@ -127,13 +153,17 @@ def run_audio_test(
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"audio": 1},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner]):
|
||||
def run_custom_inputs_test(
|
||||
*,
|
||||
model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
):
|
||||
# Custom test cases can provide inputs directly, but they need to
|
||||
# explicitly provided a CustomTestConfig, which wraps the inputs and
|
||||
# the limit_mm_per_prompt
|
||||
@@ -155,4 +185,5 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Types for writing multimodal model tests."""
|
||||
|
||||
from collections.abc import Iterable
|
||||
from enum import Enum
|
||||
from pathlib import PosixPath
|
||||
@@ -15,9 +16,16 @@ from vllm.config import RunnerOption
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
|
||||
from .....conftest import (AUDIO_ASSETS, IMAGE_ASSETS, HfRunner, ImageAsset,
|
||||
ImageTestAssets, PromptAudioInput, PromptImageInput,
|
||||
PromptVideoInput)
|
||||
from .....conftest import (
|
||||
AUDIO_ASSETS,
|
||||
IMAGE_ASSETS,
|
||||
HfRunner,
|
||||
ImageAsset,
|
||||
ImageTestAssets,
|
||||
PromptAudioInput,
|
||||
PromptImageInput,
|
||||
PromptVideoInput,
|
||||
)
|
||||
from ....utils import check_logprobs_close
|
||||
|
||||
# meta image tag; will be replaced by the appropriate tag for the model
|
||||
@@ -47,6 +55,7 @@ RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
|
||||
|
||||
class PromptWithMultiModalInput(NamedTuple):
|
||||
"""Holds the multimodal input for a single test case."""
|
||||
|
||||
prompts: list[str]
|
||||
image_data: Optional[PromptImageInput] = None
|
||||
video_data: Optional[PromptVideoInput] = None
|
||||
@@ -100,8 +109,9 @@ class VLMTestInfo(NamedTuple):
|
||||
|
||||
# Function for converting ImageAssets to image embeddings;
|
||||
# We need to define this explicitly for embedding tests
|
||||
convert_assets_to_embeddings: Optional[Callable[[ImageTestAssets],
|
||||
list[torch.Tensor]]] = None
|
||||
convert_assets_to_embeddings: Optional[
|
||||
Callable[[ImageTestAssets], list[torch.Tensor]]
|
||||
] = None
|
||||
|
||||
# Exposed options for vLLM runner; we change these in a several tests,
|
||||
# but the defaults are derived from VllmRunner & the engine defaults
|
||||
@@ -156,8 +166,8 @@ class VLMTestInfo(NamedTuple):
|
||||
# for Qwen-VL, which requires encoding the image path / url into the prompt
|
||||
# for HF runner
|
||||
prompt_path_encoder: Optional[
|
||||
Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]],
|
||||
str]] = None # noqa: E501
|
||||
Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]], str]
|
||||
] = None # noqa: E501
|
||||
|
||||
# Allows configuring a test to run with custom inputs
|
||||
custom_test_opts: Optional[list[CustomTestOptions]] = None
|
||||
@@ -190,6 +200,7 @@ class VLMTestInfo(NamedTuple):
|
||||
|
||||
class ExpandableVLMTestArgs(NamedTuple):
|
||||
"""The expanded kwargs which correspond to a single test case."""
|
||||
|
||||
model: str
|
||||
max_tokens: int
|
||||
num_logprobs: int
|
||||
|
||||
Reference in New Issue
Block a user