[CI/Build] Reorganize models tests (#17459)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
238
tests/models/multimodal/generation/vlm_utils/builders.py
Normal file
238
tests/models/multimodal/generation/vlm_utils/builders.py
Normal file
@@ -0,0 +1,238 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Helpers for building inputs that can be leveraged for different test types.
|
||||
"""
|
||||
from collections.abc import Iterable
|
||||
from pathlib import PosixPath
|
||||
from typing import Callable, Optional, Union
|
||||
|
||||
import torch
|
||||
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.video import (rescale_video_size, resize_video,
|
||||
sample_frames_from_video)
|
||||
|
||||
from .....conftest import _ImageAssets, _VideoAssets
|
||||
from .types import (SINGLE_IMAGE_BASE_PROMPTS, TEST_IMG_PLACEHOLDER,
|
||||
TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT,
|
||||
ImageSizeWrapper, SizeType, VLMTestInfo)
|
||||
|
||||
|
||||
def replace_test_placeholder(prompt: str, img_idx_to_prompt: Callable[[int],
|
||||
str],
|
||||
test_placeholder: str) -> str:
|
||||
"""Given a prompt, replaces each test placeholder with the
|
||||
model-specific tag.
|
||||
"""
|
||||
prompt_segments = prompt.split(test_placeholder)
|
||||
img_prompt = prompt_segments[0]
|
||||
for placeholder_idx, next_seg in enumerate(prompt_segments[1:], start=1):
|
||||
img_prompt += img_idx_to_prompt(placeholder_idx)
|
||||
img_prompt += next_seg
|
||||
return img_prompt
|
||||
|
||||
|
||||
def get_model_prompts(base_prompts: Iterable[str],
|
||||
img_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
video_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
prompt_formatter: Callable[[str], str]) -> list[str]:
|
||||
"""Given a model-agnostic base prompt and test configuration for a model(s)
|
||||
to be tested, update the media placeholders and apply the prompt formatting
|
||||
to get the test prompt string for this model.
|
||||
|
||||
Example for phi3v, given the base_prompt: "<image>What is the season?"
|
||||
1. Replace img placeholder(s)
|
||||
-> "<|image_1|>\nWhat is the season?"
|
||||
2. Apply prompt formatter:
|
||||
-> <|user|>\n<|image_1|>\nWhat is the season?<|end|>\n<|assistant|>\n
|
||||
"""
|
||||
assert isinstance(base_prompts, (list, tuple))
|
||||
model_prompts = []
|
||||
for base_prompt in base_prompts:
|
||||
# Replace the multimodal placeholders in the base prompt with
|
||||
# the correct ones for the model that we are testing
|
||||
if img_idx_to_prompt:
|
||||
base_prompt = replace_test_placeholder(base_prompt,
|
||||
img_idx_to_prompt,
|
||||
TEST_IMG_PLACEHOLDER)
|
||||
|
||||
if video_idx_to_prompt:
|
||||
base_prompt = replace_test_placeholder(base_prompt,
|
||||
video_idx_to_prompt,
|
||||
TEST_VIDEO_PLACEHOLDER)
|
||||
|
||||
# Apply the prompt formatter to wrap the base prompt with
|
||||
# the correct media placeholders to get the model test prompt
|
||||
model_prompt = prompt_formatter(base_prompt)
|
||||
model_prompts.append(model_prompt)
|
||||
return model_prompts
|
||||
|
||||
|
||||
def build_single_image_inputs_from_test_info(
|
||||
test_info: VLMTestInfo,
|
||||
image_assets: _ImageAssets,
|
||||
size_wrapper: ImageSizeWrapper,
|
||||
tmp_path: Optional[PosixPath] = None):
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError(
|
||||
"Prompt formatter must be set to build single image inputs")
|
||||
|
||||
model_prompts = get_model_prompts(test_info.single_image_prompts,
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.prompt_formatter)
|
||||
|
||||
# For models that require a local path / URL encoded in the image; export
|
||||
# assets and encode into tmp_path for this test. This should be avoided
|
||||
# where possible (currently needed for Qwen-VL).
|
||||
if test_info.prompt_path_encoder is not None:
|
||||
if tmp_path is None:
|
||||
raise ValueError("Prompt path encoder requires setting local path")
|
||||
model_prompts = [
|
||||
test_info.prompt_path_encoder(tmp_path, prompt, [asset])
|
||||
for prompt, asset in zip(model_prompts, image_assets)
|
||||
]
|
||||
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
assert len(images) == len(model_prompts)
|
||||
return build_single_image_inputs(images, model_prompts, size_wrapper)
|
||||
|
||||
|
||||
def build_single_image_inputs(images, model_prompts,
|
||||
size_wrapper: ImageSizeWrapper):
|
||||
# For every image / prompt pair, get a pair containing two lists of
|
||||
# length size_factors, where the first contains duplicates of the model
|
||||
# prompt [str], and the second contains copies of the image after being
|
||||
# scaled by one of the size factors.
|
||||
#
|
||||
# NOTE: rescaling preserves the image aspect ratio.
|
||||
return [(
|
||||
[prompt for _ in size_wrapper.data],
|
||||
[
|
||||
apply_image_size_scaling(image, size, size_wrapper.type)
|
||||
for size in size_wrapper.data
|
||||
],
|
||||
) for image, prompt in zip(images, model_prompts)]
|
||||
|
||||
|
||||
def build_multi_image_inputs_from_test_info(
|
||||
test_info: VLMTestInfo,
|
||||
image_assets: _ImageAssets,
|
||||
size_wrapper: ImageSizeWrapper,
|
||||
tmp_path: Optional[PosixPath] = None):
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError(
|
||||
"Prompt formatter must be set to build multi image inputs")
|
||||
|
||||
model_prompts = get_model_prompts([test_info.multi_image_prompt],
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.prompt_formatter)
|
||||
|
||||
if test_info.prompt_path_encoder is not None:
|
||||
if tmp_path is None:
|
||||
raise ValueError("Prompt path encoder requires setting local path")
|
||||
model_prompts = [
|
||||
test_info.prompt_path_encoder(tmp_path, model_prompt, image_assets)
|
||||
for model_prompt in model_prompts
|
||||
]
|
||||
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
# Currently, we only have one multi-image list & one multi-image prompt
|
||||
return build_multi_image_inputs(
|
||||
image_lists=[images],
|
||||
model_prompts=model_prompts,
|
||||
size_wrapper=size_wrapper,
|
||||
)
|
||||
|
||||
|
||||
def build_multi_image_inputs(image_lists, model_prompts,
|
||||
size_wrapper: ImageSizeWrapper):
|
||||
return [(
|
||||
[prompt for _ in size_wrapper.data],
|
||||
[[
|
||||
apply_image_size_scaling(image, size, size_wrapper.type)
|
||||
for image in images
|
||||
] for size in size_wrapper.data],
|
||||
) for images, prompt in zip(image_lists, model_prompts)]
|
||||
|
||||
|
||||
def build_embedding_inputs_from_test_info(
|
||||
test_info: VLMTestInfo,
|
||||
image_assets: _ImageAssets,
|
||||
size_wrapper: ImageSizeWrapper,
|
||||
):
|
||||
# These conditions will always be true if invoked through filtering,
|
||||
# but we still check them in case this is ever called directly
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError(
|
||||
"Prompt formatter must be set to build image embedding inputs")
|
||||
if size_wrapper.type != SizeType.SIZE_FACTOR or not \
|
||||
all(factor == 1.0 for factor in size_wrapper.data):
|
||||
raise ValueError("Embedding tests require constant (1.0) size factors")
|
||||
if test_info.convert_assets_to_embeddings is None:
|
||||
raise ValueError("No conversion func for getting embeddings found")
|
||||
|
||||
model_prompts = get_model_prompts(
|
||||
SINGLE_IMAGE_BASE_PROMPTS,
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.prompt_formatter,
|
||||
)
|
||||
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
embeds = test_info.convert_assets_to_embeddings(image_assets)
|
||||
assert len(images) == len(model_prompts)
|
||||
|
||||
inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
|
||||
vllm_embeddings = build_single_image_inputs(embeds, model_prompts,
|
||||
size_wrapper)
|
||||
return inputs, vllm_embeddings
|
||||
|
||||
|
||||
def build_video_inputs_from_test_info(
|
||||
test_info: VLMTestInfo,
|
||||
video_assets: _VideoAssets,
|
||||
size_wrapper: ImageSizeWrapper,
|
||||
num_frames: int,
|
||||
):
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError("Prompt formatter must be set to build video inputs")
|
||||
model_prompts = get_model_prompts(
|
||||
[VIDEO_BASE_PROMPT],
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.prompt_formatter,
|
||||
)
|
||||
|
||||
sampled_vids = [
|
||||
sample_frames_from_video(asset.np_ndarrays, num_frames)
|
||||
for asset in video_assets
|
||||
]
|
||||
|
||||
video_scaler = (resize_video if size_wrapper.type == SizeType.FIXED_SIZE
|
||||
else rescale_video_size)
|
||||
|
||||
return [(
|
||||
[prompt for _ in size_wrapper.data],
|
||||
[video_scaler(video, size) for size in size_wrapper.data],
|
||||
) for video, prompt in zip(sampled_vids, model_prompts)]
|
||||
|
||||
|
||||
def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
|
||||
size_type: SizeType):
|
||||
"""Applies a size scaler to one image; this can be a an image size factor,
|
||||
which scales the image while maintaining the aspect ratio"""
|
||||
# Special case for embeddings; if it's a tensor, it's only valid if we
|
||||
# are considering size factors at constant scale, i.e., we just clone
|
||||
# the tensor
|
||||
if isinstance(image, torch.Tensor):
|
||||
assert size_type == SizeType.SIZE_FACTOR and size == 1
|
||||
return image
|
||||
if size_type == SizeType.SIZE_FACTOR:
|
||||
# We have a list of image size factors
|
||||
return rescale_image_size(image, size)
|
||||
elif size_type == SizeType.FIXED_SIZE:
|
||||
# We have a list of fixed sizes
|
||||
return image.resize(size)
|
||||
raise ValueError("ImageSizeWrapper type must be FIXED_SIZE or SIZE_FACTOR")
|
||||
158
tests/models/multimodal/generation/vlm_utils/case_filtering.py
Normal file
158
tests/models/multimodal/generation/vlm_utils/case_filtering.py
Normal file
@@ -0,0 +1,158 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Utils for determining which subset of model tests belong to a specific
|
||||
modality, getting all combinations (similar to pytest's parametrization),
|
||||
handling multimodal placeholder substitution, and so on.
|
||||
"""
|
||||
import itertools
|
||||
from collections import OrderedDict
|
||||
from collections.abc import Iterable
|
||||
|
||||
import pytest
|
||||
|
||||
from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
|
||||
ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
|
||||
|
||||
|
||||
def get_filtered_test_settings(
|
||||
test_settings: dict[str, VLMTestInfo], test_type: VLMTestType,
|
||||
new_proc_per_test: bool) -> dict[str, VLMTestInfo]:
|
||||
"""Given the dict of potential test settings to run, return a subdict
|
||||
of tests who have the current test type enabled with the matching val for
|
||||
fork_per_test.
|
||||
"""
|
||||
|
||||
def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType):
|
||||
return test_info.test_type == test_type or (
|
||||
isinstance(test_info.test_type, Iterable)
|
||||
and test_type in test_info.test_type)
|
||||
|
||||
matching_tests = {}
|
||||
for test_name, test_info in test_settings.items():
|
||||
# Otherwise check if the test has the right type & keep if it does
|
||||
if matches_test_type(test_info, test_type):
|
||||
# Embedding tests need to have a conversion func in their test info
|
||||
if matches_test_type(test_info, VLMTestType.EMBEDDING):
|
||||
assert test_info.convert_assets_to_embeddings is not None
|
||||
# Custom test inputs need to explicitly define the mm limit/inputs
|
||||
if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS):
|
||||
assert (test_info.custom_test_opts is not None
|
||||
and isinstance(test_info.custom_test_opts, Iterable))
|
||||
# For all types besides custom inputs, we need a prompt formatter
|
||||
else:
|
||||
assert test_info.prompt_formatter is not None
|
||||
|
||||
# Everything looks okay; keep if this is has correct proc handling
|
||||
if (test_info.distributed_executor_backend
|
||||
is not None) == new_proc_per_test:
|
||||
matching_tests[test_name] = test_info
|
||||
|
||||
return matching_tests
|
||||
|
||||
|
||||
def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
|
||||
test_type: VLMTestType,
|
||||
create_new_process_for_each_test: bool):
|
||||
"""Converts all of our VLMTestInfo into an expanded list of parameters.
|
||||
This is similar to nesting pytest parametrize calls, but done directly
|
||||
through an itertools product so that each test can set things like
|
||||
size factors etc, while still running in isolated test cases.
|
||||
"""
|
||||
matching_tests = get_filtered_test_settings(
|
||||
test_settings, test_type, create_new_process_for_each_test)
|
||||
|
||||
# Ensure that something is wrapped as an iterable it's not already
|
||||
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )
|
||||
|
||||
def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
|
||||
# This is essentially the same as nesting a bunch of mark.parametrize
|
||||
# decorators, but we do it programmatically to allow overrides for on
|
||||
# a per-model basis, while still being able to execute each of these
|
||||
# as individual test cases in pytest.
|
||||
iter_kwargs = OrderedDict([
|
||||
("model", ensure_wrapped(test_info.models)),
|
||||
("max_tokens", ensure_wrapped(test_info.max_tokens)),
|
||||
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
|
||||
("dtype", ensure_wrapped(test_info.dtype)),
|
||||
("distributed_executor_backend",
|
||||
ensure_wrapped(test_info.distributed_executor_backend)),
|
||||
])
|
||||
|
||||
# num_frames is video only
|
||||
if test_type == VLMTestType.VIDEO:
|
||||
iter_kwargs["num_video_frames"] = ensure_wrapped(
|
||||
test_info.num_video_frames)
|
||||
|
||||
# No sizes passed for custom inputs, since inputs are directly provided
|
||||
if test_type != VLMTestType.CUSTOM_INPUTS:
|
||||
wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
|
||||
if wrapped_sizes is None:
|
||||
raise ValueError(
|
||||
f"Sizes must be set for test type {test_type}")
|
||||
iter_kwargs["size_wrapper"] = wrapped_sizes
|
||||
|
||||
#Otherwise expand the custom test options instead
|
||||
else:
|
||||
if test_info.custom_test_opts is None:
|
||||
raise ValueError("Test has type CUSTOM_INPUTS, but none given")
|
||||
iter_kwargs["custom_test_opts"] = test_info.custom_test_opts
|
||||
|
||||
# yapf: disable
|
||||
# Wrap all model cases in a pytest parameter & pass marks through
|
||||
return [
|
||||
pytest.param(
|
||||
model_type,
|
||||
ExpandableVLMTestArgs(
|
||||
**{k: v for k, v in zip(iter_kwargs.keys(), case)}
|
||||
),
|
||||
marks=test_info.marks if test_info.marks is not None else []
|
||||
) for case in list(itertools.product(*iter_kwargs.values()))
|
||||
]
|
||||
# yapf: enable
|
||||
|
||||
# Get a list per model type, where each entry contains a tuple of all of
|
||||
# that model type's cases, then flatten them into the top level so that
|
||||
# we can consume them in one mark.parametrize call.
|
||||
cases_by_model_type = [
|
||||
get_model_type_cases(model_type, test_info)
|
||||
for model_type, test_info in matching_tests.items()
|
||||
]
|
||||
return list(itertools.chain(*cases_by_model_type))
|
||||
|
||||
|
||||
def get_wrapped_test_sizes(
|
||||
test_info: VLMTestInfo,
|
||||
test_type: VLMTestType) -> tuple[ImageSizeWrapper, ...]:
|
||||
"""Given a test info which may have size factors or fixed sizes, wrap them
|
||||
and combine them into an iterable, each of which will be used in parameter
|
||||
expansion.
|
||||
|
||||
Args:
|
||||
test_info: Test configuration to be expanded.
|
||||
test_type: The type of test being filtered for.
|
||||
"""
|
||||
# If it is an embedding test, we always use the EMBEDDING_SIZE_FACTORS
|
||||
if test_type == VLMTestType.EMBEDDING:
|
||||
return tuple([
|
||||
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
|
||||
for factor in EMBEDDING_SIZE_FACTORS
|
||||
])
|
||||
# Custom inputs have preprocessed inputs
|
||||
elif test_type == VLMTestType.CUSTOM_INPUTS:
|
||||
return tuple()
|
||||
|
||||
size_factors = test_info.image_size_factors \
|
||||
if test_info.image_size_factors else []
|
||||
fixed_sizes = test_info.image_sizes \
|
||||
if test_info.image_sizes else []
|
||||
|
||||
wrapped_factors = [
|
||||
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
|
||||
for factor in size_factors
|
||||
]
|
||||
|
||||
wrapped_sizes = [
|
||||
ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size)
|
||||
for size in fixed_sizes
|
||||
]
|
||||
|
||||
return tuple(wrapped_factors + wrapped_sizes)
|
||||
176
tests/models/multimodal/generation/vlm_utils/core.py
Normal file
176
tests/models/multimodal/generation/vlm_utils/core.py
Normal file
@@ -0,0 +1,176 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Core test implementation to be shared across modalities."""
|
||||
from typing import Any, Callable, Optional, Union
|
||||
|
||||
import torch
|
||||
from PIL.Image import Image
|
||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||
|
||||
from vllm.config import TaskOption
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
|
||||
from .....conftest import HfRunner, VllmRunner
|
||||
from ....registry import HF_EXAMPLE_MODELS
|
||||
from .types import RunnerOutput
|
||||
|
||||
|
||||
def run_test(
|
||||
*,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
inputs: list[tuple[list[str], list[Union[list[Image], Image]]]],
|
||||
model: str,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
enforce_eager: bool,
|
||||
max_model_len: int,
|
||||
max_num_seqs: int,
|
||||
hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
|
||||
vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]],
|
||||
auto_cls: type[_BaseAutoModelClass],
|
||||
use_tokenizer_eos: bool,
|
||||
comparator: Callable[..., None],
|
||||
get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]],
|
||||
stop_str: Optional[list[str]],
|
||||
limit_mm_per_prompt: dict[str, int],
|
||||
vllm_runner_kwargs: Optional[dict[str, Any]],
|
||||
hf_model_kwargs: Optional[dict[str, Any]],
|
||||
patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]],
|
||||
task: TaskOption = "auto",
|
||||
runner_mm_key: str = "images",
|
||||
distributed_executor_backend: Optional[str] = None,
|
||||
tensor_parallel_size: int = 1,
|
||||
vllm_embeddings: Optional[torch.Tensor] = None,
|
||||
):
|
||||
"""Modality agnostic test test executor for comparing HF/vLLM outputs."""
|
||||
# In the case of embeddings, vLLM takes separate input tensors
|
||||
vllm_inputs = vllm_embeddings if vllm_embeddings is not None else inputs
|
||||
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
# Disable other modalities to save memory
|
||||
default_limits = {"image": 0, "video": 0, "audio": 0}
|
||||
limit_mm_per_prompt = default_limits | limit_mm_per_prompt
|
||||
|
||||
vllm_outputs_per_mm = []
|
||||
hf_outputs_per_mm = []
|
||||
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
|
||||
vllm_runner_kwargs_: dict[str, Any] = {
|
||||
"disable_mm_preprocessor_cache": True,
|
||||
}
|
||||
if model_info.tokenizer:
|
||||
vllm_runner_kwargs_["tokenizer_name"] = model_info.tokenizer
|
||||
if model_info.tokenizer_mode:
|
||||
vllm_runner_kwargs_["tokenizer_mode"] = model_info.tokenizer_mode
|
||||
if model_info.hf_overrides:
|
||||
vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
|
||||
|
||||
if vllm_runner_kwargs:
|
||||
vllm_runner_kwargs_.update(vllm_runner_kwargs)
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=max_num_seqs,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=enforce_eager,
|
||||
task=task,
|
||||
**vllm_runner_kwargs_) as vllm_model:
|
||||
tokenizer = vllm_model.model.get_tokenizer()
|
||||
|
||||
vllm_kwargs: dict[str, Any] = {}
|
||||
if get_stop_token_ids is not None:
|
||||
vllm_kwargs["stop_token_ids"] = get_stop_token_ids(tokenizer)
|
||||
if stop_str:
|
||||
vllm_kwargs["stop"] = stop_str
|
||||
|
||||
for prompts, media in vllm_inputs:
|
||||
vllm_kwargs[runner_mm_key] = media
|
||||
vllm_output = vllm_model.generate_greedy_logprobs(
|
||||
prompts, max_tokens, num_logprobs=num_logprobs, **vllm_kwargs)
|
||||
vllm_outputs_per_mm.append(vllm_output)
|
||||
|
||||
hf_model = hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=auto_cls,
|
||||
model_kwargs=hf_model_kwargs)
|
||||
|
||||
# Some models need to patch things like the model processor, e.g., internvl
|
||||
if patch_hf_runner is not None:
|
||||
hf_model = patch_hf_runner(hf_model)
|
||||
|
||||
with hf_model, torch.no_grad():
|
||||
tokenizer = hf_model.tokenizer
|
||||
|
||||
# Some models need to explicitly pass the eos_token_id off the tokenizer
|
||||
# or processor for a good comparison;
|
||||
# currently assume processor/tokenizer agree on the EOS, and pull it off
|
||||
# the tokenizer if requested.
|
||||
hf_kwargs = {}
|
||||
if use_tokenizer_eos:
|
||||
hf_kwargs["eos_token_id"] = tokenizer.eos_token_id
|
||||
if stop_str:
|
||||
hf_kwargs["stop_strings"] = stop_str
|
||||
|
||||
for prompts, media in inputs:
|
||||
hf_kwargs[runner_mm_key] = media
|
||||
hf_output = hf_model.generate_greedy_logprobs_limit(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tokenizer=tokenizer,
|
||||
**hf_kwargs)
|
||||
hf_outputs_per_mm.append(hf_output)
|
||||
|
||||
# Apply output processing / sanitation to the vLLM and HF runner results
|
||||
hf_outputs_per_mm, vllm_outputs_per_mm = process_runner_outputs(
|
||||
model,
|
||||
first_runner_outputs=hf_outputs_per_mm,
|
||||
second_runner_outputs=vllm_outputs_per_mm,
|
||||
first_runner_processor=hf_output_post_proc,
|
||||
second_runner_processor=vllm_output_post_proc,
|
||||
)
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm,
|
||||
vllm_outputs_per_mm):
|
||||
# This is usually check_logprobs_close, but it's passed through to
|
||||
# allow things like check_outputs_equal where needed
|
||||
comparator(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
|
||||
|
||||
def process_runner_outputs(
|
||||
model,
|
||||
first_runner_outputs,
|
||||
second_runner_outputs,
|
||||
first_runner_processor=None,
|
||||
second_runner_processor=None,
|
||||
):
|
||||
"""Applies the runner processor(s) to the runner outputs, if any."""
|
||||
if first_runner_processor is not None:
|
||||
first_runner_outputs = process_outputs(first_runner_processor, model,
|
||||
first_runner_outputs)
|
||||
if second_runner_processor is not None:
|
||||
second_runner_outputs = process_outputs(second_runner_processor, model,
|
||||
second_runner_outputs)
|
||||
return first_runner_outputs, second_runner_outputs
|
||||
|
||||
|
||||
def process_outputs(output_processor, model, outputs_per_image):
|
||||
"""Applies a model specific post-processor function to a runner's output"""
|
||||
return [[output_processor(res, model) for res in outputs]
|
||||
for outputs in outputs_per_image]
|
||||
122
tests/models/multimodal/generation/vlm_utils/custom_inputs.py
Normal file
122
tests/models/multimodal/generation/vlm_utils/custom_inputs.py
Normal file
@@ -0,0 +1,122 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Custom input builders for edge-cases in different models."""
|
||||
from io import BytesIO
|
||||
from typing import Callable
|
||||
|
||||
import requests
|
||||
from PIL import Image
|
||||
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.video import (rescale_video_size, resize_video,
|
||||
sample_frames_from_video)
|
||||
|
||||
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
|
||||
from .builders import build_multi_image_inputs, build_single_image_inputs
|
||||
from .types import ImageSizeWrapper, SizeType
|
||||
|
||||
|
||||
def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
|
||||
"""Builds inputs for multi-image (varied sizes/aspect ratio) testing.
|
||||
|
||||
Args:
|
||||
formatter: model-specific prompt formatter.
|
||||
"""
|
||||
stop_sign = IMAGE_ASSETS[0].pil_image
|
||||
cherry_blossom = IMAGE_ASSETS[1].pil_image
|
||||
|
||||
# Apply the selected formatter to the base prompts
|
||||
img_prompts = [
|
||||
"<image><image>\nDescribe 2 images.",
|
||||
"<image><image>\nDescribe 2 images.",
|
||||
"<image><image><image><image>\nDescribe 4 images.",
|
||||
"<image>\nWhat is the season?",
|
||||
]
|
||||
formatted_prompts = [formatter(prompt) for prompt in img_prompts]
|
||||
|
||||
return [(
|
||||
formatted_prompts,
|
||||
[
|
||||
[stop_sign, cherry_blossom],
|
||||
# Images with different sizes and aspect-ratios
|
||||
[
|
||||
rescale_image_size(stop_sign, 0.1),
|
||||
stop_sign,
|
||||
],
|
||||
[
|
||||
stop_sign,
|
||||
rescale_image_size(stop_sign, 0.25),
|
||||
cherry_blossom.resize((183, 488)),
|
||||
cherry_blossom.resize((488, 183))
|
||||
],
|
||||
cherry_blossom,
|
||||
])]
|
||||
|
||||
|
||||
def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
|
||||
num_frames: int = 16):
|
||||
"""Builds inputs for multi-video (varied sizes/aspect ratio) testing.
|
||||
|
||||
Args:
|
||||
formatter: model-specific prompt formatter.
|
||||
"""
|
||||
video = sample_frames_from_video(VIDEO_ASSETS[0].np_ndarrays, num_frames)
|
||||
# Apply the selected formatter to the base prompts
|
||||
video_prompts = [
|
||||
"<video><video>\nDescribe 2 videos.",
|
||||
"<video><video>\nDescribe 2 videos.",
|
||||
"<video><video><video><video>\nDescribe 4 videos.",
|
||||
"<video>\nWhy is this video funny?",
|
||||
]
|
||||
formatted_prompts = [formatter(prompt) for prompt in video_prompts]
|
||||
|
||||
return [(
|
||||
formatted_prompts,
|
||||
[
|
||||
[video, video],
|
||||
# Videos with different sizes and aspect-ratios
|
||||
[
|
||||
rescale_video_size(video, 0.1),
|
||||
video,
|
||||
],
|
||||
[
|
||||
video,
|
||||
rescale_video_size(video, 0.25),
|
||||
resize_video(video, (183, 488)),
|
||||
resize_video(video, (488, 183))
|
||||
],
|
||||
video,
|
||||
])]
|
||||
|
||||
|
||||
def different_patch_input_cases_internvl():
|
||||
images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
|
||||
formatter = lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
|
||||
single_img_prompts = [
|
||||
"<image>\nWhat's the content in the center of the image?",
|
||||
"<image>\nWhat is the season?",
|
||||
]
|
||||
multi_img_prompts = [
|
||||
"Image-1: <image>\nImage-2: <image>\nDescribe the two images in detail.\n", # noqa: E501
|
||||
]
|
||||
formatted_sprompts = [formatter(prompt) for prompt in single_img_prompts]
|
||||
formatted_mprompts = [formatter(prompt) for prompt in multi_img_prompts]
|
||||
|
||||
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5, 1.0])
|
||||
return [
|
||||
build_single_image_inputs(images, formatted_sprompts, wrapped_sf),
|
||||
build_multi_image_inputs([images], formatted_mprompts, wrapped_sf),
|
||||
]
|
||||
|
||||
|
||||
def windows_attention_image_qwen2_5_vl():
|
||||
# image from regression issue: https://github.com/vllm-project/vllm/issues/15122
|
||||
image_url = "https://aomediacodec.github.io/av1-avif/testFiles/Link-U/hato.jpg"
|
||||
image = Image.open(BytesIO(requests.get(image_url).content))
|
||||
|
||||
question = "Describe the image."
|
||||
img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
|
||||
prompt = (f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n")
|
||||
|
||||
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
|
||||
return build_single_image_inputs([image], [prompt], wrapped_sf)
|
||||
708
tests/models/multimodal/generation/vlm_utils/model_utils.py
Normal file
708
tests/models/multimodal/generation/vlm_utils/model_utils.py
Normal file
@@ -0,0 +1,708 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Common utility functions relating to different models that are useful
|
||||
for manipulating the input / output of HF & vLLM test runners, which are
|
||||
typically specific to a small subset of models.
|
||||
"""
|
||||
import re
|
||||
import types
|
||||
from pathlib import PosixPath
|
||||
from typing import Optional, Union
|
||||
|
||||
import torch
|
||||
from PIL.Image import Image
|
||||
from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
|
||||
GenerationConfig)
|
||||
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.transformers_utils.tokenizer import patch_padding_side
|
||||
|
||||
from .....conftest import HfRunner, ImageAsset, _ImageAssets
|
||||
from .types import RunnerOutput
|
||||
|
||||
|
||||
####### vLLM output processors functions
|
||||
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [blip2 models] to be comparable with hf output."""
|
||||
_, output_str, out_logprobs = vllm_output
|
||||
|
||||
hf_output_str = output_str + "\n"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
hf_output_ids = tokenizer.encode(hf_output_str)
|
||||
assert hf_output_ids[0] == tokenizer.bos_token_id
|
||||
hf_output_ids = hf_output_ids[1:]
|
||||
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [fuyu models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
hf_output_str = output_str.lstrip() + "|ENDOFTEXT|"
|
||||
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def qwen_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
"""Sanitize vllm output [qwen models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
hf_output_str = output_str + "<|endoftext|>"
|
||||
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def qwen2_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
"""Sanitize vllm output [qwen2 models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
hf_output_str = output_str + "<|im_end|>"
|
||||
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def kimiv_vl_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
"""Sanitize vllm output [kimi_vl models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
hf_output_str = output_str + "<|im_end|>[EOS]"
|
||||
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
config = AutoConfig.from_pretrained(model)
|
||||
mm_token_id = config.image_token_index
|
||||
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
|
||||
|
||||
|
||||
def llava_video_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
config = AutoConfig.from_pretrained(model)
|
||||
mm_token_id = config.video_token_index
|
||||
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
|
||||
|
||||
|
||||
def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
|
||||
mm_token_id: int) -> RunnerOutput:
|
||||
"""Sanitize vllm output [Llava models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
eos_token_id = tokenizer.eos_token_id
|
||||
|
||||
hf_output_ids = [
|
||||
token_id for idx, token_id in enumerate(output_ids)
|
||||
if token_id != mm_token_id or output_ids[idx - 1] != mm_token_id
|
||||
]
|
||||
|
||||
assert output_str[0] == " "
|
||||
hf_output_str = output_str[1:]
|
||||
if hf_output_ids[-1] == eos_token_id:
|
||||
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
|
||||
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def llava_onevision_hf_model_kwargs(model: str) -> dict:
|
||||
"""Workaround to fix the sliding window issue in llava_onevision."""
|
||||
config = AutoConfig.from_pretrained(model)
|
||||
config.text_config.sliding_window = None
|
||||
return config.to_dict()
|
||||
|
||||
|
||||
def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [llava-onevision] to compare with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
config = AutoConfig.from_pretrained(model)
|
||||
video_token_id = config.video_token_index
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
eos_token_id = tokenizer.eos_token_id
|
||||
|
||||
hf_output_ids = [
|
||||
token_id for idx, token_id in enumerate(output_ids)
|
||||
if token_id != video_token_id or output_ids[idx - 1] != video_token_id
|
||||
]
|
||||
|
||||
hf_output_str = output_str
|
||||
if hf_output_ids[-1] == eos_token_id:
|
||||
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
|
||||
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [mantis] to compare with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
hf_output_str = output_str + "<|eot_id|>"
|
||||
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [phi3v] to be comparable with hf output."""
|
||||
_, output_str, out_logprobs = vllm_output
|
||||
|
||||
output_str_without_image = re.sub(r"(<\|image_\d+\|>)+", "", output_str)
|
||||
assert output_str_without_image[0] == " "
|
||||
output_str_without_image = output_str_without_image[1:]
|
||||
|
||||
hf_output_str = output_str_without_image + "<|end|><|endoftext|>"
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
hf_output_ids = tokenizer.encode(output_str_without_image)
|
||||
assert hf_output_ids[0] == 1
|
||||
hf_output_ids = hf_output_ids[1:]
|
||||
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
config = AutoConfig.from_pretrained(model)
|
||||
image_token_id = config.image_token_index
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
eos_token_id = tokenizer.eos_token_id
|
||||
|
||||
hf_output_ids = [
|
||||
token_id for idx, token_id in enumerate(output_ids)
|
||||
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
|
||||
]
|
||||
|
||||
hf_output_str = output_str
|
||||
|
||||
if hf_output_ids[-1] == eos_token_id:
|
||||
hf_output_str = hf_output_str + tokenizer.decode(eos_token_id)
|
||||
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
####### Post-processors for HF outputs
|
||||
def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<|end▁of▁sentence|>"):
|
||||
output_str = output_str.split("<|end▁of▁sentence|>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def idefics3_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<end_of_utterance>"):
|
||||
output_str = output_str.split("<end_of_utterance>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def smolvlm_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
# Based on Idefics3
|
||||
return idefics3_trunc_hf_output(hf_output, model)
|
||||
|
||||
|
||||
def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<|eot_id|>"):
|
||||
output_str = output_str.split("<|eot_id|>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def minimax_vl_01_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<end_of_sentence>"):
|
||||
output_str = output_str.split("<end_of_sentence>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
####### Functions for converting image assets to embeddings
|
||||
def get_llava_embeddings(image_assets: _ImageAssets):
|
||||
return [asset.image_embeds for asset in image_assets]
|
||||
|
||||
|
||||
####### Prompt path encoders for models that need models on disk
|
||||
def qwen_prompt_path_encoder(
|
||||
tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset],
|
||||
_ImageAssets]) -> str:
|
||||
"""Given a temporary dir path, export one or more image assets into the
|
||||
tempdir & replace its contents with the local path to the string so that
|
||||
the HF version of Qwen-VL can resolve the path and load the image in its
|
||||
forward() call.
|
||||
|
||||
Args:
|
||||
tmp_path: Tempdir for test under consideration.
|
||||
prompt: Prompt with image placeholders.
|
||||
assets: list of image assets whose len equals the num placeholders.
|
||||
"""
|
||||
# Ensure that the number of placeholders matches the number of assets;
|
||||
# If this is not true, the test is probably written incorrectly.
|
||||
assert prompt.count("<img></img>") == len(assets)
|
||||
|
||||
# Replace the placeholders with local paths to the exported assets
|
||||
for asset in assets:
|
||||
image_tmp_path = tmp_path / f"{asset.name}.jpg"
|
||||
asset.pil_image.save(image_tmp_path)
|
||||
prompt = prompt.replace(
|
||||
"<img></img>",
|
||||
f"<img>{image_tmp_path}</img>",
|
||||
1,
|
||||
)
|
||||
return prompt
|
||||
|
||||
|
||||
####### Model-specific HuggingFace runner patchers
|
||||
def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for GLM4."""
|
||||
hf_processor = hf_model.processor
|
||||
|
||||
def processor(*args, text="", images=None, **kwargs):
|
||||
if isinstance(images, Image):
|
||||
images = [images]
|
||||
# inputs is a custom class instead of dict or BatchFeature
|
||||
inputs = hf_processor(
|
||||
*args,
|
||||
prompt=text,
|
||||
images=images,
|
||||
**kwargs,
|
||||
)
|
||||
inputs = {
|
||||
k: inputs[k]
|
||||
for k in inputs.keys() # noqa
|
||||
if k not in ("seq_lens", "sft_format")
|
||||
}
|
||||
return BatchFeature(data=inputs, tensor_type="pt")
|
||||
|
||||
hf_model.processor = processor
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language.model.embed_tokens
|
||||
return hf_model
|
||||
|
||||
|
||||
def gemma3_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for Gemma 3."""
|
||||
hf_processor = hf_model.processor
|
||||
|
||||
def processor(*args, **kwargs):
|
||||
return hf_processor(*args, do_pan_and_scan=True, **kwargs)
|
||||
|
||||
hf_model.processor = processor
|
||||
|
||||
return hf_model
|
||||
|
||||
|
||||
def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for GLM4V."""
|
||||
hf_processor = hf_model.processor
|
||||
patch_padding_side(hf_processor)
|
||||
|
||||
def processor(*args, text="", images=None, **kwargs):
|
||||
if images is None:
|
||||
return hf_processor(*args, **kwargs)
|
||||
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
|
||||
contents = re.findall(
|
||||
r"<\|begin_of_image\|><\|endoftext\|><\|end_of_image\|>(.*?)<\|assistant\|>",
|
||||
text,
|
||||
)
|
||||
assert len(contents) == len(images)
|
||||
|
||||
return hf_processor.apply_chat_template(
|
||||
[{
|
||||
"role": "user",
|
||||
"image": image,
|
||||
"content": content
|
||||
} for image, content in zip(images, contents)],
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
hf_model.processor = processor
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.transformer.output_layer
|
||||
return hf_model
|
||||
|
||||
|
||||
def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for H2OVL."""
|
||||
|
||||
class H2OVLProcessor:
|
||||
"""A simple processor for H2OVL models."""
|
||||
|
||||
def __init__(self, hf_runner: HfRunner):
|
||||
self.num_image_token = hf_runner.model.num_image_token
|
||||
self.tokenizer = hf_runner.tokenizer
|
||||
|
||||
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
|
||||
trust_remote_code=True)
|
||||
self.vision_config = self.config.vision_config
|
||||
self.use_thumbnail = self.config.use_thumbnail
|
||||
self.use_msac = self.config.use_msac
|
||||
self.min_num = self.config.min_dynamic_patch
|
||||
self.max_num = self.config.max_dynamic_patch
|
||||
self.image_size = self.vision_config.image_size
|
||||
|
||||
def __call__(self, text: str, images: Union[Image, list[Image]],
|
||||
**kwargs):
|
||||
# yapf: disable
|
||||
from vllm.model_executor.models.h2ovl import (
|
||||
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl)
|
||||
|
||||
# yapf: enable
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
pixel_values = [
|
||||
image_to_pixel_values_h2ovl(
|
||||
image,
|
||||
input_size=self.image_size,
|
||||
min_num=self.min_num,
|
||||
max_num=self.max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
use_msac=self.use_msac,
|
||||
) for image in images
|
||||
]
|
||||
num_patches_list = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values
|
||||
]
|
||||
pixel_values = torch.cat(pixel_values, dim=0)
|
||||
for num_patches in num_patches_list:
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token \
|
||||
* num_patches
|
||||
image_tokens = IMG_START + context_tokens + IMG_END
|
||||
text = text.replace('<image>', image_tokens, 1)
|
||||
prompt = self.tokenizer(text, return_tensors="pt")
|
||||
prompt.update({"pixel_values": pixel_values})
|
||||
return prompt
|
||||
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
|
||||
"<IMG_CONTEXT>")
|
||||
hf_model.model.img_context_token_id = img_context_token_id
|
||||
hf_model.processor = H2OVLProcessor(hf_model)
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language_model.get_output_embeddings()
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate,
|
||||
hf_model.model)
|
||||
return hf_model
|
||||
|
||||
|
||||
def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for SkyworkR1V."""
|
||||
|
||||
class SkyworkR1VProcessor:
|
||||
"""A simple processor for SkyworkR1V."""
|
||||
|
||||
def __init__(self, hf_runner: HfRunner):
|
||||
self.num_image_token = hf_runner.model.num_image_token
|
||||
self.tokenizer = hf_runner.tokenizer
|
||||
|
||||
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
|
||||
trust_remote_code=True)
|
||||
self.vision_config = self.config.vision_config
|
||||
self.use_thumbnail = self.config.use_thumbnail
|
||||
self.min_num = self.config.min_dynamic_patch
|
||||
self.max_num = self.config.max_dynamic_patch
|
||||
self.image_size = self.vision_config.image_size
|
||||
|
||||
def __call__(self, text: str, images: Union[Image, list[Image]],
|
||||
**kwargs):
|
||||
from vllm.model_executor.models.skyworkr1v import (
|
||||
IMG_CONTEXT, IMG_END, IMG_START,
|
||||
image_to_pixel_values_skyworkr1v)
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
pixel_values = [
|
||||
image_to_pixel_values_skyworkr1v(
|
||||
image,
|
||||
input_size=self.image_size,
|
||||
min_num=self.min_num,
|
||||
max_num=self.max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
) for image in images
|
||||
]
|
||||
num_patches_list = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values
|
||||
]
|
||||
pixel_values = torch.cat(pixel_values, dim=0)
|
||||
for num_patches in num_patches_list:
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token \
|
||||
* num_patches
|
||||
image_tokens = IMG_START + context_tokens + IMG_END
|
||||
text = text.replace('<image>', image_tokens, 1)
|
||||
prompt = self.tokenizer(text, return_tensors="pt")
|
||||
prompt.update({"pixel_values": pixel_values})
|
||||
return prompt
|
||||
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
|
||||
"<IMG_CONTEXT>")
|
||||
hf_model.model.img_context_token_id = img_context_token_id
|
||||
hf_model.processor = SkyworkR1VProcessor(hf_model)
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language_model.get_output_embeddings()
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate,
|
||||
hf_model.model)
|
||||
return hf_model
|
||||
|
||||
|
||||
def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for InternVL."""
|
||||
|
||||
class InternVLProcessor:
|
||||
"""A simple processor for InternVL2 which misses a processor."""
|
||||
|
||||
def __init__(self, hf_runner: HfRunner):
|
||||
self.num_image_token = hf_runner.model.num_image_token
|
||||
self.tokenizer = hf_runner.tokenizer
|
||||
|
||||
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
|
||||
trust_remote_code=True)
|
||||
self.vision_config = self.config.vision_config
|
||||
self.use_thumbnail = self.config.use_thumbnail
|
||||
self.min_num = self.config.min_dynamic_patch
|
||||
self.max_num = self.config.max_dynamic_patch
|
||||
self.image_size = self.vision_config.image_size
|
||||
|
||||
def __call__(self, text: str, images: Union[Image, list[Image]],
|
||||
**kwargs):
|
||||
from vllm.model_executor.models.internvl import (
|
||||
IMG_CONTEXT, IMG_END, IMG_START,
|
||||
image_to_pixel_values_internvl)
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
pixel_values = [
|
||||
image_to_pixel_values_internvl(
|
||||
image,
|
||||
input_size=self.image_size,
|
||||
min_num=self.min_num,
|
||||
max_num=self.max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
) for image in images
|
||||
]
|
||||
num_patches_list = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values
|
||||
]
|
||||
pixel_values = torch.cat(pixel_values, dim=0)
|
||||
for num_patches in num_patches_list:
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token \
|
||||
* num_patches
|
||||
image_tokens = IMG_START + context_tokens + IMG_END
|
||||
text = text.replace('<image>', image_tokens, 1)
|
||||
prompt = self.tokenizer(text, return_tensors="pt")
|
||||
prompt.update({"pixel_values": pixel_values})
|
||||
return prompt
|
||||
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
|
||||
"<IMG_CONTEXT>")
|
||||
hf_model.model.img_context_token_id = img_context_token_id
|
||||
hf_model.processor = InternVLProcessor(hf_model)
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language_model.get_output_embeddings()
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate,
|
||||
hf_model.model)
|
||||
return hf_model
|
||||
|
||||
|
||||
def _internvl_generate(
|
||||
self,
|
||||
pixel_values: torch.FloatTensor,
|
||||
input_ids: torch.FloatTensor,
|
||||
attention_mask: Optional[torch.LongTensor] = None,
|
||||
**generate_kwargs,
|
||||
) -> torch.LongTensor:
|
||||
"""Generate method for InternVL2 model without fixed use_cache."""
|
||||
assert self.img_context_token_id is not None
|
||||
target_dtype = next(self.parameters()).dtype
|
||||
vit_embeds = self.extract_feature(pixel_values.to(target_dtype))
|
||||
input_embeds = self.language_model.get_input_embeddings()(input_ids)
|
||||
B, N, C = input_embeds.shape
|
||||
input_embeds = input_embeds.reshape(B * N, C)
|
||||
|
||||
input_ids = input_ids.reshape(B * N)
|
||||
selected = (input_ids == self.img_context_token_id)
|
||||
assert selected.sum() != 0
|
||||
input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
|
||||
|
||||
input_embeds = input_embeds.reshape(B, N, C)
|
||||
|
||||
forward_kwargs = dict(
|
||||
inputs_embeds=input_embeds,
|
||||
attention_mask=attention_mask,
|
||||
)
|
||||
if getattr(self, "use_visual_token_mask", False):
|
||||
visual_token_mask = selected.reshape(B, N, 1).to(input_embeds.dtype)
|
||||
forward_kwargs["visual_token_mask"] = visual_token_mask
|
||||
outputs = self.language_model.generate(
|
||||
**forward_kwargs,
|
||||
**generate_kwargs,
|
||||
)
|
||||
|
||||
return outputs
|
||||
|
||||
|
||||
def mantis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
from mantis.models.mllava import MLlavaProcessor
|
||||
|
||||
hf_model.processor = MLlavaProcessor.from_pretrained(hf_model.model_name)
|
||||
|
||||
orig_generate = hf_model.model.generate
|
||||
tokenizer = hf_model.processor.tokenizer
|
||||
|
||||
def _generate(self, *args, **kwargs):
|
||||
return orig_generate(
|
||||
*args,
|
||||
**kwargs,
|
||||
eos_token_id=[
|
||||
tokenizer.eos_token_id,
|
||||
tokenizer.convert_tokens_to_ids("<|eot_id|>"),
|
||||
],
|
||||
)
|
||||
|
||||
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
|
||||
|
||||
return hf_model
|
||||
|
||||
|
||||
def minicpmv_25_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
orig_generate = hf_model.model.generate
|
||||
|
||||
def _generate(
|
||||
self,
|
||||
*args,
|
||||
input_ids=None,
|
||||
pixel_values=None,
|
||||
image_sizes=None,
|
||||
image_bound=None,
|
||||
tgt_sizes=None,
|
||||
**kwargs,
|
||||
):
|
||||
model_inputs = {
|
||||
"input_ids": input_ids,
|
||||
"pixel_values": pixel_values,
|
||||
"image_sizes": image_sizes,
|
||||
"image_bound": image_bound,
|
||||
"tgt_sizes": tgt_sizes,
|
||||
}
|
||||
for k in list(model_inputs.keys()):
|
||||
if model_inputs[k] is None:
|
||||
model_inputs.pop(k)
|
||||
|
||||
return orig_generate(model_inputs, *args, decode_text=False, **kwargs)
|
||||
|
||||
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
|
||||
|
||||
return hf_model
|
||||
|
||||
|
||||
def minicpmo_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
orig_generate = hf_model.model.generate
|
||||
|
||||
def _generate(self, *args, image_sizes=None, **kwargs):
|
||||
return orig_generate(*args, decode_text=False, **kwargs)
|
||||
|
||||
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
|
||||
|
||||
return hf_model
|
||||
|
||||
|
||||
def minicpmv_26_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
orig_generate = hf_model.model.generate
|
||||
|
||||
def _generate(self, *args, image_sizes=None, **kwargs):
|
||||
return orig_generate(*args, decode_text=False, **kwargs)
|
||||
|
||||
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
|
||||
|
||||
return hf_model
|
||||
|
||||
|
||||
def minimax_vl_01_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
orig_generate = hf_model.model.generate
|
||||
|
||||
def _generate(self, *args, image_sizes=None, **kwargs):
|
||||
return orig_generate(*args, decode_text=False, **kwargs)
|
||||
|
||||
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
|
||||
|
||||
return hf_model
|
||||
|
||||
|
||||
def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for Molmo."""
|
||||
hf_processor = hf_model.processor
|
||||
|
||||
def _processor(*args, **kwargs):
|
||||
return hf_processor.process(*args, **kwargs)
|
||||
|
||||
hf_model.processor = _processor
|
||||
|
||||
def _generate(self, max_new_tokens=None, do_sample=None, **kwargs):
|
||||
batch = {
|
||||
k: kwargs.pop(k).unsqueeze(0)
|
||||
for k in ("input_ids", "images", "image_input_idx", "image_masks")
|
||||
if k in kwargs
|
||||
}
|
||||
batch = BatchFeature(batch).to(dtype=self.dtype)
|
||||
|
||||
return self.generate_from_batch(
|
||||
batch,
|
||||
generation_config=GenerationConfig(
|
||||
max_new_tokens=max_new_tokens,
|
||||
stop_strings="<|endoftext|>",
|
||||
do_sample=do_sample,
|
||||
),
|
||||
**kwargs,
|
||||
)
|
||||
|
||||
hf_model.model.generate = types.MethodType(_generate, hf_model.model)
|
||||
|
||||
return hf_model
|
||||
|
||||
|
||||
def ovis2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for Ovis2."""
|
||||
hf_model.model.visual_tokenizer.to(hf_model.dtype)
|
||||
hf_model.model.vte.to(hf_model.dtype)
|
||||
hf_model.model.llm.to(hf_model.dtype)
|
||||
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.llm.get_output_embeddings()
|
||||
|
||||
def processor(*args, text="", images=None, **kwargs):
|
||||
text_tokenizer = hf_model.model.get_text_tokenizer()
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
|
||||
text = text.split("<|im_start|>user\n")[1].split("<|im_end|>\n")[0]
|
||||
|
||||
prompt, input_ids, pixel_values = hf_model.model.preprocess_inputs(
|
||||
text_or_conversations=text, images=images)
|
||||
attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)
|
||||
|
||||
inputs = {
|
||||
"inputs": input_ids.unsqueeze(0),
|
||||
"pixel_values": pixel_values.unsqueeze(0),
|
||||
"attention_mask": attention_mask.unsqueeze(0),
|
||||
}
|
||||
return BatchFeature(data=inputs, tensor_type="pt")
|
||||
|
||||
hf_model.processor = processor
|
||||
return hf_model
|
||||
139
tests/models/multimodal/generation/vlm_utils/runners.py
Normal file
139
tests/models/multimodal/generation/vlm_utils/runners.py
Normal file
@@ -0,0 +1,139 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Entrypoints for wrapping the core run_test implementation for specific test
|
||||
types / modalities.
|
||||
"""
|
||||
from pathlib import PosixPath
|
||||
|
||||
from .....conftest import HfRunner, VllmRunner, _ImageAssets, _VideoAssets
|
||||
from . import builders, core
|
||||
from .types import ExpandableVLMTestArgs, VLMTestInfo
|
||||
|
||||
|
||||
####### Entrypoints for running different test types
|
||||
def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: _ImageAssets):
|
||||
assert test_case.size_wrapper is not None
|
||||
inputs = builders.build_single_image_inputs_from_test_info(
|
||||
model_test_info, image_assets, test_case.size_wrapper, tmp_path)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
inputs=inputs,
|
||||
model=test_case.model,
|
||||
dtype=test_case.dtype,
|
||||
max_tokens=test_case.max_tokens,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
runner_mm_key="images",
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
|
||||
|
||||
def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: _ImageAssets):
|
||||
assert test_case.size_wrapper is not None
|
||||
inputs = builders.build_multi_image_inputs_from_test_info(
|
||||
model_test_info, image_assets, test_case.size_wrapper, tmp_path)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
inputs=inputs,
|
||||
model=test_case.model,
|
||||
dtype=test_case.dtype,
|
||||
max_tokens=test_case.max_tokens,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"image": len(image_assets)},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
runner_mm_key="images",
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
|
||||
|
||||
def run_embedding_test(*, model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: _ImageAssets):
|
||||
assert test_case.size_wrapper is not None
|
||||
inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
|
||||
model_test_info, image_assets, test_case.size_wrapper)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
inputs=inputs,
|
||||
model=test_case.model,
|
||||
dtype=test_case.dtype,
|
||||
max_tokens=test_case.max_tokens,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
vllm_embeddings=vllm_embeddings,
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
runner_mm_key="images",
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
|
||||
|
||||
def run_video_test(
|
||||
*,
|
||||
model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
video_assets: _VideoAssets,
|
||||
):
|
||||
assert test_case.size_wrapper is not None
|
||||
assert test_case.num_video_frames is not None
|
||||
inputs = builders.build_video_inputs_from_test_info(
|
||||
model_test_info, video_assets, test_case.size_wrapper,
|
||||
test_case.num_video_frames)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
inputs=inputs,
|
||||
model=test_case.model,
|
||||
dtype=test_case.dtype,
|
||||
max_tokens=test_case.max_tokens,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"video": len(video_assets)},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
runner_mm_key="videos",
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
|
||||
|
||||
def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner]):
|
||||
# Custom test cases can provide inputs directly, but they need to
|
||||
# explicitly provided a CustomTestConfig, which wraps the inputs and
|
||||
# the limit_mm_per_prompt
|
||||
assert test_case.custom_test_opts is not None
|
||||
|
||||
inputs = test_case.custom_test_opts.inputs
|
||||
limit_mm_per_prompt = test_case.custom_test_opts.limit_mm_per_prompt
|
||||
runner_mm_key = test_case.custom_test_opts.runner_mm_key
|
||||
# Inputs, limit_mm_per_prompt, and runner_mm_key should all be set
|
||||
assert inputs is not None
|
||||
assert limit_mm_per_prompt is not None
|
||||
assert runner_mm_key is not None
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
vllm_runner=vllm_runner,
|
||||
inputs=inputs,
|
||||
model=test_case.model,
|
||||
dtype=test_case.dtype,
|
||||
max_tokens=test_case.max_tokens,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
runner_mm_key=runner_mm_key,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
188
tests/models/multimodal/generation/vlm_utils/types.py
Normal file
188
tests/models/multimodal/generation/vlm_utils/types.py
Normal file
@@ -0,0 +1,188 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
"""Types for writing multimodal model tests."""
|
||||
from collections.abc import Iterable
|
||||
from enum import Enum
|
||||
from pathlib import PosixPath
|
||||
from typing import Any, Callable, NamedTuple, Optional, Union
|
||||
|
||||
import torch
|
||||
from PIL.Image import Image
|
||||
from pytest import MarkDecorator
|
||||
from transformers import AutoModelForCausalLM
|
||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||
|
||||
from vllm.config import TaskOption
|
||||
from vllm.sequence import SampleLogprobs
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
|
||||
from .....conftest import IMAGE_ASSETS, HfRunner, ImageAsset, _ImageAssets
|
||||
from ....utils import check_logprobs_close
|
||||
|
||||
# meta image tag; will be replaced by the appropriate tag for the model
|
||||
TEST_IMG_PLACEHOLDER = "<vlm_image>"
|
||||
TEST_VIDEO_PLACEHOLDER = "<vlm_video>"
|
||||
|
||||
# yapf: disable
|
||||
SINGLE_IMAGE_BASE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign": f"{TEST_IMG_PLACEHOLDER}What's the content of the image?",
|
||||
"cherry_blossom": f"{TEST_IMG_PLACEHOLDER}What is the season?",
|
||||
})
|
||||
|
||||
MULTI_IMAGE_BASE_PROMPT = f"Image-1: {TEST_IMG_PLACEHOLDER}Image-2: {TEST_IMG_PLACEHOLDER}Describe the two images in detail.\n" # noqa: E501
|
||||
VIDEO_BASE_PROMPT = f"{TEST_VIDEO_PLACEHOLDER}Why is this video funny?"
|
||||
|
||||
|
||||
IMAGE_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0), (0.25, 0.5, 1.0)]
|
||||
EMBEDDING_SIZE_FACTORS = [(), (1.0, ), (1.0, 1.0, 1.0)]
|
||||
RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
|
||||
# yapf: enable
|
||||
|
||||
|
||||
class VLMTestType(Enum):
|
||||
IMAGE = 1
|
||||
MULTI_IMAGE = 2
|
||||
EMBEDDING = 3
|
||||
VIDEO = 4
|
||||
CUSTOM_INPUTS = 5
|
||||
|
||||
|
||||
class SizeType(Enum):
|
||||
SIZE_FACTOR = 1
|
||||
FIXED_SIZE = 2
|
||||
|
||||
|
||||
class CustomTestOptions(NamedTuple):
|
||||
inputs: list[tuple[list[str], list[Union[list[Image], Image]]]]
|
||||
limit_mm_per_prompt: dict[str, int]
|
||||
# kwarg to pass multimodal data in as to vllm/hf runner instances.
|
||||
runner_mm_key: str = "images"
|
||||
|
||||
|
||||
class ImageSizeWrapper(NamedTuple):
|
||||
type: SizeType
|
||||
# A size factor is a wrapper of 0+ floats,
|
||||
# while a fixed size contains an iterable of integer pairs
|
||||
data: Union[Iterable[float], Iterable[tuple[int, int]]]
|
||||
|
||||
|
||||
class VLMTestInfo(NamedTuple):
|
||||
"""Holds the configuration for 1+ tests for one model architecture."""
|
||||
|
||||
models: list[str]
|
||||
test_type: Union[VLMTestType, Iterable[VLMTestType]]
|
||||
|
||||
# Should be None only if this is a CUSTOM_INPUTS test
|
||||
prompt_formatter: Optional[Callable[[str], str]] = None
|
||||
img_idx_to_prompt: Callable[[int], str] = lambda idx: "<image>\n"
|
||||
video_idx_to_prompt: Callable[[int], str] = lambda idx: "<video>\n"
|
||||
|
||||
# Most models work on the single / multi-image prompts above, but in some
|
||||
# cases the log prob check fails, e.g., for paligemma. We allow passing
|
||||
# an override for the single image prompts / multi-image prompt for this
|
||||
# reason.
|
||||
single_image_prompts: Iterable[str] = SINGLE_IMAGE_BASE_PROMPTS
|
||||
multi_image_prompt: str = MULTI_IMAGE_BASE_PROMPT
|
||||
|
||||
# Function for converting ImageAssets to image embeddings;
|
||||
# We need to define this explicitly for embedding tests
|
||||
convert_assets_to_embeddings: Optional[Callable[[_ImageAssets],
|
||||
torch.Tensor]] = None
|
||||
|
||||
# Exposed options for vLLM runner; we change these in a several tests,
|
||||
# but the defaults are derived from VllmRunner & the engine defaults
|
||||
# These settings are chosen to avoid OOMs when running in the CI
|
||||
enforce_eager: bool = True
|
||||
max_model_len: int = 1024
|
||||
max_num_seqs: int = 256
|
||||
task: TaskOption = "auto"
|
||||
tensor_parallel_size: int = 1
|
||||
vllm_runner_kwargs: Optional[dict[str, Any]] = None
|
||||
|
||||
# Optional callable which gets a list of token IDs from the model tokenizer
|
||||
get_stop_token_ids: Optional[Callable[[AnyTokenizer], list[int]]] = None
|
||||
# Optional list of strings to stop generation, useful when stop tokens are
|
||||
# not special tokens in the tokenizer
|
||||
stop_str: Optional[list[str]] = None
|
||||
|
||||
# Exposed options for HF runner
|
||||
hf_model_kwargs: Optional[dict[str, Any]] = None
|
||||
# Indicates we should explicitly pass the EOS from the tokenizer
|
||||
use_tokenizer_eos: bool = False
|
||||
auto_cls: type[_BaseAutoModelClass] = AutoModelForCausalLM
|
||||
patch_hf_runner: Optional[Callable[[HfRunner], HfRunner]] = None
|
||||
|
||||
# Post processors that if defined, will run oun the outputs of the
|
||||
# vLLM and HF runner, respectively (useful for sanitization, etc).
|
||||
vllm_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]] = None
|
||||
hf_output_post_proc: Optional[Callable[[RunnerOutput, str], Any]] = None
|
||||
|
||||
# Consumes the output of the callables above and checks if they're equal
|
||||
comparator: Callable[..., None] = check_logprobs_close
|
||||
|
||||
# Default expandable params per test; these defaults can be overridden in
|
||||
# instances of this object; the complete set of test cases for the model
|
||||
# is all combinations of .models + all fields below
|
||||
max_tokens: Union[int, tuple[int]] = 128
|
||||
num_logprobs: Union[int, tuple[int]] = 5
|
||||
dtype: Union[str, Union[list[str], tuple[str, ...]]] = "auto"
|
||||
distributed_executor_backend: Optional[Union[str, Iterable[str]]] = None
|
||||
# Only expanded in video tests
|
||||
num_video_frames: Union[int, tuple[int]] = 16
|
||||
|
||||
# Fixed image sizes / image size factors; most tests use image_size_factors
|
||||
# The values provided for these two fields will be stacked and expanded
|
||||
# such that each model will consider each image size factor / image size
|
||||
# once per tests (much like concatenating and wrapping in one parametrize
|
||||
# call)
|
||||
image_size_factors: Iterable[Iterable[float]] = IMAGE_SIZE_FACTORS
|
||||
image_sizes: Optional[Iterable[Iterable[tuple[int, int]]]] = None
|
||||
|
||||
# Hack for updating a prompt to take into a local path; currently only used
|
||||
# for Qwen-VL, which requires encoding the image path / url into the prompt
|
||||
# for HF runner
|
||||
prompt_path_encoder: Optional[
|
||||
Callable[[PosixPath, str, Union[list[ImageAsset], _ImageAssets]],
|
||||
str]] = None # noqa: E501
|
||||
|
||||
# Allows configuring a test to run with custom inputs
|
||||
custom_test_opts: Optional[list[CustomTestOptions]] = None
|
||||
|
||||
marks: Optional[list[MarkDecorator]] = None
|
||||
|
||||
def get_non_parametrized_runner_kwargs(self):
|
||||
"""Returns a dictionary of expandable kwargs for items that are used
|
||||
in all test types, which are NOT used when creating the parametrized
|
||||
test cases.
|
||||
"""
|
||||
return {
|
||||
"enforce_eager": self.enforce_eager,
|
||||
"max_model_len": self.max_model_len,
|
||||
"max_num_seqs": self.max_num_seqs,
|
||||
"task": self.task,
|
||||
"tensor_parallel_size": self.tensor_parallel_size,
|
||||
"vllm_runner_kwargs": self.vllm_runner_kwargs,
|
||||
"hf_output_post_proc": self.hf_output_post_proc,
|
||||
"vllm_output_post_proc": self.vllm_output_post_proc,
|
||||
"auto_cls": self.auto_cls,
|
||||
"use_tokenizer_eos": self.use_tokenizer_eos,
|
||||
"comparator": self.comparator,
|
||||
"get_stop_token_ids": self.get_stop_token_ids,
|
||||
"hf_model_kwargs": self.hf_model_kwargs,
|
||||
"stop_str": self.stop_str,
|
||||
"patch_hf_runner": self.patch_hf_runner,
|
||||
}
|
||||
|
||||
|
||||
class ExpandableVLMTestArgs(NamedTuple):
|
||||
"""The expanded kwargs which correspond to a single test case."""
|
||||
model: str
|
||||
max_tokens: int
|
||||
num_logprobs: int
|
||||
dtype: str
|
||||
distributed_executor_backend: Optional[str]
|
||||
# Sizes are used for everything except for custom input tests
|
||||
size_wrapper: Optional[ImageSizeWrapper] = None
|
||||
# Video only
|
||||
num_video_frames: Optional[int] = None
|
||||
# Custom inputs only
|
||||
custom_test_opts: Optional[CustomTestOptions] = None
|
||||
Reference in New Issue
Block a user