[CI/Build] Move model-specific multi-modal processing tests (#11934)
Signed-off-by: DarkLight1337 <tlleungac@connect.ust.hk>
This commit is contained in:
0
tests/models/multimodal/processing/__init__.py
Normal file
0
tests/models/multimodal/processing/__init__.py
Normal file
201
tests/models/multimodal/processing/test_common.py
Normal file
201
tests/models/multimodal/processing/test_common.py
Normal file
@@ -0,0 +1,201 @@
|
||||
from functools import partial
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.inputs import InputProcessingContext
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.processing import ProcessingCache
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
|
||||
from ....multimodal.utils import random_audio, random_image, random_video
|
||||
|
||||
|
||||
def _test_processing_correctness(
|
||||
model_id: str,
|
||||
modalities: dict[str, bool],
|
||||
hit_rate: float,
|
||||
num_batches: int,
|
||||
simplify_rate: float,
|
||||
):
|
||||
if model_id == "TIGER-Lab/Mantis-8B-siglip-llama3":
|
||||
hf_overrides = {"architectures": ["MantisForConditionalGeneration"]}
|
||||
else:
|
||||
hf_overrides = {}
|
||||
|
||||
limit_mm_per_prompt = {
|
||||
modality: 3 if supports_multi else 1
|
||||
for modality, supports_multi in modalities.items()
|
||||
}
|
||||
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
task="auto",
|
||||
tokenizer=model_id,
|
||||
tokenizer_mode="auto",
|
||||
trust_remote_code=True,
|
||||
seed=0,
|
||||
dtype="float16",
|
||||
revision=None,
|
||||
hf_overrides=hf_overrides,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
)
|
||||
|
||||
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
|
||||
factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
|
||||
ctx = InputProcessingContext(
|
||||
model_config,
|
||||
tokenizer=cached_get_tokenizer(model_config.tokenizer),
|
||||
)
|
||||
# Ensure that it can fit all of the data
|
||||
cache = ProcessingCache(capacity=1 << 30)
|
||||
|
||||
baseline_processor = factories.build_processor(ctx, cache=None)
|
||||
cached_processor = factories.build_processor(ctx, cache=cache)
|
||||
dummy_inputs = baseline_processor.dummy_inputs
|
||||
tokenizer = baseline_processor.info.get_tokenizer()
|
||||
|
||||
rng = np.random.RandomState(0)
|
||||
|
||||
input_to_hit = {
|
||||
"image": Image.new("RGB", size=(128, 128)),
|
||||
"video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
|
||||
"audio": (np.zeros((512, )), 16000),
|
||||
}
|
||||
input_factory = {
|
||||
"image":
|
||||
partial(random_image, rng, min_wh=128, max_wh=256),
|
||||
"video":
|
||||
partial(random_video,
|
||||
rng,
|
||||
min_frames=2,
|
||||
max_frames=8,
|
||||
min_wh=128,
|
||||
max_wh=256),
|
||||
"audio":
|
||||
partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
|
||||
}
|
||||
|
||||
for batch_idx in range(num_batches):
|
||||
mm_data = {
|
||||
k:
|
||||
[(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
|
||||
for _ in range(rng.randint(limit_mm_per_prompt[k]))]
|
||||
for k in modalities
|
||||
}
|
||||
|
||||
mm_counts = {k: len(vs) for k, vs in mm_data.items()}
|
||||
prompt = dummy_inputs.get_dummy_processor_inputs(
|
||||
model_config.max_model_len,
|
||||
mm_counts,
|
||||
).prompt_text
|
||||
|
||||
# Drop unnecessary keys and test single -> multi conversion
|
||||
if rng.rand() < simplify_rate:
|
||||
for k in list(mm_data.keys()):
|
||||
if not mm_data[k]:
|
||||
del mm_data[k]
|
||||
elif len(mm_data[k]) == 1:
|
||||
mm_data[k] = mm_data[k][0]
|
||||
|
||||
baseline_result = baseline_processor.apply(
|
||||
prompt,
|
||||
mm_data=mm_data,
|
||||
hf_processor_mm_kwargs={},
|
||||
)
|
||||
cached_result = cached_processor.apply(
|
||||
prompt,
|
||||
mm_data=mm_data,
|
||||
hf_processor_mm_kwargs={},
|
||||
)
|
||||
|
||||
assert baseline_result == cached_result, (
|
||||
f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
|
||||
|
||||
baseline_tokenized_result = baseline_processor.apply(
|
||||
tokenizer.encode(prompt),
|
||||
mm_data=mm_data,
|
||||
hf_processor_mm_kwargs={},
|
||||
)
|
||||
|
||||
assert baseline_result == baseline_tokenized_result, (
|
||||
f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
|
||||
|
||||
cached_tokenized_result = cached_processor.apply(
|
||||
tokenizer.encode(prompt),
|
||||
mm_data=mm_data,
|
||||
hf_processor_mm_kwargs={},
|
||||
)
|
||||
|
||||
assert cached_result == cached_tokenized_result, (
|
||||
f"Failed ({batch_idx=}, {prompt=}, {mm_data=})")
|
||||
|
||||
|
||||
# yapf: disable
|
||||
# True if the model supports multiple data items of the modality per request
|
||||
@pytest.mark.parametrize(("model_id", "modalities"), [
|
||||
("rhymes-ai/Aria", {"image": True}),
|
||||
("Salesforce/blip2-opt-2.7b", {"image": False}),
|
||||
("facebook/chameleon-7b", {"image": False}),
|
||||
("adept/fuyu-8b", {"image": False}),
|
||||
("llava-hf/llava-1.5-7b-hf", {"image": True}),
|
||||
("llava-hf/llava-v1.6-mistral-7b-hf", {"image": True}),
|
||||
("llava-hf/LLaVA-NeXT-Video-7B-hf", {"video": False}),
|
||||
("llava-hf/llava-onevision-qwen2-0.5b-ov-hf", {"image": True, "video": True}), # noqa: E501
|
||||
("TIGER-Lab/Mantis-8B-siglip-llama3", {"image": True}),
|
||||
("mistral-community/pixtral-12b", {"image": True}),
|
||||
("Qwen/Qwen2-VL-2B-Instruct", {"image": True, "video": True}),
|
||||
("Qwen/Qwen2-Audio-7B-Instruct", {"audio": True}),
|
||||
("fixie-ai/ultravox-v0_3", {"audio": True}),
|
||||
])
|
||||
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
|
||||
@pytest.mark.parametrize("num_batches", [32])
|
||||
@pytest.mark.parametrize("simplify_rate", [1.0])
|
||||
# yapf: enable
|
||||
def test_processing_correctness(
|
||||
model_id: str,
|
||||
modalities: dict[str, bool],
|
||||
hit_rate: float,
|
||||
num_batches: int,
|
||||
simplify_rate: float,
|
||||
):
|
||||
_test_processing_correctness(
|
||||
model_id,
|
||||
modalities,
|
||||
hit_rate=hit_rate,
|
||||
num_batches=num_batches,
|
||||
simplify_rate=simplify_rate,
|
||||
)
|
||||
|
||||
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(("model_id", "modalities"), [
|
||||
("microsoft/Phi-3-vision-128k-instruct", {"image": True}),
|
||||
])
|
||||
@pytest.mark.parametrize("hit_rate", [0.3, 0.5, 1.0])
|
||||
@pytest.mark.parametrize("num_batches", [32])
|
||||
@pytest.mark.parametrize("simplify_rate", [1.0])
|
||||
# yapf: enable
|
||||
def test_processing_correctness_phi3v(
|
||||
model_id: str,
|
||||
modalities: dict[str, bool],
|
||||
hit_rate: float,
|
||||
num_batches: int,
|
||||
simplify_rate: float,
|
||||
):
|
||||
# HACK - this is an attempted workaround for the following bug
|
||||
# https://github.com/huggingface/transformers/issues/34307
|
||||
from transformers import AutoImageProcessor # noqa: F401
|
||||
from transformers import AutoProcessor # noqa: F401
|
||||
|
||||
AutoImageProcessor.from_pretrained(model_id, trust_remote_code=True)
|
||||
|
||||
_test_processing_correctness(
|
||||
model_id,
|
||||
modalities,
|
||||
hit_rate=hit_rate,
|
||||
num_batches=num_batches,
|
||||
simplify_rate=simplify_rate,
|
||||
)
|
||||
178
tests/models/multimodal/processing/test_idefics3.py
Normal file
178
tests/models/multimodal/processing/test_idefics3.py
Normal file
@@ -0,0 +1,178 @@
|
||||
"""Tests for Idefics3's multimodal preprocessing kwargs."""
|
||||
from typing import Optional
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import AutoImageProcessor, AutoTokenizer
|
||||
|
||||
from vllm.inputs import InputContext, token_inputs
|
||||
from vllm.multimodal import MultiModalRegistry
|
||||
|
||||
from ....conftest import _ImageAssets
|
||||
from ...utils import build_model_context
|
||||
|
||||
models = ["HuggingFaceM4/Idefics3-8B-Llama3"]
|
||||
|
||||
|
||||
# Wrap lazy imports to avoid initializing CUDA during test collection
|
||||
@pytest.fixture()
|
||||
def input_processor_for_idefics3():
|
||||
from vllm.model_executor.models.idefics3 import (
|
||||
input_processor_for_idefics3)
|
||||
return input_processor_for_idefics3
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def dummy_data_for_idefics3():
|
||||
from vllm.model_executor.models.idefics3 import dummy_data_for_idefics3
|
||||
return dummy_data_for_idefics3
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def get_max_idefics3_image_tokens():
|
||||
from vllm.model_executor.models.idefics3 import (
|
||||
get_max_idefics3_image_tokens)
|
||||
return get_max_idefics3_image_tokens
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("longest_edge", [None, 168, 336, 400, 2 * 336])
|
||||
def test_input_mapper_override(model: str, image_assets: _ImageAssets,
|
||||
longest_edge: Optional[int]):
|
||||
"""Ensure that the [default] input mapper handles size properly."""
|
||||
|
||||
mm_processor_kwargs = {
|
||||
"size": {
|
||||
"longest_edge": longest_edge
|
||||
}
|
||||
} if longest_edge is not None else {}
|
||||
ctx = build_model_context(
|
||||
model_name=model,
|
||||
tokenizer_name=model,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
)
|
||||
|
||||
hf_processor = AutoImageProcessor.from_pretrained(model,
|
||||
trust_remote_code=True,
|
||||
**mm_processor_kwargs)
|
||||
|
||||
mm_registry = MultiModalRegistry()
|
||||
mm_registry.init_mm_limits_per_prompt(ctx.model_config)
|
||||
|
||||
image = image_assets[0].pil_image
|
||||
hf_result = hf_processor.preprocess(
|
||||
image,
|
||||
return_tensors="pt",
|
||||
)
|
||||
|
||||
vllm_result = mm_registry.map_input(
|
||||
ctx.model_config,
|
||||
{"image": image},
|
||||
)
|
||||
|
||||
assert torch.all(hf_result["pixel_values"] == vllm_result["pixel_values"])
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("longest_edge, expected_max_tokens", [
|
||||
(None, 2873),
|
||||
(168, 169),
|
||||
(336, 169),
|
||||
(400, 338),
|
||||
(672, 338),
|
||||
])
|
||||
def test_max_tokens_override(get_max_idefics3_image_tokens, model: str,
|
||||
longest_edge: Optional[int],
|
||||
expected_max_tokens: int):
|
||||
"""Ensure get_max_idefics3_image_tokens handles mm_processor_kwargs."""
|
||||
size = {"longest_edge": longest_edge} if longest_edge is not None else None
|
||||
ctx = build_model_context(
|
||||
model_name=model,
|
||||
tokenizer_name=model,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=None,
|
||||
)
|
||||
|
||||
actual_max_tokens = get_max_idefics3_image_tokens(
|
||||
ctx=InputContext(ctx.model_config),
|
||||
size=size,
|
||||
)
|
||||
|
||||
assert expected_max_tokens == actual_max_tokens
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("longest_edge, toks_per_img, num_imgs", [
|
||||
(168, 169, 1),
|
||||
(168, 169, 2),
|
||||
(400, 338, 1),
|
||||
(400, 338, 2),
|
||||
])
|
||||
def test_dummy_data_override(dummy_data_for_idefics3, model: str,
|
||||
longest_edge: int, toks_per_img: int,
|
||||
num_imgs: int):
|
||||
"""Ensure dummy_data_for_idefics3 handles num_crops properly."""
|
||||
# Same as the previous test - don't initialize mm_processor_kwargs
|
||||
# in this test and assume that the kwargs will be correctly expanded by
|
||||
# the partial when calling the dummy data func.
|
||||
size = {"longest_edge": longest_edge} if longest_edge is not None else None
|
||||
ctx = build_model_context(
|
||||
model_name=model,
|
||||
tokenizer_name=model,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=None,
|
||||
)
|
||||
|
||||
dummy_data = dummy_data_for_idefics3(
|
||||
ctx=ctx,
|
||||
seq_len=8192, # Should be bigger than num_imgs * toks_per_img
|
||||
mm_counts={"image": num_imgs},
|
||||
size=size)
|
||||
sequence_data = dummy_data.seq_data
|
||||
# Ensure we have the right number of placeholders per size
|
||||
image_token_id = ctx.get_hf_config().image_token_id
|
||||
img_tok_count = sequence_data.get_token_ids().count(image_token_id)
|
||||
assert img_tok_count == toks_per_img * num_imgs
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("longest_edge,expected_toks_per_img,num_imgs", [
|
||||
(336, 169 * (1**2 + 1), 1),
|
||||
(336, 169 * (1**2 + 1), 2),
|
||||
(400, 169 * (2**2 + 1), 1),
|
||||
(400, 169 * (2**2 + 1), 2),
|
||||
])
|
||||
def test_input_processor_override(input_processor_for_idefics3,
|
||||
image_assets: _ImageAssets, model: str,
|
||||
longest_edge: int,
|
||||
expected_toks_per_img: int, num_imgs: int):
|
||||
"""Ensure input_processor_for_idefics3 handles num_crops properly."""
|
||||
# Same as the previous test - don't initialize mm_processor_kwargs
|
||||
# in this test and assume that the kwargs will be correctly expanded by
|
||||
# the partial when calling the custom input processor.
|
||||
size = {"longest_edge": longest_edge} if longest_edge is not None else None
|
||||
ctx = build_model_context(
|
||||
model_name=model,
|
||||
tokenizer_name=model,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=None,
|
||||
)
|
||||
|
||||
# Build the image str / prompt based on the number of images we pass
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
placeholders = "<image>" if num_imgs == 1 else "\n".join(
|
||||
f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
|
||||
prompt = f"<|begin_of_text|>User:{placeholders}\n<end_of_utterance>\nAssistant:" # noqa: E501
|
||||
images = [image_assets[0].pil_image.resize((336 * 4, 336 * 4))] * num_imgs
|
||||
|
||||
inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
|
||||
prompt=prompt,
|
||||
multi_modal_data={"image": images})
|
||||
|
||||
processed_inputs = input_processor_for_idefics3(ctx, inputs, size=size)
|
||||
|
||||
# Ensure we have the right number of placeholders per num_crops size
|
||||
image_token_id = ctx.get_hf_config().image_token_id
|
||||
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
|
||||
assert img_tok_count == expected_toks_per_img * num_imgs
|
||||
206
tests/models/multimodal/processing/test_internvl.py
Normal file
206
tests/models/multimodal/processing/test_internvl.py
Normal file
@@ -0,0 +1,206 @@
|
||||
"""Tests for InternVL's multimodal preprocessing kwargs."""
|
||||
from typing import Callable, Optional
|
||||
|
||||
import pytest
|
||||
from transformers import AutoTokenizer
|
||||
|
||||
from vllm.inputs import InputContext, token_inputs
|
||||
from vllm.multimodal import MultiModalRegistry
|
||||
|
||||
from ....conftest import _ImageAssets
|
||||
from ...utils import build_model_context
|
||||
|
||||
models = ["OpenGVLab/InternVL2-2B"]
|
||||
|
||||
|
||||
# Wrap lazy imports to avoid initializing CUDA during test collection
|
||||
@pytest.fixture()
|
||||
def input_processor_for_internvl():
|
||||
from vllm.model_executor.models.internvl import InternVLInputPipeline
|
||||
|
||||
pipeline = InternVLInputPipeline('<img>', '</img>', '<IMG_CONTEXT>')
|
||||
return pipeline.input_processor
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def dummy_data_for_internvl():
|
||||
from vllm.model_executor.models.internvl import InternVLInputPipeline
|
||||
|
||||
pipeline = InternVLInputPipeline('<img>', '</img>', '<IMG_CONTEXT>')
|
||||
return pipeline.dummy_data
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def get_max_internvl_image_tokens():
|
||||
from vllm.model_executor.models.internvl import (
|
||||
get_max_internvl_image_tokens)
|
||||
return get_max_internvl_image_tokens
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("max_dynamic_patch", [1, 4])
|
||||
@pytest.mark.parametrize("dynamic_image_size", [True, False, None])
|
||||
def test_input_mapper_override(
|
||||
model: str,
|
||||
image_assets: _ImageAssets,
|
||||
max_dynamic_patch: int,
|
||||
dynamic_image_size: Optional[bool],
|
||||
):
|
||||
mm_processor_kwargs = {
|
||||
"max_dynamic_patch": max_dynamic_patch,
|
||||
}
|
||||
if dynamic_image_size is not None:
|
||||
mm_processor_kwargs["dynamic_image_size"] = dynamic_image_size
|
||||
|
||||
expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1
|
||||
if dynamic_image_size is False:
|
||||
expected_num_patches = 1
|
||||
|
||||
ctx = build_model_context(
|
||||
model_name=model,
|
||||
tokenizer_name=model,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
)
|
||||
|
||||
mm_registry = MultiModalRegistry()
|
||||
mm_registry.init_mm_limits_per_prompt(ctx.model_config)
|
||||
|
||||
image = image_assets[0].pil_image.resize((448 * 2, 448 * 2))
|
||||
vllm_result = mm_registry.map_input(
|
||||
ctx.model_config,
|
||||
{"image": image},
|
||||
)
|
||||
assert vllm_result["pixel_values"].size(1) == expected_num_patches
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("max_dynamic_patch", [1, 4, None])
|
||||
@pytest.mark.parametrize("dynamic_image_size", [True, False, None])
|
||||
def test_max_tokens_override(
|
||||
get_max_internvl_image_tokens: Callable,
|
||||
model: str,
|
||||
max_dynamic_patch: Optional[int],
|
||||
dynamic_image_size: Optional[bool],
|
||||
):
|
||||
"""Ensure get_max_internvl_image_tokens handles mm_processor_kwargs."""
|
||||
ctx = build_model_context(
|
||||
model_name=model,
|
||||
tokenizer_name=model,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=None,
|
||||
)
|
||||
|
||||
if max_dynamic_patch is None:
|
||||
max_dynamic_patch = ctx.get_hf_config().max_dynamic_patch
|
||||
expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1
|
||||
if dynamic_image_size is False:
|
||||
expected_num_patches = 1
|
||||
expected_max_tokens = 256 * expected_num_patches
|
||||
|
||||
actual_max_tokens = get_max_internvl_image_tokens(
|
||||
ctx=InputContext(ctx.model_config),
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
assert expected_max_tokens == actual_max_tokens
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
@pytest.mark.parametrize("max_dynamic_patch", [1, 4, None])
|
||||
@pytest.mark.parametrize("dynamic_image_size", [True, False, None])
|
||||
def test_dummy_data_override(
|
||||
dummy_data_for_internvl: Callable,
|
||||
model: str,
|
||||
num_imgs: int,
|
||||
max_dynamic_patch: Optional[int],
|
||||
dynamic_image_size: Optional[bool],
|
||||
):
|
||||
"""Ensure dummy_data_for_internvl handles kwargs properly."""
|
||||
# Same as the previous test - don't initialize mm_processor_kwargs
|
||||
# in this test and assume that the kwargs will be correctly expanded by
|
||||
# the partial when calling the dummy data func.
|
||||
ctx = build_model_context(
|
||||
model_name=model,
|
||||
tokenizer_name=model,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=None,
|
||||
)
|
||||
|
||||
if max_dynamic_patch is None:
|
||||
max_dynamic_patch = ctx.get_hf_config().max_dynamic_patch
|
||||
expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1
|
||||
if dynamic_image_size is False:
|
||||
expected_num_patches = 1
|
||||
expected_max_tokens = 256 * expected_num_patches
|
||||
|
||||
dummy_data = dummy_data_for_internvl(
|
||||
ctx=ctx,
|
||||
seq_len=8192, # Should be bigger than num_imgs * toks_per_img
|
||||
mm_counts={"image": num_imgs},
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
sequence_data = dummy_data.seq_data
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
|
||||
image_token_id = tokenizer.encode('<IMG_CONTEXT>',
|
||||
add_special_tokens=False)[0]
|
||||
|
||||
# Ensure we have the right number of placeholders per size
|
||||
img_tok_count = sequence_data.get_token_ids().count(image_token_id)
|
||||
assert img_tok_count == expected_max_tokens * num_imgs
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", models)
|
||||
@pytest.mark.parametrize("max_dynamic_patch", [1, 4])
|
||||
@pytest.mark.parametrize("dynamic_image_size", [True, False, None])
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
def test_input_processor_override(
|
||||
input_processor_for_internvl: Callable,
|
||||
image_assets: _ImageAssets,
|
||||
model: str,
|
||||
num_imgs: int,
|
||||
max_dynamic_patch: int,
|
||||
dynamic_image_size: Optional[bool],
|
||||
):
|
||||
"""Ensure input_processor_for_internvl handles kwargs properly."""
|
||||
# Same as the previous test - don't initialize mm_processor_kwargs
|
||||
# in this test and assume that the kwargs will be correctly expanded by
|
||||
# the partial when calling the custom input processor.
|
||||
expected_num_patches = max_dynamic_patch + 1 if max_dynamic_patch > 1 else 1
|
||||
if dynamic_image_size is False:
|
||||
expected_num_patches = 1
|
||||
|
||||
ctx = build_model_context(
|
||||
model_name=model,
|
||||
tokenizer_name=model,
|
||||
trust_remote_code=True,
|
||||
mm_processor_kwargs=None,
|
||||
)
|
||||
expected_toks_per_img = 256 * expected_num_patches
|
||||
|
||||
# Build the image str / prompt based on the number of images we pass
|
||||
tokenizer = AutoTokenizer.from_pretrained(model, trust_remote_code=True)
|
||||
placeholders = "<image>" if num_imgs == 1 else "\n".join(
|
||||
f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
|
||||
prompt = placeholders
|
||||
images = [image_assets[0].pil_image.resize((448 * 2, 448 * 2))] * num_imgs
|
||||
|
||||
inputs = token_inputs(prompt_token_ids=tokenizer.encode(prompt),
|
||||
prompt=prompt,
|
||||
multi_modal_data={"image": images})
|
||||
|
||||
processed_inputs = input_processor_for_internvl(
|
||||
ctx,
|
||||
inputs,
|
||||
max_dynamic_patch=max_dynamic_patch,
|
||||
dynamic_image_size=dynamic_image_size,
|
||||
)
|
||||
|
||||
# Ensure we have the right number of placeholders per num_crops size
|
||||
image_token_id = tokenizer.encode('<IMG_CONTEXT>',
|
||||
add_special_tokens=False)[0]
|
||||
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
|
||||
assert img_tok_count == expected_toks_per_img * num_imgs
|
||||
132
tests/models/multimodal/processing/test_llava_next.py
Normal file
132
tests/models/multimodal/processing/test_llava_next.py
Normal file
@@ -0,0 +1,132 @@
|
||||
import itertools
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
from pqdm.threads import pqdm
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.parse import ImageSize
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
def _validate_image_prompt_replacements_one(
|
||||
processor: BaseMultiModalProcessor,
|
||||
num_imgs: int,
|
||||
failed_size_excs: list[tuple[ImageSize, Exception]],
|
||||
image_size: ImageSize,
|
||||
) -> None:
|
||||
prompt = "<image>" * num_imgs
|
||||
image = Image.new("RGB", size=image_size)
|
||||
mm_data = {"image": [image] * num_imgs}
|
||||
|
||||
try:
|
||||
# The processor will throw an error if there is a mismatch
|
||||
# in the prompt replacements
|
||||
processed_inputs = processor.apply(prompt, mm_data, {})
|
||||
|
||||
image_placeholders = processed_inputs["mm_placeholders"]["image"]
|
||||
assert len(image_placeholders) == num_imgs
|
||||
|
||||
first_placeholder = image_placeholders[0]
|
||||
|
||||
# NOTE: There is a BOS token
|
||||
assert first_placeholder["offset"] == 1
|
||||
assert first_placeholder["length"] == (
|
||||
len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
|
||||
|
||||
except Exception as exc:
|
||||
failed_size_excs.append((image_size, exc))
|
||||
|
||||
|
||||
def _test_image_prompt_replacements(
|
||||
processor,
|
||||
*,
|
||||
num_imgs: int,
|
||||
image_sizes: list[ImageSize],
|
||||
) -> None:
|
||||
"""
|
||||
Ensure LlavaNextMultiModalProcessor
|
||||
handles prompt replacement properly for input images.
|
||||
"""
|
||||
failed_size_excs = list[tuple[ImageSize, Exception]]()
|
||||
|
||||
validate_one = partial(
|
||||
_validate_image_prompt_replacements_one,
|
||||
processor,
|
||||
num_imgs,
|
||||
failed_size_excs,
|
||||
)
|
||||
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
|
||||
|
||||
if failed_size_excs:
|
||||
msg = "Found failing image sizes:" \
|
||||
+ "\n========\n".join(f"[{size}]\n{exc}"
|
||||
for size, exc in failed_size_excs)
|
||||
raise AssertionError(msg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
||||
ctx = build_model_context(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
ctx.model_config,
|
||||
tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
|
||||
)
|
||||
|
||||
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
|
||||
(488, 183), (2560, 1669)]
|
||||
image_sizes = [
|
||||
size for w, h in image_ratios
|
||||
for size in [ImageSize(w, h), ImageSize(h, w)]
|
||||
]
|
||||
|
||||
_test_image_prompt_replacements(
|
||||
processor,
|
||||
num_imgs=num_imgs,
|
||||
image_sizes=image_sizes,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skip("This test takes around 2 hours to run. "
|
||||
"Comment this out to run it manually.")
|
||||
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
|
||||
@pytest.mark.parametrize("num_imgs", [1])
|
||||
def test_processor_prompt_replacements_all(model_id, num_imgs):
|
||||
ctx = build_model_context(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
ctx.model_config,
|
||||
tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
|
||||
)
|
||||
|
||||
seen_aspect_ratios = set[float]()
|
||||
image_sizes = list[ImageSize]()
|
||||
|
||||
# The aspect ratio of the grid layout is between 1 and 2
|
||||
# NOTE: Assumes that feature size calculation is the same if we
|
||||
# swap the width and height of the image
|
||||
for w, h in itertools.product(range(64, 1024), repeat=2):
|
||||
aspect_ratio = w / h
|
||||
if 1 <= aspect_ratio <= 2 and aspect_ratio not in seen_aspect_ratios:
|
||||
image_sizes.append(ImageSize(w, h))
|
||||
seen_aspect_ratios.add(aspect_ratio)
|
||||
|
||||
_test_image_prompt_replacements(
|
||||
processor,
|
||||
num_imgs=num_imgs,
|
||||
image_sizes=image_sizes,
|
||||
)
|
||||
132
tests/models/multimodal/processing/test_llava_onevision.py
Normal file
132
tests/models/multimodal/processing/test_llava_onevision.py
Normal file
@@ -0,0 +1,132 @@
|
||||
import itertools
|
||||
from functools import partial
|
||||
|
||||
import pytest
|
||||
from PIL import Image
|
||||
from pqdm.threads import pqdm
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.parse import ImageSize
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
def _validate_image_prompt_replacements_one(
|
||||
processor: BaseMultiModalProcessor,
|
||||
num_imgs: int,
|
||||
failed_size_excs: list[tuple[ImageSize, Exception]],
|
||||
image_size: ImageSize,
|
||||
) -> None:
|
||||
prompt = "<image>" * num_imgs
|
||||
image = Image.new("RGB", size=image_size)
|
||||
mm_data = {"image": [image] * num_imgs}
|
||||
|
||||
try:
|
||||
# The processor will throw an error if there is a mismatch
|
||||
# in the prompt replacements
|
||||
processed_inputs = processor.apply(prompt, mm_data, {})
|
||||
|
||||
image_placeholders = processed_inputs["mm_placeholders"]["image"]
|
||||
assert len(image_placeholders) == num_imgs
|
||||
|
||||
first_placeholder = image_placeholders[0]
|
||||
|
||||
assert first_placeholder["offset"] == 0
|
||||
assert first_placeholder["length"] == len(
|
||||
processed_inputs["prompt_token_ids"]) // num_imgs
|
||||
except Exception as exc:
|
||||
failed_size_excs.append((image_size, exc))
|
||||
|
||||
|
||||
def _test_image_prompt_replacements(
|
||||
processor,
|
||||
*,
|
||||
num_imgs: int,
|
||||
image_sizes: list[ImageSize],
|
||||
) -> None:
|
||||
"""
|
||||
Ensure LlavaOnevisionMultiModalProcessor
|
||||
handles prompt replacement properly for input images.
|
||||
"""
|
||||
failed_size_excs = list[tuple[ImageSize, Exception]]()
|
||||
|
||||
validate_one = partial(
|
||||
_validate_image_prompt_replacements_one,
|
||||
processor,
|
||||
num_imgs,
|
||||
failed_size_excs,
|
||||
)
|
||||
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
|
||||
|
||||
if failed_size_excs:
|
||||
msg = "Found failing image sizes:" \
|
||||
+ "\n========\n".join(f"[{size}]\n{exc}"
|
||||
for size, exc in failed_size_excs)
|
||||
raise AssertionError(msg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id",
|
||||
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
||||
ctx = build_model_context(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
ctx.model_config,
|
||||
tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
|
||||
)
|
||||
|
||||
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
|
||||
(488, 183), (2560, 1669)]
|
||||
image_sizes = [
|
||||
size for w, h in image_ratios
|
||||
for size in [ImageSize(w, h), ImageSize(h, w)]
|
||||
]
|
||||
|
||||
_test_image_prompt_replacements(
|
||||
processor,
|
||||
num_imgs=num_imgs,
|
||||
image_sizes=image_sizes,
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skip("This test takes around 2 hours to run. "
|
||||
"Comment this out to run it manually.")
|
||||
@pytest.mark.parametrize("model_id",
|
||||
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
|
||||
@pytest.mark.parametrize("num_imgs", [1])
|
||||
def test_processor_prompt_replacements_all(model_id, num_imgs):
|
||||
ctx = build_model_context(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
ctx.model_config,
|
||||
tokenizer=cached_get_tokenizer(ctx.model_config.tokenizer),
|
||||
)
|
||||
|
||||
seen_aspect_ratios = set[float]()
|
||||
image_sizes = list[ImageSize]()
|
||||
|
||||
# The aspect ratio of the grid layout is between 1 and 6
|
||||
# NOTE: Assumes that feature size calculation is the same if we
|
||||
# swap the width and height of the image
|
||||
for w, h in itertools.product(range(64, 1024), repeat=2):
|
||||
aspect_ratio = w / h
|
||||
if 1 <= aspect_ratio <= 6 and aspect_ratio not in seen_aspect_ratios:
|
||||
image_sizes.append(ImageSize(w, h))
|
||||
seen_aspect_ratios.add(aspect_ratio)
|
||||
|
||||
_test_image_prompt_replacements(
|
||||
processor,
|
||||
num_imgs=num_imgs,
|
||||
image_sizes=image_sizes,
|
||||
)
|
||||
55
tests/models/multimodal/processing/test_phi3v.py
Normal file
55
tests/models/multimodal/processing/test_phi3v.py
Normal file
@@ -0,0 +1,55 @@
|
||||
"""Tests for phi3v's multimodal preprocessing kwargs."""
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
|
||||
from ....conftest import _ImageAssets
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["microsoft/Phi-3.5-vision-instruct"])
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("mm_processor_kwargs", "expected_toks_per_img"),
|
||||
[
|
||||
({"num_crops": 4}, 757),
|
||||
({"num_crops": 16}, 1921),
|
||||
# the default num_crops of phi-3.5-vision is 4
|
||||
({}, 757),
|
||||
])
|
||||
# yapf: enable
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
def test_processor_override(
|
||||
image_assets: _ImageAssets,
|
||||
model_id: str,
|
||||
mm_processor_kwargs: dict[str, int],
|
||||
expected_toks_per_img: int,
|
||||
num_imgs: int,
|
||||
):
|
||||
"""Ensure input_processor_for_phi3v handles num_crops properly."""
|
||||
# Avoid initializing CUDA early
|
||||
from vllm.model_executor.models.phi3v import _IMAGE_TOKEN_ID
|
||||
|
||||
ctx = build_model_context(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
trust_remote_code=True,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
ctx.model_config,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
|
||||
# Build the image str / prompt based on the number of images we pass
|
||||
img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
|
||||
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
|
||||
mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
|
||||
|
||||
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
|
||||
|
||||
# Ensure we have the right number of placeholders per num_crops size
|
||||
img_tok_count = processed_inputs["prompt_token_ids"].count(_IMAGE_TOKEN_ID)
|
||||
assert img_tok_count == expected_toks_per_img * num_imgs
|
||||
144
tests/models/multimodal/processing/test_qwen.py
Normal file
144
tests/models/multimodal/processing/test_qwen.py
Normal file
@@ -0,0 +1,144 @@
|
||||
"""Tests for Qwen's multimodal preprocessing kwargs."""
|
||||
from typing import Dict, List, Union
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from PIL.Image import Image
|
||||
|
||||
from vllm.inputs import InputContext, token_inputs
|
||||
from vllm.multimodal import MultiModalKwargs
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
|
||||
from ....conftest import IMAGE_ASSETS
|
||||
from ...utils import build_model_context
|
||||
|
||||
### Multimodal preprocessing tests
|
||||
SAMPLE_IMAGE = IMAGE_ASSETS[0].pil_image
|
||||
# These values are specific to Qwen-VL/Chat; we can get these from the model
|
||||
# config also, but they are hardcoded here to keep the parameterize/fixtures
|
||||
# easy to read.
|
||||
IMG_START_ID = 151857
|
||||
IMG_END_ID = 151858
|
||||
IMG_PAD_ID = 151859
|
||||
TOKS_PER_IMG = 256
|
||||
VIS_ENC_DIM = 4096
|
||||
IMG_SIZE = 448
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def input_mapper_for_qwen():
|
||||
# Lazy import to avoid initializing CUDA during test collection
|
||||
from vllm.model_executor.models.qwen import input_mapper_for_qwen
|
||||
return input_mapper_for_qwen
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def input_processor_for_qwen():
|
||||
# Lazy import to avoid initializing CUDA during test collection
|
||||
from vllm.model_executor.models.qwen import input_processor_for_qwen
|
||||
return input_processor_for_qwen
|
||||
|
||||
|
||||
@pytest.fixture()
|
||||
def qwen_vl_context() -> InputContext:
|
||||
"""Get an InputContext for Qwen-VL."""
|
||||
return build_model_context(model_name="Qwen/Qwen-VL",
|
||||
trust_remote_code=True)
|
||||
|
||||
|
||||
# Happy path tests for single/multi-image scenarios for the multimodal
|
||||
# input processor and mapper, respectively
|
||||
@pytest.mark.parametrize("num_images", [1, 2])
|
||||
def test_input_processor_valid_mm_data(input_processor_for_qwen,
|
||||
qwen_vl_context: InputContext,
|
||||
num_images: int):
|
||||
"""Happy cases for image inputs to Qwen's multimodal input processor."""
|
||||
prompt = "".join(
|
||||
[f"Picture {num}: <img></img>\n" for num in range(1, num_images + 1)])
|
||||
inputs = token_inputs(
|
||||
prompt=prompt,
|
||||
# When processing multimodal data for a multimodal model, the qwen
|
||||
# input processor will overwrite the provided prompt_token_ids with
|
||||
# the image prompts
|
||||
prompt_token_ids=[],
|
||||
multi_modal_data={"image": torch.rand(num_images, TOKS_PER_IMG, 4096)},
|
||||
)
|
||||
proc_inputs = input_processor_for_qwen(qwen_vl_context, inputs)
|
||||
assert isinstance(proc_inputs, dict)
|
||||
|
||||
# Each image should have one start / stop and a fixed context of 256
|
||||
proc_tokens = proc_inputs["prompt_token_ids"]
|
||||
assert proc_tokens.count(IMG_START_ID) == num_images
|
||||
assert proc_tokens.count(IMG_END_ID) == num_images
|
||||
assert proc_tokens.count(IMG_PAD_ID) == num_images * TOKS_PER_IMG
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"img_data,expected_shape",
|
||||
[
|
||||
# single / multi-image
|
||||
(SAMPLE_IMAGE, (1, 3, IMG_SIZE, IMG_SIZE)),
|
||||
(2 * [SAMPLE_IMAGE], (2, 3, IMG_SIZE, IMG_SIZE)),
|
||||
# single / multi-image embeddings
|
||||
(torch.rand(
|
||||
(TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
|
||||
(torch.rand(
|
||||
(1, TOKS_PER_IMG, VIS_ENC_DIM)), (1, TOKS_PER_IMG, VIS_ENC_DIM)),
|
||||
(torch.rand(
|
||||
(2, TOKS_PER_IMG, VIS_ENC_DIM)), (2, TOKS_PER_IMG, VIS_ENC_DIM)),
|
||||
])
|
||||
def test_input_mapper_valid_mm_data(input_mapper_for_qwen,
|
||||
qwen_vl_context: InputContext,
|
||||
img_data: Union[torch.Tensor, List[Image],
|
||||
Image],
|
||||
expected_shape: List[int]):
|
||||
"""Happy cases for image inputs to Qwen's multimodal input mapper."""
|
||||
mapped_img_data = input_mapper_for_qwen(qwen_vl_context, img_data)
|
||||
# Ensure that we get the appropriately shaped pixel_values
|
||||
# for images and image embeddings, respectively.
|
||||
assert isinstance(mapped_img_data, MultiModalKwargs)
|
||||
assert "pixel_values" in mapped_img_data
|
||||
assert mapped_img_data["pixel_values"].shape == expected_shape
|
||||
|
||||
|
||||
# Sad path tests for the multimodal input processor and mapper, respectively
|
||||
@pytest.mark.parametrize("mm_data", [
|
||||
{
|
||||
"image": torch.rand(5)
|
||||
},
|
||||
{
|
||||
"image": torch.rand((5, 5, 5, 5, 5))
|
||||
},
|
||||
])
|
||||
def test_input_processor_invalid_mm_data(input_processor_for_qwen,
|
||||
qwen_vl_context: InputContext,
|
||||
mm_data: Dict[str, torch.Tensor]):
|
||||
"""Test sad cases validated in Qwen's multimodal input processor."""
|
||||
tokenizer = cached_get_tokenizer(qwen_vl_context.model_config.tokenizer,
|
||||
trust_remote_code=True)
|
||||
prompt = "Picture 1: <img></img>\n"
|
||||
prompt_token_ids = tokenizer.encode(prompt)
|
||||
inputs = token_inputs(prompt=prompt,
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
multi_modal_data=mm_data)
|
||||
# Should fail since we have too many or too few dimensions for embeddings
|
||||
with pytest.raises(ValueError):
|
||||
input_processor_for_qwen(qwen_vl_context, inputs)
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"img_data",
|
||||
[
|
||||
# Wrong context length
|
||||
torch.rand((1, TOKS_PER_IMG + 10, VIS_ENC_DIM)),
|
||||
# Wrong visual encoder output size
|
||||
torch.rand((1, TOKS_PER_IMG, VIS_ENC_DIM + 10)),
|
||||
])
|
||||
def test_input_mapper_invalid_mm_data(
|
||||
input_mapper_for_qwen,
|
||||
qwen_vl_context: InputContext,
|
||||
img_data: Union[torch.Tensor, List[Image], Image],
|
||||
):
|
||||
"""Sad cases validated in Qwen VL's multimodal input mapper."""
|
||||
with pytest.raises(ValueError):
|
||||
input_mapper_for_qwen(qwen_vl_context, img_data)
|
||||
54
tests/models/multimodal/processing/test_qwen2_vl.py
Normal file
54
tests/models/multimodal/processing/test_qwen2_vl.py
Normal file
@@ -0,0 +1,54 @@
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
from vllm.multimodal.utils import cached_get_tokenizer
|
||||
|
||||
from ....conftest import _ImageAssets
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"), [
|
||||
({}, 1426, (5704, 1176)),
|
||||
({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)),
|
||||
])
|
||||
# yapf: enable
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
def test_processor_override(
|
||||
image_assets: _ImageAssets,
|
||||
model_id: str,
|
||||
mm_processor_kwargs: dict[str, object],
|
||||
expected_toks_per_img: int,
|
||||
expected_pixels_shape: tuple[int, int],
|
||||
num_imgs: int,
|
||||
):
|
||||
"""Ensure Qwen2VLMultiModalProcessor handles min/max pixels properly."""
|
||||
ctx = build_model_context(
|
||||
model_name=model_id,
|
||||
tokenizer_name=model_id,
|
||||
mm_processor_kwargs=None,
|
||||
limit_mm_per_prompt={"image": num_imgs},
|
||||
)
|
||||
tokenizer = cached_get_tokenizer(ctx.model_config.tokenizer)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(
|
||||
ctx.model_config,
|
||||
tokenizer=tokenizer,
|
||||
)
|
||||
|
||||
# Build the image str / prompt based on the number of images we pass
|
||||
prompt = "<|vision_start|><|image_pad|><|vision_end|>" * num_imgs
|
||||
mm_data = {"image": [image_assets[0].pil_image] * num_imgs}
|
||||
|
||||
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
|
||||
|
||||
# Ensure we have the right number of placeholders per num_crops size
|
||||
hf_processor = processor.info.get_hf_processor(**mm_processor_kwargs)
|
||||
image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
|
||||
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
|
||||
pixel_shape = processed_inputs["mm_kwargs"]["pixel_values"].shape
|
||||
|
||||
assert img_tok_count == expected_toks_per_img * num_imgs
|
||||
assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs
|
||||
assert pixel_shape[1] == expected_pixels_shape[1]
|
||||
Reference in New Issue
Block a user