Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2025-10-05 15:06:22 +01:00
committed by GitHub
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions

View File

@@ -6,22 +6,27 @@ from typing import Optional, Union
import numpy as np
import pytest
from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
UserMessage)
from mistral_common.protocol.instruct.messages import ImageChunk, TextChunk, UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from PIL import Image
from vllm.config import ModelConfig
from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions,
ImageDummyOptions, VideoDummyOptions)
from vllm.config.multimodal import (
AudioDummyOptions,
BaseDummyOptions,
ImageDummyOptions,
VideoDummyOptions,
)
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
from vllm.multimodal.inputs import MultiModalInputs
from vllm.multimodal.processing import (BaseMultiModalProcessor,
InputProcessingContext)
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
cached_tokenizer_from_config,
encode_tokens)
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.transformers_utils.tokenizer import (
AnyTokenizer,
MistralTokenizer,
cached_tokenizer_from_config,
encode_tokens,
)
from ....multimodal.utils import random_audio, random_image, random_video
from ...registry import HF_EXAMPLE_MODELS
@@ -36,14 +41,17 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
# GLM4.1V doesn't support multiple videos
video = mm_data["video"]
num_frames = len(video)
mm_data["video"] = (video, {
"total_num_frames": num_frames,
"fps": num_frames,
"duration": 1,
"frames_indices": [i for i in range(num_frames)],
"video_backend": "opencv",
"do_sample_frames": True,
})
mm_data["video"] = (
video,
{
"total_num_frames": num_frames,
"fps": num_frames,
"duration": 1,
"frames_indices": [i for i in range(num_frames)],
"video_backend": "opencv",
"do_sample_frames": True,
},
)
return mm_data
@@ -102,7 +110,8 @@ def _test_processing_correctness(
mm_processor_cache_gb=2048,
skip_tokenizer_init=model_info.skip_tokenizer_init,
enforce_eager=model_info.enforce_eager,
dtype=model_info.dtype)
dtype=model_info.dtype,
)
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
@@ -145,27 +154,22 @@ def _test_processing_correctness(
input_to_hit = {
"image": Image.new("RGB", size=(128, 128)),
"video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
"audio": (np.zeros((512, )), 16000),
"audio": (np.zeros((512,)), 16000),
}
input_factory = {
"image":
partial(random_image, rng, min_wh=128, max_wh=256),
"video":
partial(random_video,
rng,
min_frames=2,
max_frames=16,
min_wh=128,
max_wh=256),
"audio":
partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
"image": partial(random_image, rng, min_wh=128, max_wh=256),
"video": partial(
random_video, rng, min_frames=2, max_frames=16, min_wh=128, max_wh=256
),
"audio": partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
}
for batch_idx in range(num_batches):
mm_data = {
k:
[(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
for _ in range(rng.randint(limit + 1))]
k: [
(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
for _ in range(rng.randint(limit + 1))
]
for k, limit in limit_mm_per_prompt_ints.items()
}
@@ -174,12 +178,16 @@ def _test_processing_correctness(
# Mistral chat outputs tokens directly, rather than text prompts
if isinstance(tokenizer, MistralTokenizer):
images = mm_data.get("image", [])
request = ChatCompletionRequest(messages=[
UserMessage(content=[
TextChunk(text=""),
*(ImageChunk(image=image) for image in images),
]),
])
request = ChatCompletionRequest(
messages=[
UserMessage(
content=[
TextChunk(text=""),
*(ImageChunk(image=image) for image in images),
]
),
]
)
res = tokenizer.mistral.encode_chat_completion(request)
prompt = res.tokens
else:
@@ -303,16 +311,14 @@ def _test_processing_correctness_one(
baseline_text_result,
baseline_tokenized_result,
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {text_prompt=}, "
f"{token_prompt=}, {mm_data=})",
msg=f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})",
)
_assert_inputs_equal(
cached_text_result,
cached_tokenized_result,
ignore_mm_keys=ignore_mm_keys,
msg=f"Failed ({batch_idx=}, {text_prompt=}, "
f"{token_prompt=}, {mm_data=})",
msg=f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})",
)

View File

@@ -24,7 +24,8 @@ from ...utils import build_model_context
# post-sampled frames (expected behavior)
(-1, 1, 5),
(-1, 2, 10),
])
],
)
def test_processor_override(
model_id: str,
expected_toks_per_frame: int,
@@ -55,10 +56,8 @@ def test_processor_override(
# Ensure we have the right number of placeholders per num_crops size
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token)
video_tok_count = processed_inputs["prompt_token_ids"].count(
video_token_id)
grid_t, _, _ = processed_inputs["mm_kwargs"].get_data(
)["video_grid_thw"][0]
video_tok_count = processed_inputs["prompt_token_ids"].count(video_token_id)
grid_t, _, _ = processed_inputs["mm_kwargs"].get_data()["video_grid_thw"][0]
assert grid_t == expected_grid_t
assert video_tok_count == expected_toks_per_frame * grid_t
@@ -71,7 +70,7 @@ def test_video_loader_consistency(
fps: int,
):
"""
Ensure dynamic video loader (pre-sampled by loader) and normal video
Ensure dynamic video loader (pre-sampled by loader) and normal video
loader (post-sampled by processor) produce same video processing outputs.
"""
ctx = build_model_context(
@@ -91,7 +90,8 @@ def test_video_loader_consistency(
static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
video_bytes, fps=fps)
video_bytes, fps=fps
)
# pre-sampled loader shouldn't read all frames
assert len(dynamic_video) < len(static_video)
@@ -99,12 +99,11 @@ def test_video_loader_consistency(
static_mm_data = {"video": [(static_video, static_metadata)]}
dynamic_mm_data = {"video": [(dynamic_video, dynamic_metadata)]}
static_outputs = processor.apply(prompt, static_mm_data,
hf_processor_mm_kwargs)
dynamic_outputs = processor.apply(prompt, dynamic_mm_data,
hf_processor_mm_kwargs)
static_outputs = processor.apply(prompt, static_mm_data, hf_processor_mm_kwargs)
dynamic_outputs = processor.apply(prompt, dynamic_mm_data, hf_processor_mm_kwargs)
assert static_outputs["prompt_token_ids"] == dynamic_outputs[
"prompt_token_ids"]
assert static_outputs["mm_kwargs"].get_data(
) == dynamic_outputs["mm_kwargs"].get_data()
assert static_outputs["prompt_token_ids"] == dynamic_outputs["prompt_token_ids"]
assert (
static_outputs["mm_kwargs"].get_data()
== dynamic_outputs["mm_kwargs"].get_data()
)

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for H2OVL's multimodal preprocessing kwargs."""
from collections.abc import Mapping
from typing import Optional
@@ -23,8 +24,10 @@ def _get_expected_num_patches(
min_num: int,
max_num: int,
):
from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
get_h2ovl_target_ratios)
from vllm.model_executor.models.h2ovl import (
calculate_h2ovl_targets,
get_h2ovl_target_ratios,
)
width, height = image.size
@@ -101,24 +104,27 @@ def _run_check(
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images)
for image in images
)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"].get_data(
)["pixel_values_flat"].shape
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
assert img_tok_count == 256 * total_expected_num_patches
assert pixel_shape[0] == total_expected_num_patches
@pytest.mark.parametrize("model_id", [
"h2oai/h2ovl-mississippi-800m",
"h2oai/h2ovl-mississippi-2b",
])
@pytest.mark.parametrize(
"model_id",
[
"h2oai/h2ovl-mississippi-800m",
"h2oai/h2ovl-mississippi-2b",
],
)
@pytest.mark.parametrize(
"size_factors",
[
@@ -165,10 +171,7 @@ def test_processor_override(
_run_check(
processor,
[
rescale_image_size(image_assets[0].pil_image, f)
for f in size_factors
],
[rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
min_num,
max_num,
hf_processor_mm_kwargs,

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for Idefics3's multimodal preprocessing kwargs."""
import pytest
from transformers import Idefics3Config
@@ -17,7 +18,8 @@ from ...utils import build_model_context
[
({"size": {"longest_edge": 364}}, 169),
({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)),
])
],
)
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
@@ -42,8 +44,11 @@ def test_processor_override(
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
placeholders = "<image>" if num_imgs == 1 else "\n".join(
f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
placeholders = (
"<image>"
if num_imgs == 1
else "\n".join(f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
)
prompt = f"<|begin_of_text|>User:{placeholders}\n<end_of_utterance>\nAssistant:" # noqa: E501
# Build mm_data
@@ -57,8 +62,7 @@ def test_processor_override(
# Ensure the placeholders format are correct
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
"input_ids"][0]
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]
# Ensure we have the right number of placeholders per num_crops size
image_token_id = ctx.get_hf_config().image_token_id

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for InternVL's multimodal preprocessing kwargs."""
from collections.abc import Mapping
from typing import Optional
@@ -24,7 +25,9 @@ def _get_expected_num_patches(
max_num: int,
):
from vllm.model_executor.models.internvl import (
calculate_internvl_targets, get_internvl_target_ratios)
calculate_internvl_targets,
get_internvl_target_ratios,
)
width, height = image.size
@@ -61,15 +64,15 @@ def _run_check(
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images)
for image in images
)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"].get_data(
)["pixel_values_flat"].shape
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
assert img_tok_count == 256 * total_expected_num_patches
assert pixel_shape[0] == total_expected_num_patches
@@ -122,10 +125,7 @@ def test_processor_override(
_run_check(
processor,
[
rescale_image_size(image_assets[0].pil_image, f)
for f in size_factors
],
[rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
min_num,
max_num,
hf_processor_mm_kwargs,

View File

@@ -11,8 +11,7 @@ from ....conftest import ImageTestAssets
from ...utils import build_model_context
@pytest.mark.parametrize("model_id",
["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
@pytest.mark.parametrize("model_id", ["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
@pytest.mark.parametrize("mm_processor_kwargs", [{}])
@pytest.mark.parametrize("num_imgs", [1, 5])
@pytest.mark.parametrize("mm_processor_cache_gb", [0, 4])
@@ -38,13 +37,14 @@ def test_processor_override(
hf_processor = processor.info.get_hf_processor()
vocab = tokenizer.get_vocab()
prompt = "<|begin_of_text|><|header_start|>user<|header_end|>" \
+ "<|image|>" * num_imgs \
prompt = (
"<|begin_of_text|><|header_start|>user<|header_end|>"
+ "<|image|>" * num_imgs
+ "<|eot|><|header_start|>assistant<|header_end|>"
)
mm_data = {
"image": [
image_assets[(i % len(image_assets))].pil_image
for i in range(num_imgs)
image_assets[(i % len(image_assets))].pil_image for i in range(num_imgs)
]
}
if tokenized_prompt:
@@ -64,22 +64,23 @@ def test_processor_override(
if tiles_x * tiles_y > 1:
num_x_separators += (tiles_x - 1) * tiles_y
num_y_separators += tiles_y
assert prompt_token_ids.count(vocab[hf_processor.tile_token]) \
== num_x_separators
assert prompt_token_ids.count(vocab[hf_processor.tile_global_token]) \
== num_y_separators
assert prompt_token_ids.count(vocab[hf_processor.tile_token]) == num_x_separators
assert (
prompt_token_ids.count(vocab[hf_processor.tile_global_token])
== num_y_separators
)
# image token offsets
img_locs = processed_inputs["mm_placeholders"].get("image", [])
assert len(img_locs) == num_imgs
assert [img_loc.offset for img_loc in img_locs] == \
[i for i, v in enumerate(prompt_token_ids) \
if v == config.boi_token_index]
assert [img_loc.offset for img_loc in img_locs] == [
i for i, v in enumerate(prompt_token_ids) if v == config.boi_token_index
]
# patch sizes and masks
num_patches_per_chunk = processor.info.get_patch_per_chunk(
config.vision_config)
assert prompt_token_ids.count(config.image_token_index) \
num_patches_per_chunk = processor.info.get_patch_per_chunk(config.vision_config)
assert (
prompt_token_ids.count(config.image_token_index)
== sum(mm_data["patches_per_image"]) * num_patches_per_chunk
assert len(mm_data["pixel_values"]) \
== sum(mm_data["patches_per_image"])
)
assert len(mm_data["pixel_values"]) == sum(mm_data["patches_per_image"])

View File

@@ -22,8 +22,9 @@ def _validate_image_max_tokens_one(
image_size: ImageSize,
) -> None:
info = processor.info
feature_size = info.get_num_image_tokens(image_width=image_size.width,
image_height=image_size.height)
feature_size = info.get_num_image_tokens(
image_width=image_size.width, image_height=image_size.height
)
try:
assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
@@ -31,8 +32,9 @@ def _validate_image_max_tokens_one(
failed_size_excs.append((image_size, exc))
@pytest.mark.skip("This test takes around 5 minutes to run. "
"Comment this out to run it manually.")
@pytest.mark.skip(
"This test takes around 5 minutes to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
def test_processor_max_tokens(model_id):
ctx = build_model_context(
@@ -66,9 +68,9 @@ def test_processor_max_tokens(model_id):
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
if failed_size_excs:
msg = "Found failing image sizes:" \
+ "\n========\n".join(f"[{size}]\n{exc}"
for size, exc in failed_size_excs)
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
@@ -94,8 +96,10 @@ def _validate_image_prompt_replacements_one(
# NOTE: There is a BOS token
assert first_placeholder.offset == 1
assert first_placeholder.length == (
len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
assert (
first_placeholder.length
== (len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
)
except Exception as exc:
failed_size_excs.append((image_size, exc))
@@ -122,9 +126,9 @@ def _test_image_prompt_replacements(
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
if failed_size_excs:
msg = "Found failing image sizes:" \
+ "\n========\n".join(f"[{size}]\n{exc}"
for size, exc in failed_size_excs)
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
@@ -138,11 +142,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
(488, 183), (2560, 1669)]
image_ratios = [
(171, 152),
(184, 161),
(198, 176),
(333, 296),
(369, 328),
(488, 183),
(2560, 1669),
]
image_sizes = [
size for w, h in image_ratios
for size in [ImageSize(w, h), ImageSize(h, w)]
size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
]
_test_image_prompt_replacements(
@@ -152,8 +162,9 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
@pytest.mark.skip("This test takes around 2 hours to run. "
"Comment this out to run it manually.")
@pytest.mark.skip(
"This test takes around 2 hours to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):

View File

@@ -22,8 +22,9 @@ def _validate_image_max_tokens_one(
image_size: ImageSize,
) -> None:
info = processor.info
feature_size = info.get_num_image_tokens(image_width=image_size.width,
image_height=image_size.height)
feature_size = info.get_num_image_tokens(
image_width=image_size.width, image_height=image_size.height
)
try:
assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
@@ -31,10 +32,10 @@ def _validate_image_max_tokens_one(
failed_size_excs.append((image_size, exc))
@pytest.mark.skip("This test takes around 5 minutes to run. "
"Comment this out to run it manually.")
@pytest.mark.parametrize("model_id",
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.skip(
"This test takes around 5 minutes to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
def test_processor_max_tokens(model_id):
ctx = build_model_context(
model_id,
@@ -67,9 +68,9 @@ def test_processor_max_tokens(model_id):
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
if failed_size_excs:
msg = "Found failing image sizes:" \
+ "\n========\n".join(f"[{size}]\n{exc}"
for size, exc in failed_size_excs)
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
@@ -94,8 +95,10 @@ def _validate_image_prompt_replacements_one(
first_placeholder = image_placeholders[0]
assert first_placeholder.offset == 0
assert first_placeholder.length == len(
processed_inputs["prompt_token_ids"]) // num_imgs
assert (
first_placeholder.length
== len(processed_inputs["prompt_token_ids"]) // num_imgs
)
except Exception as exc:
failed_size_excs.append((image_size, exc))
@@ -121,14 +124,13 @@ def _test_image_prompt_replacements(
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
if failed_size_excs:
msg = "Found failing image sizes:" \
+ "\n========\n".join(f"[{size}]\n{exc}"
for size, exc in failed_size_excs)
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
@pytest.mark.parametrize("model_id",
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1, 2])
def test_processor_prompt_replacements_regression(model_id, num_imgs):
ctx = build_model_context(
@@ -138,11 +140,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
(488, 183), (2560, 1669)]
image_ratios = [
(171, 152),
(184, 161),
(198, 176),
(333, 296),
(369, 328),
(488, 183),
(2560, 1669),
]
image_sizes = [
size for w, h in image_ratios
for size in [ImageSize(w, h), ImageSize(h, w)]
size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
]
_test_image_prompt_replacements(
@@ -152,10 +160,10 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
@pytest.mark.skip("This test takes around 2 hours to run. "
"Comment this out to run it manually.")
@pytest.mark.parametrize("model_id",
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.skip(
"This test takes around 2 hours to run. Comment this out to run it manually."
)
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
@pytest.mark.parametrize("num_imgs", [1])
def test_processor_prompt_replacements_all(model_id, num_imgs):
ctx = build_model_context(

View File

@@ -61,17 +61,17 @@ def _test_image_prompt_replacements(
num_imgs: int,
image_sizes: list[ImageSize],
) -> None:
failed_size_excs = list[tuple[ImageSize, Exception]]()
for size in image_sizes:
_validate_image_prompt_replacements_one(processor, num_imgs,
failed_size_excs, size)
_validate_image_prompt_replacements_one(
processor, num_imgs, failed_size_excs, size
)
if failed_size_excs:
msg = "Found failing image sizes:" \
+ "\n========\n".join(f"[{size}]\n{exc}"
for size, exc in failed_size_excs)
msg = "Found failing image sizes:" + "\n========\n".join(
f"[{size}]\n{exc}" for size, exc in failed_size_excs
)
raise AssertionError(msg)
@@ -85,11 +85,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
)
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
(488, 183), (2560, 1669)]
image_ratios = [
(171, 152),
(184, 161),
(198, 176),
(333, 296),
(369, 328),
(488, 183),
(2560, 1669),
]
image_sizes = [
size for w, h in image_ratios
for size in [ImageSize(w, h), ImageSize(h, w)]
size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
]
_test_image_prompt_replacements(

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for mllama's multimodal preprocessing and profiling."""
import pytest
from torch import prod
from transformers import Llama4Config
@@ -47,14 +48,17 @@ def test_profiling(model_id: str, max_model_len: int):
image_size = hf_config.vision_config.image_size
patch_size = hf_config.vision_config.patch_size
downsample_ratio = int(
round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2)))
tokens_per_patch = ((image_size // patch_size)**2) // downsample_ratio
round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2))
)
tokens_per_patch = ((image_size // patch_size) ** 2) // downsample_ratio
chunks_per_image = prod(mm_data["patches_per_image"])
total_num_patches = chunks_per_image * tokens_per_patch
num_tiles = mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][
1] # x-y separator tokens
total_tokens = total_num_patches.item() + num_tiles.item(
) + 3 # image start, image, image end
num_tiles = (
mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][1]
) # x-y separator tokens
total_tokens = (
total_num_patches.item() + num_tiles.item() + 3
) # image start, image, image end
profiled_tokens = profiler.get_mm_max_contiguous_tokens(
max_model_len,
@@ -63,5 +67,6 @@ def test_profiling(model_id: str, max_model_len: int):
assert total_tokens == profiled_tokens["image"]
assert total_tokens == sum(
placeholder.length for placeholder in
decoder_dummy_data.multi_modal_placeholders["image"])
placeholder.length
for placeholder in decoder_dummy_data.multi_modal_placeholders["image"]
)

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for Nemotron-Nano-VL's multimodal preprocessing kwargs."""
from collections.abc import Mapping
from typing import Optional
@@ -24,7 +25,9 @@ def _get_expected_num_patches(
max_num: int,
):
from vllm.model_executor.models.nemotron_vl import (
calculate_nemotron_vl_targets, get_nemotron_vl_target_ratios)
calculate_nemotron_vl_targets,
get_nemotron_vl_target_ratios,
)
width, height = image.size
@@ -63,22 +66,21 @@ def _run_check(
total_expected_num_patches = sum(
_get_expected_num_patches(config, image, len(images), min_num, max_num)
for image in images)
for image in images
)
print(total_expected_num_patches)
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
# Ensure we have the right number of placeholders per num_crops size
image_token_id = tokenizer.convert_tokens_to_ids("<image>")
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"].get_data(
)["pixel_values_flat"].shape
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape)
assert img_tok_count == 256 * total_expected_num_patches
assert pixel_shape[0] == total_expected_num_patches
@pytest.mark.parametrize("model_id",
["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
@pytest.mark.parametrize("model_id", ["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
@pytest.mark.parametrize(
"size_factors",
[
@@ -125,10 +127,7 @@ def test_processor_override(
_run_check(
processor,
[
rescale_image_size(image_assets[0].pil_image, f)
for f in size_factors
],
[rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
min_num,
max_num,
hf_processor_mm_kwargs,

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for phi3v's multimodal preprocessing kwargs."""
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -18,7 +19,8 @@ from ...utils import build_model_context
({"num_crops": 16}, 1921),
# the default num_crops of phi-3.5-vision is 4
({}, 757),
])
],
)
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for phi4mm's multimodal preprocessing kwargs."""
import pytest
from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -18,7 +19,8 @@ from ...utils import build_model_context
({"dynamic_hd": 16}, 4433),
# the default num_crops of phi-4-multimodal is 36
({}, 9585),
])
],
)
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
@@ -46,8 +48,7 @@ def test_processor_override(
img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
image_size = ctx.get_hf_config(
).embd_layer["image_embd_layer"]["crop_size"]
image_size = ctx.get_hf_config().embd_layer["image_embd_layer"]["crop_size"]
dummy_image_size = (image_size * 7, image_size * 7)
dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
mm_data = {"image": [dummy_image] * num_imgs}
@@ -56,5 +57,6 @@ def test_processor_override(
# Ensure we have the right number of placeholders per num_crops size
img_tok_count = processed_inputs["prompt_token_ids"].count(
_IMAGE_PLACEHOLDER_TOKEN_ID)
_IMAGE_PLACEHOLDER_TOKEN_ID
)
assert img_tok_count == expected_toks_per_img * num_imgs

View File

@@ -12,10 +12,12 @@ from ...utils import build_model_context
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
# yapf: disable
@pytest.mark.parametrize(
("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"), [
("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"),
[
({}, 1426, (5704, 1176)),
({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)),
])
],
)
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
@@ -48,8 +50,7 @@ def test_processor_override(
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
pixel_shape = processed_inputs["mm_kwargs"].get_data(
)["pixel_values"].shape
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values"].shape
assert img_tok_count == expected_toks_per_img * num_imgs
assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Tests for smolvlm's multimodal preprocessing kwargs."""
import pytest
from transformers import SmolVLMConfig
@@ -17,7 +18,8 @@ from ...utils import build_model_context
[
({"max_image_size": {"longest_edge": 384}}, 1377),
({"max_image_size": {"longest_edge": 768}}, 405),
])
],
)
# yapf: enable
@pytest.mark.parametrize("num_imgs", [1, 2])
@pytest.mark.parametrize("kwargs_on_init", [True, False])
@@ -42,8 +44,11 @@ def test_processor_override(
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
# Build the image str / prompt based on the number of images we pass
placeholders = "<image>" if num_imgs == 1 else "\n".join(
f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
placeholders = (
"<image>"
if num_imgs == 1
else "\n".join(f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
)
prompt = f"<|im_start|>User:{placeholders}\n<end_of_utterance>\nAssistant:" # noqa: E501
# Build mm_data
@@ -57,8 +62,7 @@ def test_processor_override(
# Ensure the placeholders format are correct
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
"input_ids"][0]
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]
# Ensure we have the right number of placeholders per num_crops size
image_token_id = ctx.get_hf_config().image_token_id

View File

@@ -9,23 +9,29 @@ from typing import Any, Union
import numpy as np
import pytest
import torch.nn as nn
from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
UserMessage)
from mistral_common.protocol.instruct.messages import ImageChunk, TextChunk, UserMessage
from mistral_common.protocol.instruct.request import ChatCompletionRequest
from PIL import Image
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions,
ImageDummyOptions, VideoDummyOptions)
from vllm.distributed import (cleanup_dist_env_and_memory,
init_distributed_environment,
initialize_model_parallel)
from vllm.config.multimodal import (
AudioDummyOptions,
BaseDummyOptions,
ImageDummyOptions,
VideoDummyOptions,
)
from vllm.distributed import (
cleanup_dist_env_and_memory,
init_distributed_environment,
initialize_model_parallel,
)
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
from vllm.model_executor.models.interfaces import (SupportsMultiModal,
supports_multimodal)
from vllm.model_executor.models.interfaces import (
SupportsMultiModal,
supports_multimodal,
)
from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
from vllm.multimodal.processing import (BaseMultiModalProcessor,
InputProcessingContext)
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
from vllm.multimodal.utils import group_mm_kwargs_by_modality
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
from vllm.utils import is_list_of
@@ -48,13 +54,15 @@ REPO_ID_TO_SKIP = {
}
ImageInput = list[Image.Image]
VideoInput = Union[list[Image.Image], list[np.ndarray],
list[tuple[np.ndarray, dict[str, Any]]]]
VideoInput = Union[
list[Image.Image], list[np.ndarray], list[tuple[np.ndarray, dict[str, Any]]]
]
AudioInput = list[tuple[np.ndarray, int]]
def _resize_data(_data: Union[Image.Image, np.ndarray],
size_factor: float) -> Union[Image.Image, np.ndarray]:
def _resize_data(
_data: Union[Image.Image, np.ndarray], size_factor: float
) -> Union[Image.Image, np.ndarray]:
assert size_factor <= 1, "Size factor must be less than 1"
# Image input
if isinstance(_data, Image.Image):
@@ -74,20 +82,18 @@ def _resize_data(_data: Union[Image.Image, np.ndarray],
return _data[..., :T, :H, :W, :C]
# Audio input
elif isinstance(_data, np.ndarray) and _data.ndim == 1:
return _data[:int(len(_data) * size_factor)]
return _data[: int(len(_data) * size_factor)]
raise AssertionError("This line should be unreachable.")
def resize_mm_data(
data: Union[ImageInput, VideoInput, AudioInput],
size_factors: tuple[float,
...]) -> Union[ImageInput, VideoInput, AudioInput]:
size_factors = size_factors[:len(data)]
data: Union[ImageInput, VideoInput, AudioInput], size_factors: tuple[float, ...]
) -> Union[ImageInput, VideoInput, AudioInput]:
size_factors = size_factors[: len(data)]
if is_list_of(data, (Image.Image, np.ndarray, list)):
return [_resize_data(d, s) for d, s in zip(data, size_factors)]
elif is_list_of(data, tuple):
return [(_resize_data(d, s), meta)
for (d, meta), s in zip(data, size_factors)]
return [(_resize_data(d, s), meta) for (d, meta), s in zip(data, size_factors)]
raise ValueError("Unsupported multimodal data type.")
@@ -116,12 +122,16 @@ def create_batched_mm_kwargs(
# Mistral chat outputs tokens directly, rather than text prompts
if model_config.tokenizer_mode == "mistral":
images = resized_mm_data.get("image", [])
request = ChatCompletionRequest(messages=[
UserMessage(content=[
TextChunk(text=""),
*(ImageChunk(image=image) for image in images),
]),
])
request = ChatCompletionRequest(
messages=[
UserMessage(
content=[
TextChunk(text=""),
*(ImageChunk(image=image) for image in images),
]
),
]
)
tokenizer = processing_info.get_tokenizer()
res = tokenizer.mistral.encode_chat_completion(request)
prompt = res.tokens
@@ -133,10 +143,7 @@ def create_batched_mm_kwargs(
hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
tokenization_kwargs=processor_inputs.tokenization_kwargs,
)["mm_kwargs"].require_data()
items = [
item for modality in supported_mm_limits
for item in mm_kwargs[modality]
]
items = [item for modality in supported_mm_limits for item in mm_kwargs[modality]]
return group_mm_kwargs_by_modality(
items,
merge_by_field_config=model_cls.merge_by_field_config,
@@ -167,15 +174,17 @@ def initialize_dummy_model(
cleanup_dist_env_and_memory()
def get_model_id_to_test(
model_arch_list: Iterable[str]) -> list[tuple[str, str]]:
def get_model_id_to_test(model_arch_list: Iterable[str]) -> list[tuple[str, str]]:
filtered_results = []
for model_arch in model_arch_list:
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
if model_info.extras and model_arch in ARCH_NEEDS_EXTRAS:
available_repos = list(
map(lambda model_id: (model_arch, model_id),
[model_info.default, *model_info.extras.values()]))
map(
lambda model_id: (model_arch, model_id),
[model_info.default, *model_info.extras.values()],
)
)
filtered_results.extend(available_repos)
else:
filtered_results.append((model_arch, model_info.default))
@@ -183,8 +192,8 @@ def get_model_id_to_test(
@pytest.mark.parametrize(
"model_arch, model_id",
get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys()))
"model_arch, model_id", get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys())
)
def test_model_tensor_schema(model_arch: str, model_id: str):
if model_arch in ARCH_TO_SKIP:
pytest.skip(f"Skipping {model_arch} due to {ARCH_TO_SKIP[model_arch]}")
@@ -193,12 +202,13 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip",
check_max_version=False)
model_info.check_transformers_version(on_fail="skip", check_max_version=False)
hf_overrides_fn = partial(dummy_hf_overrides,
model_arch=model_arch,
exist_overrides=model_info.hf_overrides)
hf_overrides_fn = partial(
dummy_hf_overrides,
model_arch=model_arch,
exist_overrides=model_info.hf_overrides,
)
model_config = ModelConfig(
model_id,
@@ -256,8 +266,11 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
with initialize_dummy_model(model_cls, model_config) as model:
for modality, _, mm_kwargs in create_batched_mm_kwargs(
model_cls, model_config, processor):
model_cls, model_config, processor
):
for method_name in inputs_parse_methods:
print(f"Testing `{method_name}` with modality={modality} "
f"and mm_kwargs{list(mm_kwargs.keys())}")
print(
f"Testing `{method_name}` with modality={modality} "
f"and mm_kwargs{list(mm_kwargs.keys())}"
)
getattr(model, method_name)(modality=modality, **mm_kwargs)