Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
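
The recurring pattern in every hunk below: yapf aligned wrapped arguments under the opening parenthesis, while ruff's formatter (which is Black-compatible) either keeps a call on one line when it fits the configured line length, or breaks each argument onto its own indented line with a trailing comma and a dedented closing parenthesis. A minimal sketch using a call site taken from the hunks below (the ruff configuration itself, presumably in pyproject.toml, is not part of this excerpt):

    # Before (yapf + isort): continuation lines aligned under the open paren
    feature_size = info.get_num_image_tokens(image_width=image_size.width,
                                             image_height=image_size.height)

    # After (ruff format, Black-style): arguments on their own line inside
    # dedented parentheses, or on one line when they fit
    feature_size = info.get_num_image_tokens(
        image_width=image_size.width, image_height=image_size.height
    )

The import hunks follow the same rule: ruff's isort-compatible `I` lint rules replace isort, collapsing short parenthesized imports onto one line and exploding long ones to one name per line with a trailing comma.
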
@@ -6,22 +6,27 @@ from typing import Optional, Union

 import numpy as np
 import pytest
-from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
-                                                       UserMessage)
+from mistral_common.protocol.instruct.messages import ImageChunk, TextChunk, UserMessage
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from PIL import Image

 from vllm.config import ModelConfig
-from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions,
-                                    ImageDummyOptions, VideoDummyOptions)
+from vllm.config.multimodal import (
+    AudioDummyOptions,
+    BaseDummyOptions,
+    ImageDummyOptions,
+    VideoDummyOptions,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
 from vllm.multimodal.cache import MultiModalProcessorOnlyCache
 from vllm.multimodal.inputs import MultiModalInputs
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        InputProcessingContext)
-from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
-                                               cached_tokenizer_from_config,
-                                               encode_tokens)
+from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
+from vllm.transformers_utils.tokenizer import (
+    AnyTokenizer,
+    MistralTokenizer,
+    cached_tokenizer_from_config,
+    encode_tokens,
+)

 from ....multimodal.utils import random_audio, random_image, random_video
 from ...registry import HF_EXAMPLE_MODELS
@@ -36,14 +41,17 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
     # GLM4.1V doesn't support multiple videos
     video = mm_data["video"]
     num_frames = len(video)
-    mm_data["video"] = (video, {
-        "total_num_frames": num_frames,
-        "fps": num_frames,
-        "duration": 1,
-        "frames_indices": [i for i in range(num_frames)],
-        "video_backend": "opencv",
-        "do_sample_frames": True,
-    })
+    mm_data["video"] = (
+        video,
+        {
+            "total_num_frames": num_frames,
+            "fps": num_frames,
+            "duration": 1,
+            "frames_indices": [i for i in range(num_frames)],
+            "video_backend": "opencv",
+            "do_sample_frames": True,
+        },
+    )
     return mm_data


@@ -102,7 +110,8 @@ def _test_processing_correctness(
         mm_processor_cache_gb=2048,
         skip_tokenizer_init=model_info.skip_tokenizer_init,
         enforce_eager=model_info.enforce_eager,
-        dtype=model_info.dtype)
+        dtype=model_info.dtype,
+    )

     model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
     factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
@@ -145,27 +154,22 @@ def _test_processing_correctness(
     input_to_hit = {
         "image": Image.new("RGB", size=(128, 128)),
         "video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
-        "audio": (np.zeros((512, )), 16000),
+        "audio": (np.zeros((512,)), 16000),
     }
     input_factory = {
-        "image":
-        partial(random_image, rng, min_wh=128, max_wh=256),
-        "video":
-        partial(random_video,
-                rng,
-                min_frames=2,
-                max_frames=16,
-                min_wh=128,
-                max_wh=256),
-        "audio":
-        partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
+        "image": partial(random_image, rng, min_wh=128, max_wh=256),
+        "video": partial(
+            random_video, rng, min_frames=2, max_frames=16, min_wh=128, max_wh=256
+        ),
+        "audio": partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
     }

     for batch_idx in range(num_batches):
         mm_data = {
-            k:
-            [(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
-             for _ in range(rng.randint(limit + 1))]
+            k: [
+                (input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
+                for _ in range(rng.randint(limit + 1))
+            ]
             for k, limit in limit_mm_per_prompt_ints.items()
         }

@@ -174,12 +178,16 @@ def _test_processing_correctness(
         # Mistral chat outputs tokens directly, rather than text prompts
         if isinstance(tokenizer, MistralTokenizer):
             images = mm_data.get("image", [])
-            request = ChatCompletionRequest(messages=[
-                UserMessage(content=[
-                    TextChunk(text=""),
-                    *(ImageChunk(image=image) for image in images),
-                ]),
-            ])
+            request = ChatCompletionRequest(
+                messages=[
+                    UserMessage(
+                        content=[
+                            TextChunk(text=""),
+                            *(ImageChunk(image=image) for image in images),
+                        ]
+                    ),
+                ]
+            )
             res = tokenizer.mistral.encode_chat_completion(request)
             prompt = res.tokens
         else:
@@ -303,16 +311,14 @@ def _test_processing_correctness_one(
         baseline_text_result,
         baseline_tokenized_result,
         ignore_mm_keys=ignore_mm_keys,
-        msg=f"Failed ({batch_idx=}, {text_prompt=}, "
-        f"{token_prompt=}, {mm_data=})",
+        msg=f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})",
     )

     _assert_inputs_equal(
         cached_text_result,
         cached_tokenized_result,
         ignore_mm_keys=ignore_mm_keys,
-        msg=f"Failed ({batch_idx=}, {text_prompt=}, "
-        f"{token_prompt=}, {mm_data=})",
+        msg=f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})",
     )


@@ -24,7 +24,8 @@ from ...utils import build_model_context
         # post-sampled frames (expected behavior)
         (-1, 1, 5),
         (-1, 2, 10),
-    ])
+    ],
+)
 def test_processor_override(
     model_id: str,
     expected_toks_per_frame: int,
@@ -55,10 +56,8 @@ def test_processor_override(
     # Ensure we have the right number of placeholders per num_crops size
     hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
     video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token)
-    video_tok_count = processed_inputs["prompt_token_ids"].count(
-        video_token_id)
-    grid_t, _, _ = processed_inputs["mm_kwargs"].get_data(
-    )["video_grid_thw"][0]
+    video_tok_count = processed_inputs["prompt_token_ids"].count(video_token_id)
+    grid_t, _, _ = processed_inputs["mm_kwargs"].get_data()["video_grid_thw"][0]

     assert grid_t == expected_grid_t
     assert video_tok_count == expected_toks_per_frame * grid_t
@@ -71,7 +70,7 @@ def test_video_loader_consistency(
     fps: int,
 ):
     """
-    Ensure dynamic video loader (pre-sampled by loader) and normal video
+    Ensure dynamic video loader (pre-sampled by loader) and normal video
     loader (post-sampled by processor) produce same video processing outputs.
     """
     ctx = build_model_context(
@@ -91,7 +90,8 @@ def test_video_loader_consistency(

     static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
     dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
-        video_bytes, fps=fps)
+        video_bytes, fps=fps
+    )

     # pre-sampled loader shouldn't read all frames
     assert len(dynamic_video) < len(static_video)
@@ -99,12 +99,11 @@ def test_video_loader_consistency(
     static_mm_data = {"video": [(static_video, static_metadata)]}
     dynamic_mm_data = {"video": [(dynamic_video, dynamic_metadata)]}

-    static_outputs = processor.apply(prompt, static_mm_data,
-                                     hf_processor_mm_kwargs)
-    dynamic_outputs = processor.apply(prompt, dynamic_mm_data,
-                                      hf_processor_mm_kwargs)
+    static_outputs = processor.apply(prompt, static_mm_data, hf_processor_mm_kwargs)
+    dynamic_outputs = processor.apply(prompt, dynamic_mm_data, hf_processor_mm_kwargs)

-    assert static_outputs["prompt_token_ids"] == dynamic_outputs[
-        "prompt_token_ids"]
-    assert static_outputs["mm_kwargs"].get_data(
-    ) == dynamic_outputs["mm_kwargs"].get_data()
+    assert static_outputs["prompt_token_ids"] == dynamic_outputs["prompt_token_ids"]
+    assert (
+        static_outputs["mm_kwargs"].get_data()
+        == dynamic_outputs["mm_kwargs"].get_data()
+    )

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for H2OVL's multimodal preprocessing kwargs."""
+
 from collections.abc import Mapping
 from typing import Optional

@@ -23,8 +24,10 @@ def _get_expected_num_patches(
     min_num: int,
     max_num: int,
 ):
-    from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
-                                                  get_h2ovl_target_ratios)
+    from vllm.model_executor.models.h2ovl import (
+        calculate_h2ovl_targets,
+        get_h2ovl_target_ratios,
+    )

     width, height = image.size

@@ -101,24 +104,27 @@ def _run_check(

     total_expected_num_patches = sum(
         _get_expected_num_patches(config, image, len(images), min_num, max_num)
-        for image in images)
+        for image in images
+    )

     processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)

     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"].get_data(
-    )["pixel_values_flat"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape

     assert img_tok_count == 256 * total_expected_num_patches
     assert pixel_shape[0] == total_expected_num_patches


-@pytest.mark.parametrize("model_id", [
-    "h2oai/h2ovl-mississippi-800m",
-    "h2oai/h2ovl-mississippi-2b",
-])
+@pytest.mark.parametrize(
+    "model_id",
+    [
+        "h2oai/h2ovl-mississippi-800m",
+        "h2oai/h2ovl-mississippi-2b",
+    ],
+)
 @pytest.mark.parametrize(
     "size_factors",
     [
@@ -165,10 +171,7 @@ def test_processor_override(

     _run_check(
         processor,
-        [
-            rescale_image_size(image_assets[0].pil_image, f)
-            for f in size_factors
-        ],
+        [rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
         min_num,
         max_num,
         hf_processor_mm_kwargs,
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for Idefics3's multimodal preprocessing kwargs."""
+
 import pytest
 from transformers import Idefics3Config

@@ -17,7 +18,8 @@ from ...utils import build_model_context
     [
         ({"size": {"longest_edge": 364}}, 169),
         ({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)),
-    ])
+    ],
+)
 # yapf: enable
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
@@ -42,8 +44,11 @@ def test_processor_override(
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

     # Build the image str / prompt based on the number of images we pass
-    placeholders = "<image>" if num_imgs == 1 else "\n".join(
-        f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
+    placeholders = (
+        "<image>"
+        if num_imgs == 1
+        else "\n".join(f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
+    )
     prompt = f"<|begin_of_text|>User:{placeholders}\n<end_of_utterance>\nAssistant:"  # noqa: E501

     # Build mm_data
@@ -57,8 +62,7 @@ def test_processor_override(
     # Ensure the placeholders format are correct
     hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
     hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
-    assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
-        "input_ids"][0]
+    assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]

     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = ctx.get_hf_config().image_token_id
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for InternVL's multimodal preprocessing kwargs."""
+
 from collections.abc import Mapping
 from typing import Optional

@@ -24,7 +25,9 @@ def _get_expected_num_patches(
     max_num: int,
 ):
     from vllm.model_executor.models.internvl import (
-        calculate_internvl_targets, get_internvl_target_ratios)
+        calculate_internvl_targets,
+        get_internvl_target_ratios,
+    )

     width, height = image.size

@@ -61,15 +64,15 @@ def _run_check(

     total_expected_num_patches = sum(
         _get_expected_num_patches(config, image, len(images), min_num, max_num)
-        for image in images)
+        for image in images
+    )

     processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)

     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"].get_data(
-    )["pixel_values_flat"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape

     assert img_tok_count == 256 * total_expected_num_patches
     assert pixel_shape[0] == total_expected_num_patches
@@ -122,10 +125,7 @@ def test_processor_override(

     _run_check(
         processor,
-        [
-            rescale_image_size(image_assets[0].pil_image, f)
-            for f in size_factors
-        ],
+        [rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
         min_num,
         max_num,
         hf_processor_mm_kwargs,
@@ -11,8 +11,7 @@ from ....conftest import ImageTestAssets
 from ...utils import build_model_context


-@pytest.mark.parametrize("model_id",
-                         ["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
+@pytest.mark.parametrize("model_id", ["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
 @pytest.mark.parametrize("mm_processor_kwargs", [{}])
 @pytest.mark.parametrize("num_imgs", [1, 5])
 @pytest.mark.parametrize("mm_processor_cache_gb", [0, 4])
@@ -38,13 +37,14 @@ def test_processor_override(
     hf_processor = processor.info.get_hf_processor()
     vocab = tokenizer.get_vocab()

-    prompt = "<|begin_of_text|><|header_start|>user<|header_end|>" \
-        + "<|image|>" * num_imgs \
-        + "<|eot|><|header_start|>assistant<|header_end|>"
+    prompt = (
+        "<|begin_of_text|><|header_start|>user<|header_end|>"
+        + "<|image|>" * num_imgs
+        + "<|eot|><|header_start|>assistant<|header_end|>"
+    )
     mm_data = {
         "image": [
-            image_assets[(i % len(image_assets))].pil_image
-            for i in range(num_imgs)
+            image_assets[(i % len(image_assets))].pil_image for i in range(num_imgs)
         ]
     }
     if tokenized_prompt:
@@ -64,22 +64,23 @@ def test_processor_override(
         if tiles_x * tiles_y > 1:
             num_x_separators += (tiles_x - 1) * tiles_y
             num_y_separators += tiles_y
-    assert prompt_token_ids.count(vocab[hf_processor.tile_token]) \
-        == num_x_separators
-    assert prompt_token_ids.count(vocab[hf_processor.tile_global_token]) \
-        == num_y_separators
+    assert prompt_token_ids.count(vocab[hf_processor.tile_token]) == num_x_separators
+    assert (
+        prompt_token_ids.count(vocab[hf_processor.tile_global_token])
+        == num_y_separators
+    )

     # image token offsets
     img_locs = processed_inputs["mm_placeholders"].get("image", [])
     assert len(img_locs) == num_imgs
-    assert [img_loc.offset for img_loc in img_locs] == \
-        [i for i, v in enumerate(prompt_token_ids) \
-         if v == config.boi_token_index]
+    assert [img_loc.offset for img_loc in img_locs] == [
+        i for i, v in enumerate(prompt_token_ids) if v == config.boi_token_index
+    ]

     # patch sizes and masks
-    num_patches_per_chunk = processor.info.get_patch_per_chunk(
-        config.vision_config)
-    assert prompt_token_ids.count(config.image_token_index) \
-        == sum(mm_data["patches_per_image"]) * num_patches_per_chunk
-    assert len(mm_data["pixel_values"]) \
-        == sum(mm_data["patches_per_image"])
+    num_patches_per_chunk = processor.info.get_patch_per_chunk(config.vision_config)
+    assert (
+        prompt_token_ids.count(config.image_token_index)
+        == sum(mm_data["patches_per_image"]) * num_patches_per_chunk
+    )
+    assert len(mm_data["pixel_values"]) == sum(mm_data["patches_per_image"])
@@ -22,8 +22,9 @@ def _validate_image_max_tokens_one(
     image_size: ImageSize,
 ) -> None:
     info = processor.info
-    feature_size = info.get_num_image_tokens(image_width=image_size.width,
-                                             image_height=image_size.height)
+    feature_size = info.get_num_image_tokens(
+        image_width=image_size.width, image_height=image_size.height
+    )

     try:
         assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
@@ -31,8 +32,9 @@ def _validate_image_max_tokens_one(
         failed_size_excs.append((image_size, exc))


-@pytest.mark.skip("This test takes around 5 minutes to run. "
-                  "Comment this out to run it manually.")
+@pytest.mark.skip(
+    "This test takes around 5 minutes to run. Comment this out to run it manually."
+)
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 def test_processor_max_tokens(model_id):
     ctx = build_model_context(
@@ -66,9 +68,9 @@ def test_processor_max_tokens(model_id):
     pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

     if failed_size_excs:
-        msg = "Found failing image sizes:" \
-            + "\n========\n".join(f"[{size}]\n{exc}"
-                                  for size, exc in failed_size_excs)
+        msg = "Found failing image sizes:" + "\n========\n".join(
+            f"[{size}]\n{exc}" for size, exc in failed_size_excs
+        )
         raise AssertionError(msg)


@@ -94,8 +96,10 @@ def _validate_image_prompt_replacements_one(

         # NOTE: There is a BOS token
         assert first_placeholder.offset == 1
-        assert first_placeholder.length == (
-            len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
+        assert (
+            first_placeholder.length
+            == (len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
+        )

     except Exception as exc:
         failed_size_excs.append((image_size, exc))
@@ -122,9 +126,9 @@ def _test_image_prompt_replacements(
     pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

     if failed_size_excs:
-        msg = "Found failing image sizes:" \
-            + "\n========\n".join(f"[{size}]\n{exc}"
-                                  for size, exc in failed_size_excs)
+        msg = "Found failing image sizes:" + "\n========\n".join(
+            f"[{size}]\n{exc}" for size, exc in failed_size_excs
+        )
         raise AssertionError(msg)


@@ -138,11 +142,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
     )
     processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)

-    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
-                    (488, 183), (2560, 1669)]
+    image_ratios = [
+        (171, 152),
+        (184, 161),
+        (198, 176),
+        (333, 296),
+        (369, 328),
+        (488, 183),
+        (2560, 1669),
+    ]
     image_sizes = [
-        size for w, h in image_ratios
-        for size in [ImageSize(w, h), ImageSize(h, w)]
+        size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
     ]

     _test_image_prompt_replacements(
@@ -152,8 +162,9 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
     )


-@pytest.mark.skip("This test takes around 2 hours to run. "
-                  "Comment this out to run it manually.")
+@pytest.mark.skip(
+    "This test takes around 2 hours to run. Comment this out to run it manually."
+)
 @pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
 @pytest.mark.parametrize("num_imgs", [1])
 def test_processor_prompt_replacements_all(model_id, num_imgs):
@@ -22,8 +22,9 @@ def _validate_image_max_tokens_one(
     image_size: ImageSize,
 ) -> None:
     info = processor.info
-    feature_size = info.get_num_image_tokens(image_width=image_size.width,
-                                             image_height=image_size.height)
+    feature_size = info.get_num_image_tokens(
+        image_width=image_size.width, image_height=image_size.height
+    )

     try:
         assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
@@ -31,10 +32,10 @@ def _validate_image_max_tokens_one(
         failed_size_excs.append((image_size, exc))


-@pytest.mark.skip("This test takes around 5 minutes to run. "
-                  "Comment this out to run it manually.")
-@pytest.mark.parametrize("model_id",
-                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+@pytest.mark.skip(
+    "This test takes around 5 minutes to run. Comment this out to run it manually."
+)
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
 def test_processor_max_tokens(model_id):
     ctx = build_model_context(
         model_id,
@@ -67,9 +68,9 @@ def test_processor_max_tokens(model_id):
     pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

     if failed_size_excs:
-        msg = "Found failing image sizes:" \
-            + "\n========\n".join(f"[{size}]\n{exc}"
-                                  for size, exc in failed_size_excs)
+        msg = "Found failing image sizes:" + "\n========\n".join(
+            f"[{size}]\n{exc}" for size, exc in failed_size_excs
+        )
         raise AssertionError(msg)


@@ -94,8 +95,10 @@ def _validate_image_prompt_replacements_one(
         first_placeholder = image_placeholders[0]

         assert first_placeholder.offset == 0
-        assert first_placeholder.length == len(
-            processed_inputs["prompt_token_ids"]) // num_imgs
+        assert (
+            first_placeholder.length
+            == len(processed_inputs["prompt_token_ids"]) // num_imgs
+        )
     except Exception as exc:
         failed_size_excs.append((image_size, exc))

@@ -121,14 +124,13 @@ def _test_image_prompt_replacements(
     pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")

     if failed_size_excs:
-        msg = "Found failing image sizes:" \
-            + "\n========\n".join(f"[{size}]\n{exc}"
-                                  for size, exc in failed_size_excs)
+        msg = "Found failing image sizes:" + "\n========\n".join(
+            f"[{size}]\n{exc}" for size, exc in failed_size_excs
+        )
         raise AssertionError(msg)


-@pytest.mark.parametrize("model_id",
-                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
 @pytest.mark.parametrize("num_imgs", [1, 2])
 def test_processor_prompt_replacements_regression(model_id, num_imgs):
     ctx = build_model_context(
@@ -138,11 +140,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
     )
     processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)

-    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
-                    (488, 183), (2560, 1669)]
+    image_ratios = [
+        (171, 152),
+        (184, 161),
+        (198, 176),
+        (333, 296),
+        (369, 328),
+        (488, 183),
+        (2560, 1669),
+    ]
     image_sizes = [
-        size for w, h in image_ratios
-        for size in [ImageSize(w, h), ImageSize(h, w)]
+        size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
     ]

     _test_image_prompt_replacements(
@@ -152,10 +160,10 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
     )


-@pytest.mark.skip("This test takes around 2 hours to run. "
-                  "Comment this out to run it manually.")
-@pytest.mark.parametrize("model_id",
-                         ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
+@pytest.mark.skip(
+    "This test takes around 2 hours to run. Comment this out to run it manually."
+)
+@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
 @pytest.mark.parametrize("num_imgs", [1])
 def test_processor_prompt_replacements_all(model_id, num_imgs):
     ctx = build_model_context(
@@ -61,17 +61,17 @@ def _test_image_prompt_replacements(
     num_imgs: int,
     image_sizes: list[ImageSize],
 ) -> None:
-
     failed_size_excs = list[tuple[ImageSize, Exception]]()

     for size in image_sizes:
-        _validate_image_prompt_replacements_one(processor, num_imgs,
-                                                failed_size_excs, size)
+        _validate_image_prompt_replacements_one(
+            processor, num_imgs, failed_size_excs, size
+        )

     if failed_size_excs:
-        msg = "Found failing image sizes:" \
-            + "\n========\n".join(f"[{size}]\n{exc}"
-                                  for size, exc in failed_size_excs)
+        msg = "Found failing image sizes:" + "\n========\n".join(
+            f"[{size}]\n{exc}" for size, exc in failed_size_excs
+        )
         raise AssertionError(msg)


@@ -85,11 +85,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
     )
     processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)

-    image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
-                    (488, 183), (2560, 1669)]
+    image_ratios = [
+        (171, 152),
+        (184, 161),
+        (198, 176),
+        (333, 296),
+        (369, 328),
+        (488, 183),
+        (2560, 1669),
+    ]
     image_sizes = [
-        size for w, h in image_ratios
-        for size in [ImageSize(w, h), ImageSize(h, w)]
+        size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
     ]

     _test_image_prompt_replacements(
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for mllama's multimodal preprocessing and profiling."""
+
 import pytest
 from torch import prod
 from transformers import Llama4Config
@@ -47,14 +48,17 @@ def test_profiling(model_id: str, max_model_len: int):
     image_size = hf_config.vision_config.image_size
     patch_size = hf_config.vision_config.patch_size
     downsample_ratio = int(
-        round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2)))
-    tokens_per_patch = ((image_size // patch_size)**2) // downsample_ratio
+        round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2))
+    )
+    tokens_per_patch = ((image_size // patch_size) ** 2) // downsample_ratio
     chunks_per_image = prod(mm_data["patches_per_image"])
     total_num_patches = chunks_per_image * tokens_per_patch
-    num_tiles = mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][
-        1]  # x-y separator tokens
-    total_tokens = total_num_patches.item() + num_tiles.item(
-    ) + 3  # image start, image, image end
+    num_tiles = (
+        mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][1]
+    )  # x-y separator tokens
+    total_tokens = (
+        total_num_patches.item() + num_tiles.item() + 3
+    )  # image start, image, image end

     profiled_tokens = profiler.get_mm_max_contiguous_tokens(
         max_model_len,
@@ -63,5 +67,6 @@ def test_profiling(model_id: str, max_model_len: int):

     assert total_tokens == profiled_tokens["image"]
     assert total_tokens == sum(
-        placeholder.length for placeholder in
-        decoder_dummy_data.multi_modal_placeholders["image"])
+        placeholder.length
+        for placeholder in decoder_dummy_data.multi_modal_placeholders["image"]
+    )

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for Nemotron-Nano-VL's multimodal preprocessing kwargs."""
+
 from collections.abc import Mapping
 from typing import Optional

@@ -24,7 +25,9 @@ def _get_expected_num_patches(
     max_num: int,
 ):
     from vllm.model_executor.models.nemotron_vl import (
-        calculate_nemotron_vl_targets, get_nemotron_vl_target_ratios)
+        calculate_nemotron_vl_targets,
+        get_nemotron_vl_target_ratios,
+    )

     width, height = image.size

@@ -63,22 +66,21 @@ def _run_check(

     total_expected_num_patches = sum(
         _get_expected_num_patches(config, image, len(images), min_num, max_num)
-        for image in images)
+        for image in images
+    )
     print(total_expected_num_patches)
     processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)

     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = tokenizer.convert_tokens_to_ids("<image>")
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"].get_data(
-    )["pixel_values_flat"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
     print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape)
     assert img_tok_count == 256 * total_expected_num_patches
     assert pixel_shape[0] == total_expected_num_patches


-@pytest.mark.parametrize("model_id",
-                         ["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
+@pytest.mark.parametrize("model_id", ["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
 @pytest.mark.parametrize(
     "size_factors",
     [
@@ -125,10 +127,7 @@ def test_processor_override(

     _run_check(
         processor,
-        [
-            rescale_image_size(image_assets[0].pil_image, f)
-            for f in size_factors
-        ],
+        [rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
         min_num,
         max_num,
         hf_processor_mm_kwargs,
@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for phi3v's multimodal preprocessing kwargs."""
+
 import pytest

 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -18,7 +19,8 @@ from ...utils import build_model_context
         ({"num_crops": 16}, 1921),
         # the default num_crops of phi-3.5-vision is 4
         ({}, 757),
-    ])
+    ],
+)
 # yapf: enable
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for phi4mm's multimodal preprocessing kwargs."""
+
 import pytest

 from vllm.multimodal import MULTIMODAL_REGISTRY
@@ -18,7 +19,8 @@ from ...utils import build_model_context
         ({"dynamic_hd": 16}, 4433),
         # the default num_crops of phi-4-multimodal is 36
         ({}, 9585),
-    ])
+    ],
+)
 # yapf: enable
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
@@ -46,8 +48,7 @@ def test_processor_override(
     img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
     prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"

-    image_size = ctx.get_hf_config(
-    ).embd_layer["image_embd_layer"]["crop_size"]
+    image_size = ctx.get_hf_config().embd_layer["image_embd_layer"]["crop_size"]
     dummy_image_size = (image_size * 7, image_size * 7)
     dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
     mm_data = {"image": [dummy_image] * num_imgs}
@@ -56,5 +57,6 @@ def test_processor_override(

     # Ensure we have the right number of placeholders per num_crops size
     img_tok_count = processed_inputs["prompt_token_ids"].count(
-        _IMAGE_PLACEHOLDER_TOKEN_ID)
+        _IMAGE_PLACEHOLDER_TOKEN_ID
+    )
     assert img_tok_count == expected_toks_per_img * num_imgs

@@ -12,10 +12,12 @@ from ...utils import build_model_context
 @pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
 # yapf: disable
 @pytest.mark.parametrize(
-    ("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"), [
+    ("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"),
+    [
         ({}, 1426, (5704, 1176)),
         ({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)),
-    ])
+    ],
+)
 # yapf: enable
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
@@ -48,8 +50,7 @@ def test_processor_override(
     hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
     image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
     img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
-    pixel_shape = processed_inputs["mm_kwargs"].get_data(
-    )["pixel_values"].shape
+    pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values"].shape

     assert img_tok_count == expected_toks_per_img * num_imgs
     assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs

@@ -1,6 +1,7 @@
 # SPDX-License-Identifier: Apache-2.0
 # SPDX-FileCopyrightText: Copyright contributors to the vLLM project
 """Tests for smolvlm's multimodal preprocessing kwargs."""
+
 import pytest
 from transformers import SmolVLMConfig

@@ -17,7 +18,8 @@ from ...utils import build_model_context
     [
         ({"max_image_size": {"longest_edge": 384}}, 1377),
         ({"max_image_size": {"longest_edge": 768}}, 405),
-    ])
+    ],
+)
 # yapf: enable
 @pytest.mark.parametrize("num_imgs", [1, 2])
 @pytest.mark.parametrize("kwargs_on_init", [True, False])
@@ -42,8 +44,11 @@ def test_processor_override(
     hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs

     # Build the image str / prompt based on the number of images we pass
-    placeholders = "<image>" if num_imgs == 1 else "\n".join(
-        f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
+    placeholders = (
+        "<image>"
+        if num_imgs == 1
+        else "\n".join(f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
+    )
     prompt = f"<|im_start|>User:{placeholders}\n<end_of_utterance>\nAssistant:"  # noqa: E501

     # Build mm_data
@@ -57,8 +62,7 @@ def test_processor_override(
     # Ensure the placeholders format are correct
     hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
     hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
-    assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
-        "input_ids"][0]
+    assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]

     # Ensure we have the right number of placeholders per num_crops size
     image_token_id = ctx.get_hf_config().image_token_id

@@ -9,23 +9,29 @@ from typing import Any, Union
 import numpy as np
 import pytest
 import torch.nn as nn
-from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
-                                                       UserMessage)
+from mistral_common.protocol.instruct.messages import ImageChunk, TextChunk, UserMessage
 from mistral_common.protocol.instruct.request import ChatCompletionRequest
 from PIL import Image

 from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
-from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions,
-                                    ImageDummyOptions, VideoDummyOptions)
-from vllm.distributed import (cleanup_dist_env_and_memory,
-                              init_distributed_environment,
-                              initialize_model_parallel)
+from vllm.config.multimodal import (
+    AudioDummyOptions,
+    BaseDummyOptions,
+    ImageDummyOptions,
+    VideoDummyOptions,
+)
+from vllm.distributed import (
+    cleanup_dist_env_and_memory,
+    init_distributed_environment,
+    initialize_model_parallel,
+)
 from vllm.model_executor.model_loader.utils import set_default_torch_dtype
-from vllm.model_executor.models.interfaces import (SupportsMultiModal,
-                                                   supports_multimodal)
+from vllm.model_executor.models.interfaces import (
+    SupportsMultiModal,
+    supports_multimodal,
+)
 from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
-from vllm.multimodal.processing import (BaseMultiModalProcessor,
-                                        InputProcessingContext)
+from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
 from vllm.multimodal.utils import group_mm_kwargs_by_modality
 from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
 from vllm.utils import is_list_of
@@ -48,13 +54,15 @@ REPO_ID_TO_SKIP = {
 }

 ImageInput = list[Image.Image]
-VideoInput = Union[list[Image.Image], list[np.ndarray],
-                   list[tuple[np.ndarray, dict[str, Any]]]]
+VideoInput = Union[
+    list[Image.Image], list[np.ndarray], list[tuple[np.ndarray, dict[str, Any]]]
+]
 AudioInput = list[tuple[np.ndarray, int]]


-def _resize_data(_data: Union[Image.Image, np.ndarray],
-                 size_factor: float) -> Union[Image.Image, np.ndarray]:
+def _resize_data(
+    _data: Union[Image.Image, np.ndarray], size_factor: float
+) -> Union[Image.Image, np.ndarray]:
     assert size_factor <= 1, "Size factor must be less than 1"
     # Image input
     if isinstance(_data, Image.Image):
@@ -74,20 +82,18 @@ def _resize_data(
         return _data[..., :T, :H, :W, :C]
     # Audio input
     elif isinstance(_data, np.ndarray) and _data.ndim == 1:
-        return _data[:int(len(_data) * size_factor)]
+        return _data[: int(len(_data) * size_factor)]
     raise AssertionError("This line should be unreachable.")


 def resize_mm_data(
-        data: Union[ImageInput, VideoInput, AudioInput],
-        size_factors: tuple[float,
-                            ...]) -> Union[ImageInput, VideoInput, AudioInput]:
-    size_factors = size_factors[:len(data)]
+    data: Union[ImageInput, VideoInput, AudioInput], size_factors: tuple[float, ...]
+) -> Union[ImageInput, VideoInput, AudioInput]:
+    size_factors = size_factors[: len(data)]
     if is_list_of(data, (Image.Image, np.ndarray, list)):
         return [_resize_data(d, s) for d, s in zip(data, size_factors)]
     elif is_list_of(data, tuple):
-        return [(_resize_data(d, s), meta)
-                for (d, meta), s in zip(data, size_factors)]
+        return [(_resize_data(d, s), meta) for (d, meta), s in zip(data, size_factors)]
     raise ValueError("Unsupported multimodal data type.")


@@ -116,12 +122,16 @@ def create_batched_mm_kwargs(
     # Mistral chat outputs tokens directly, rather than text prompts
     if model_config.tokenizer_mode == "mistral":
         images = resized_mm_data.get("image", [])
-        request = ChatCompletionRequest(messages=[
-            UserMessage(content=[
-                TextChunk(text=""),
-                *(ImageChunk(image=image) for image in images),
-            ]),
-        ])
+        request = ChatCompletionRequest(
+            messages=[
+                UserMessage(
+                    content=[
+                        TextChunk(text=""),
+                        *(ImageChunk(image=image) for image in images),
+                    ]
+                ),
+            ]
+        )
         tokenizer = processing_info.get_tokenizer()
         res = tokenizer.mistral.encode_chat_completion(request)
         prompt = res.tokens
@@ -133,10 +143,7 @@ def create_batched_mm_kwargs(
         hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
         tokenization_kwargs=processor_inputs.tokenization_kwargs,
     )["mm_kwargs"].require_data()
-    items = [
-        item for modality in supported_mm_limits
-        for item in mm_kwargs[modality]
-    ]
+    items = [item for modality in supported_mm_limits for item in mm_kwargs[modality]]
     return group_mm_kwargs_by_modality(
         items,
         merge_by_field_config=model_cls.merge_by_field_config,
@@ -167,15 +174,17 @@ def initialize_dummy_model(
     cleanup_dist_env_and_memory()


-def get_model_id_to_test(
-        model_arch_list: Iterable[str]) -> list[tuple[str, str]]:
+def get_model_id_to_test(model_arch_list: Iterable[str]) -> list[tuple[str, str]]:
     filtered_results = []
     for model_arch in model_arch_list:
         model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
         if model_info.extras and model_arch in ARCH_NEEDS_EXTRAS:
             available_repos = list(
-                map(lambda model_id: (model_arch, model_id),
-                    [model_info.default, *model_info.extras.values()]))
+                map(
+                    lambda model_id: (model_arch, model_id),
+                    [model_info.default, *model_info.extras.values()],
+                )
+            )
             filtered_results.extend(available_repos)
         else:
             filtered_results.append((model_arch, model_info.default))
@@ -183,8 +192,8 @@ def get_model_id_to_test(


 @pytest.mark.parametrize(
-    "model_arch, model_id",
-    get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys()))
+    "model_arch, model_id", get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys())
+)
 def test_model_tensor_schema(model_arch: str, model_id: str):
     if model_arch in ARCH_TO_SKIP:
         pytest.skip(f"Skipping {model_arch} due to {ARCH_TO_SKIP[model_arch]}")
@@ -193,12 +202,13 @@ def test_model_tensor_schema(model_arch: str, model_id: str):

     model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
     model_info.check_available_online(on_fail="skip")
-    model_info.check_transformers_version(on_fail="skip",
-                                          check_max_version=False)
+    model_info.check_transformers_version(on_fail="skip", check_max_version=False)

-    hf_overrides_fn = partial(dummy_hf_overrides,
-                              model_arch=model_arch,
-                              exist_overrides=model_info.hf_overrides)
+    hf_overrides_fn = partial(
+        dummy_hf_overrides,
+        model_arch=model_arch,
+        exist_overrides=model_info.hf_overrides,
+    )

     model_config = ModelConfig(
         model_id,
@@ -256,8 +266,11 @@ def test_model_tensor_schema(model_arch: str, model_id: str):

     with initialize_dummy_model(model_cls, model_config) as model:
         for modality, _, mm_kwargs in create_batched_mm_kwargs(
-                model_cls, model_config, processor):
+            model_cls, model_config, processor
+        ):
             for method_name in inputs_parse_methods:
-                print(f"Testing `{method_name}` with modality={modality} "
-                      f"and mm_kwargs{list(mm_kwargs.keys())}")
+                print(
+                    f"Testing `{method_name}` with modality={modality} "
+                    f"and mm_kwargs{list(mm_kwargs.keys())}"
+                )
                 getattr(model, method_name)(modality=modality, **mm_kwargs)