Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -3,27 +3,40 @@
|
||||
"""Common tests for testing .generate() functionality for single / multiple
|
||||
image, embedding, and video support for different VLMs in vLLM.
|
||||
"""
|
||||
|
||||
import math
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from pathlib import PosixPath
|
||||
|
||||
import pytest
|
||||
from transformers import (AutoModel, AutoModelForImageTextToText,
|
||||
AutoModelForTextToWaveform)
|
||||
from transformers import (
|
||||
AutoModel,
|
||||
AutoModelForImageTextToText,
|
||||
AutoModelForTextToWaveform,
|
||||
)
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import identity
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, AudioTestAssets, HfRunner,
|
||||
ImageTestAssets, VideoTestAssets, VllmRunner)
|
||||
from ....utils import (create_new_process_for_each_test, large_gpu_mark,
|
||||
multi_gpu_marks)
|
||||
from ....conftest import (
|
||||
IMAGE_ASSETS,
|
||||
AudioTestAssets,
|
||||
HfRunner,
|
||||
ImageTestAssets,
|
||||
VideoTestAssets,
|
||||
VllmRunner,
|
||||
)
|
||||
from ....utils import create_new_process_for_each_test, large_gpu_mark, multi_gpu_marks
|
||||
from ...utils import check_outputs_equal
|
||||
from .vlm_utils import custom_inputs, model_utils, runners
|
||||
from .vlm_utils.case_filtering import get_parametrized_options
|
||||
from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
|
||||
VLMTestInfo, VLMTestType)
|
||||
from .vlm_utils.types import (
|
||||
CustomTestOptions,
|
||||
ExpandableVLMTestArgs,
|
||||
VLMTestInfo,
|
||||
VLMTestType,
|
||||
)
|
||||
|
||||
# This hack is needed for phi3v & paligemma models
|
||||
# ROCm Triton FA can run into shared memory issues with these models,
|
||||
@@ -828,7 +841,7 @@ def _mark_splits(
|
||||
new_test_settings = dict[str, VLMTestInfo]()
|
||||
|
||||
for i in range(num_groups):
|
||||
models_in_group = models[i * split_size:(i + 1) * split_size]
|
||||
models_in_group = models[i * split_size : (i + 1) * split_size]
|
||||
|
||||
for model in models_in_group:
|
||||
for info in test_infos_by_model[model]:
|
||||
@@ -859,7 +872,8 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.IMAGE,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
),
|
||||
)
|
||||
def test_single_image_models(
|
||||
tmp_path: PosixPath,
|
||||
model_type: str,
|
||||
@@ -885,7 +899,8 @@ def test_single_image_models(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.MULTI_IMAGE,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
),
|
||||
)
|
||||
def test_multi_image_models(
|
||||
tmp_path: PosixPath,
|
||||
model_type: str,
|
||||
@@ -911,7 +926,8 @@ def test_multi_image_models(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.EMBEDDING,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
),
|
||||
)
|
||||
def test_image_embedding_models(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
@@ -935,7 +951,8 @@ def test_image_embedding_models(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.VIDEO,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
),
|
||||
)
|
||||
def test_video_models(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
@@ -959,7 +976,8 @@ def test_video_models(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.AUDIO,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
),
|
||||
)
|
||||
def test_audio_models(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
@@ -983,7 +1001,8 @@ def test_audio_models(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
),
|
||||
)
|
||||
def test_custom_inputs_models(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
@@ -1006,7 +1025,8 @@ def test_custom_inputs_models(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.IMAGE,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
),
|
||||
)
|
||||
@create_new_process_for_each_test()
|
||||
def test_single_image_models_heavy(
|
||||
tmp_path: PosixPath,
|
||||
@@ -1033,7 +1053,8 @@ def test_single_image_models_heavy(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.MULTI_IMAGE,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
),
|
||||
)
|
||||
@create_new_process_for_each_test()
|
||||
def test_multi_image_models_heavy(
|
||||
tmp_path: PosixPath,
|
||||
@@ -1060,7 +1081,8 @@ def test_multi_image_models_heavy(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.EMBEDDING,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
),
|
||||
)
|
||||
@create_new_process_for_each_test()
|
||||
def test_image_embedding_models_heavy(
|
||||
model_type: str,
|
||||
@@ -1085,7 +1107,8 @@ def test_image_embedding_models_heavy(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.VIDEO,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
),
|
||||
)
|
||||
def test_video_models_heavy(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
@@ -1109,7 +1132,8 @@ def test_video_models_heavy(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.AUDIO,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
),
|
||||
)
|
||||
def test_audio_models_heavy(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
@@ -1133,7 +1157,8 @@ def test_audio_models_heavy(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
),
|
||||
)
|
||||
@create_new_process_for_each_test()
|
||||
def test_custom_inputs_models_heavy(
|
||||
model_type: str,
|
||||
|
||||
@@ -10,8 +10,7 @@ from transformers import AutoModelForSpeechSeq2Seq
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from ....conftest import (AudioTestAssets, HfRunner, PromptAudioInput,
|
||||
VllmRunner)
|
||||
from ....conftest import AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner
|
||||
from ...registry import HF_EXAMPLE_MODELS
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
@@ -64,50 +63,49 @@ def run_test(
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=1,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"audio": 1},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enable_lora=True,
|
||||
max_lora_rank=64,
|
||||
enforce_eager=True,
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=1,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"audio": 1},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enable_lora=True,
|
||||
max_lora_rank=64,
|
||||
enforce_eager=True,
|
||||
) as vllm_model:
|
||||
lora_request = LoRARequest("audio", 1, audio_lora_path)
|
||||
vllm_outputs_per_case = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
audios=audios,
|
||||
lora_request=lora_request)
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
audios=audios,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
for prompts, audios in inputs
|
||||
]
|
||||
|
||||
with hf_runner(model, dtype=dtype,
|
||||
auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
|
||||
|
||||
with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
|
||||
hf_processor = hf_model.processor
|
||||
eos_token_id = hf_processor.tokenizer.eos_token_id
|
||||
|
||||
hf_outputs_per_case = [
|
||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
audios=[audios],
|
||||
eos_token_id=eos_token_id)
|
||||
hf_model.generate_greedy_logprobs_limit(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
audios=[audios],
|
||||
eos_token_id=eos_token_id,
|
||||
)
|
||||
for prompts, audios in inputs
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
|
||||
vllm_outputs_per_case):
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=[
|
||||
vllm_to_hf_output(output) for output in vllm_outputs
|
||||
],
|
||||
outputs_1_lst=[vllm_to_hf_output(output) for output in vllm_outputs],
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
@@ -118,9 +116,16 @@ def run_test(
|
||||
@pytest.mark.parametrize("max_model_len", [2048])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_models(hf_runner, vllm_runner, model: str,
|
||||
audio_assets: AudioTestAssets, dtype: str, max_model_len: int,
|
||||
max_tokens: int, num_logprobs: int) -> None:
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model: str,
|
||||
audio_assets: AudioTestAssets,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
@@ -28,8 +28,7 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
|
||||
give the same result.
|
||||
"""
|
||||
|
||||
image_cherry = convert_image_mode(
|
||||
ImageAsset("cherry_blossom").pil_image, "RGB")
|
||||
image_cherry = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
|
||||
image_stop = convert_image_mode(ImageAsset("stop_sign").pil_image, "RGB")
|
||||
images = [image_cherry, image_stop]
|
||||
video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays
|
||||
@@ -47,29 +46,30 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
|
||||
),
|
||||
]
|
||||
|
||||
with vllm_runner(model,
|
||||
runner="generate",
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": 2},
|
||||
max_model_len=32768,
|
||||
max_num_seqs=2,
|
||||
tensor_parallel_size=1,
|
||||
enforce_eager=True) as vllm_model:
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": 2},
|
||||
max_model_len=32768,
|
||||
max_num_seqs=2,
|
||||
tensor_parallel_size=1,
|
||||
enforce_eager=True,
|
||||
) as vllm_model:
|
||||
vllm_outputs_per_case = [
|
||||
vllm_model.generate_greedy(prompts,
|
||||
max_tokens,
|
||||
images=images,
|
||||
videos=videos)
|
||||
vllm_model.generate_greedy(
|
||||
prompts, max_tokens, images=images, videos=videos
|
||||
)
|
||||
for prompts, images, videos in inputs
|
||||
]
|
||||
|
||||
all_results = [output[0][1] for output in vllm_outputs_per_case]
|
||||
outputs = [(total_str, total_str.find("assistant\n") + len("assistant\n"))
|
||||
for total_str in all_results]
|
||||
prompt_lengths = [prompt_len for _, prompt_len in outputs]
|
||||
generated_strs = [
|
||||
total_str[prompt_len:] for total_str, prompt_len in outputs
|
||||
outputs = [
|
||||
(total_str, total_str.find("assistant\n") + len("assistant\n"))
|
||||
for total_str in all_results
|
||||
]
|
||||
prompt_lengths = [prompt_len for _, prompt_len in outputs]
|
||||
generated_strs = [total_str[prompt_len:] for total_str, prompt_len in outputs]
|
||||
interleaved_prompt_len, noninterleaved_prompt_len = prompt_lengths
|
||||
interleaved_output_str, noninterleaved_output_str = generated_strs
|
||||
|
||||
|
||||
@@ -18,13 +18,11 @@ from typing import Any
|
||||
import pytest
|
||||
import torch
|
||||
from safetensors.torch import save_file
|
||||
from transformers import (AutoConfig, AutoProcessor, AutoTokenizer,
|
||||
GenerationConfig)
|
||||
from transformers import AutoConfig, AutoProcessor, AutoTokenizer, GenerationConfig
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.v1.executor.abstract import Executor
|
||||
from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
|
||||
FullAttentionSpec)
|
||||
from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, FullAttentionSpec
|
||||
|
||||
from ....utils import multi_gpu_test
|
||||
|
||||
@@ -93,8 +91,7 @@ def get_rope_layers_config(model_path: str) -> list[int]:
|
||||
|
||||
|
||||
def create_reduced_maverick_model(
|
||||
original_model_name:
|
||||
str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||
original_model_name: str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||
output_dir: str = "/tmp/reduced_maverick",
|
||||
text_layers: int = 4,
|
||||
num_experts: int = 4,
|
||||
@@ -118,7 +115,8 @@ def create_reduced_maverick_model(
|
||||
|
||||
print(
|
||||
f"Creating reduced Maverick model with {text_layers} text layers and "
|
||||
f"{vision_layers} vision layers...")
|
||||
f"{vision_layers} vision layers..."
|
||||
)
|
||||
|
||||
# Create output directory
|
||||
output_path = Path(output_dir)
|
||||
@@ -126,19 +124,23 @@ def create_reduced_maverick_model(
|
||||
if force_recreate:
|
||||
shutil.rmtree(output_path)
|
||||
else:
|
||||
print(f"Output directory {output_dir} already exists. "
|
||||
"Use --force-recreate to overwrite.")
|
||||
print(
|
||||
f"Output directory {output_dir} already exists. "
|
||||
"Use --force-recreate to overwrite."
|
||||
)
|
||||
return str(output_path)
|
||||
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
try:
|
||||
print("Loading original model configuration...")
|
||||
original_config = AutoConfig.from_pretrained(original_model_name,
|
||||
trust_remote_code=True)
|
||||
original_config = AutoConfig.from_pretrained(
|
||||
original_model_name, trust_remote_code=True
|
||||
)
|
||||
print("Creating reduced configuration...")
|
||||
reduced_config = create_reduced_config(original_config, text_layers,
|
||||
num_experts, vision_layers)
|
||||
reduced_config = create_reduced_config(
|
||||
original_config, text_layers, num_experts, vision_layers
|
||||
)
|
||||
|
||||
config_path = output_path / "config.json"
|
||||
with open(config_path, "w") as f:
|
||||
@@ -149,8 +151,7 @@ def create_reduced_maverick_model(
|
||||
copy_tokenizer_files(original_model_name, output_path)
|
||||
|
||||
print("Creating reduced safetensors files...")
|
||||
create_reduced_safetensors(original_config, reduced_config,
|
||||
output_path)
|
||||
create_reduced_safetensors(original_config, reduced_config, output_path)
|
||||
|
||||
print("Creating preprocessor config...")
|
||||
create_preprocessor_config(original_config, output_path)
|
||||
@@ -173,9 +174,9 @@ def create_reduced_maverick_model(
|
||||
raise
|
||||
|
||||
|
||||
def create_reduced_config(original_config: Any, text_layers: int,
|
||||
num_experts: int,
|
||||
vision_layers: int) -> dict[str, Any]:
|
||||
def create_reduced_config(
|
||||
original_config: Any, text_layers: int, num_experts: int, vision_layers: int
|
||||
) -> dict[str, Any]:
|
||||
"""Create a reduced configuration based on the original."""
|
||||
|
||||
# Convert config to dictionary
|
||||
@@ -185,23 +186,18 @@ def create_reduced_config(original_config: Any, text_layers: int,
|
||||
if "text_config" in config_dict:
|
||||
original_text_layers = config_dict["text_config"]["num_hidden_layers"]
|
||||
config_dict["text_config"]["num_hidden_layers"] = text_layers
|
||||
print(
|
||||
f"Reduced text layers from {original_text_layers} to {text_layers}"
|
||||
)
|
||||
print(f"Reduced text layers from {original_text_layers} to {text_layers}")
|
||||
|
||||
original_num_experts = config_dict["text_config"]["num_local_experts"]
|
||||
config_dict["text_config"]["num_local_experts"] = num_experts
|
||||
print(
|
||||
f"Reduced num experts from {original_num_experts} to {num_experts}"
|
||||
)
|
||||
print(f"Reduced num experts from {original_num_experts} to {num_experts}")
|
||||
|
||||
hidden_dim_divisor = 4
|
||||
|
||||
original_hidden_size = config_dict["text_config"]["hidden_size"]
|
||||
new_hidden_size = original_hidden_size // hidden_dim_divisor
|
||||
config_dict["text_config"]["hidden_size"] = new_hidden_size
|
||||
print(f"Reduced hidden size from {original_hidden_size} to "
|
||||
f"{new_hidden_size}")
|
||||
print(f"Reduced hidden size from {original_hidden_size} to {new_hidden_size}")
|
||||
|
||||
original_head_dim = config_dict["text_config"]["head_dim"]
|
||||
new_head_dim = original_head_dim // hidden_dim_divisor
|
||||
@@ -210,15 +206,12 @@ def create_reduced_config(original_config: Any, text_layers: int,
|
||||
|
||||
# Reduce vision layers
|
||||
if "vision_config" in config_dict:
|
||||
original_vision_layers = config_dict["vision_config"][
|
||||
"num_hidden_layers"]
|
||||
original_vision_layers = config_dict["vision_config"]["num_hidden_layers"]
|
||||
config_dict["vision_config"]["num_hidden_layers"] = vision_layers
|
||||
print(f"Reduced vision layers from {original_vision_layers} "
|
||||
f"to {vision_layers}")
|
||||
print(f"Reduced vision layers from {original_vision_layers} to {vision_layers}")
|
||||
|
||||
# Update model name to indicate it's a reduced version
|
||||
config_dict["_name_or_path"] = (
|
||||
f"reduced_maverick_{text_layers}t_{vision_layers}v")
|
||||
config_dict["_name_or_path"] = f"reduced_maverick_{text_layers}t_{vision_layers}v"
|
||||
|
||||
return config_dict
|
||||
|
||||
@@ -227,16 +220,16 @@ def copy_tokenizer_files(original_model_name: str, output_path: Path) -> None:
|
||||
"""Copy tokenizer files from the original model."""
|
||||
|
||||
try:
|
||||
tokenizer = AutoTokenizer.from_pretrained(original_model_name,
|
||||
trust_remote_code=True)
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
original_model_name, trust_remote_code=True
|
||||
)
|
||||
tokenizer.save_pretrained(output_path)
|
||||
print("Tokenizer files copied successfully")
|
||||
except Exception as e:
|
||||
print(f"Warning: Could not copy tokenizer files: {e}")
|
||||
|
||||
|
||||
def create_preprocessor_config(original_config: Any,
|
||||
output_path: Path) -> None:
|
||||
def create_preprocessor_config(original_config: Any, output_path: Path) -> None:
|
||||
"""Create preprocessor_config.json for multimodal model."""
|
||||
|
||||
# Try to load the original preprocessor config
|
||||
@@ -254,9 +247,9 @@ def create_preprocessor_config(original_config: Any,
|
||||
raise
|
||||
|
||||
|
||||
def create_reduced_safetensors(original_config: Any, reduced_config: dict[str,
|
||||
Any],
|
||||
output_path: Path) -> None:
|
||||
def create_reduced_safetensors(
|
||||
original_config: Any, reduced_config: dict[str, Any], output_path: Path
|
||||
) -> None:
|
||||
"""Create safetensors files with weights for the reduced model."""
|
||||
|
||||
print("Generating synthetic weights for reduced model...")
|
||||
@@ -279,8 +272,7 @@ def create_reduced_safetensors(original_config: Any, reduced_config: dict[str,
|
||||
save_weights_to_safetensors(weights, output_path)
|
||||
|
||||
|
||||
def create_text_model_weights(
|
||||
text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
|
||||
def create_text_model_weights(text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
|
||||
"""Create synthetic weights for the text model with MoE structure."""
|
||||
|
||||
weights = {}
|
||||
@@ -291,19 +283,18 @@ def create_text_model_weights(
|
||||
intermediate_size_mlp = text_config["intermediate_size_mlp"]
|
||||
num_layers = text_config["num_hidden_layers"]
|
||||
num_attention_heads = text_config["num_attention_heads"]
|
||||
num_key_value_heads = text_config.get("num_key_value_heads",
|
||||
num_attention_heads)
|
||||
num_key_value_heads = text_config.get("num_key_value_heads", num_attention_heads)
|
||||
|
||||
# MoE specific parameters
|
||||
num_experts = text_config.get("num_local_experts")
|
||||
assert (num_experts
|
||||
is not None), "num_local_experts must be specified for MoE"
|
||||
assert num_experts is not None, "num_local_experts must be specified for MoE"
|
||||
|
||||
head_dim = hidden_size // num_attention_heads
|
||||
|
||||
# Embedding layers
|
||||
weights["language_model.model.embed_tokens.weight"] = torch.randn(
|
||||
vocab_size, hidden_size, dtype=torch.float16)
|
||||
vocab_size, hidden_size, dtype=torch.float16
|
||||
)
|
||||
|
||||
# Transformer layers
|
||||
for layer_idx in range(num_layers):
|
||||
@@ -312,95 +303,105 @@ def create_text_model_weights(
|
||||
|
||||
# Self-attention weights (separate q, k, v projections)
|
||||
weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
|
||||
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16)
|
||||
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
|
||||
hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16)
|
||||
hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
|
||||
num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16)
|
||||
num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
|
||||
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16)
|
||||
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16
|
||||
)
|
||||
print("Self-attention weights created.")
|
||||
|
||||
# Feed-forward weights - MoE pattern based on interleave_moe_layer_step
|
||||
# For interleave_moe_layer_step=2: layers 1,3,5,... are MoE, layers
|
||||
# 0,2,4,... are dense
|
||||
interleave_step = text_config.get("interleave_moe_layer_step", 1)
|
||||
is_moe_layer = (interleave_step > 0
|
||||
and (layer_idx + 1) % interleave_step == 0)
|
||||
is_moe_layer = interleave_step > 0 and (layer_idx + 1) % interleave_step == 0
|
||||
|
||||
if is_moe_layer:
|
||||
# MoE layer structure
|
||||
# 1. Router weights
|
||||
weights[
|
||||
f"{layer_prefix}.feed_forward.router.weight"] = torch.randn(
|
||||
num_experts, hidden_size, dtype=torch.float16)
|
||||
weights[f"{layer_prefix}.feed_forward.router.weight"] = torch.randn(
|
||||
num_experts, hidden_size, dtype=torch.float16
|
||||
)
|
||||
|
||||
# 2. Individual expert weights (not fused)
|
||||
for expert_idx in range(num_experts):
|
||||
expert_prefix = (
|
||||
f"{layer_prefix}.feed_forward.experts.{expert_idx}")
|
||||
expert_prefix = f"{layer_prefix}.feed_forward.experts.{expert_idx}"
|
||||
|
||||
weights[f"{expert_prefix}.gate_proj.weight"] = torch.randn(
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16)
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{expert_prefix}.up_proj.weight"] = torch.randn(
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16)
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{expert_prefix}.down_proj.weight"] = torch.randn(
|
||||
hidden_size, intermediate_size, dtype=torch.bfloat16)
|
||||
hidden_size, intermediate_size, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
# Expert weight scales (FP8 quantization)
|
||||
weights[
|
||||
f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
|
||||
intermediate_size, 1, dtype=torch.bfloat16)
|
||||
weights[f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
|
||||
intermediate_size, 1, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{expert_prefix}.up_proj.weight_scale"] = torch.ones(
|
||||
intermediate_size, 1, dtype=torch.bfloat16)
|
||||
weights[
|
||||
f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
|
||||
hidden_size, 1, dtype=torch.bfloat16)
|
||||
intermediate_size, 1, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
|
||||
hidden_size, 1, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
# 3. Shared expert weights
|
||||
shared_expert_prefix = f"{layer_prefix}.feed_forward.shared_expert"
|
||||
weights[f"{shared_expert_prefix}.gate_proj.weight"] = torch.randn(
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16)
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{shared_expert_prefix}.up_proj.weight"] = torch.randn(
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16)
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{shared_expert_prefix}.down_proj.weight"] = torch.randn(
|
||||
hidden_size, intermediate_size, dtype=torch.bfloat16)
|
||||
hidden_size, intermediate_size, dtype=torch.bfloat16
|
||||
)
|
||||
print(f"MoE feed-forward weights created for layer {layer_idx}.")
|
||||
else:
|
||||
# Dense layer structure
|
||||
weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = (
|
||||
torch.randn(intermediate_size_mlp,
|
||||
hidden_size,
|
||||
dtype=torch.bfloat16))
|
||||
weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = (
|
||||
torch.randn(intermediate_size_mlp,
|
||||
hidden_size,
|
||||
dtype=torch.bfloat16))
|
||||
weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = (
|
||||
torch.randn(hidden_size,
|
||||
intermediate_size_mlp,
|
||||
dtype=torch.bfloat16))
|
||||
weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = torch.randn(
|
||||
intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = torch.randn(
|
||||
intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = torch.randn(
|
||||
hidden_size, intermediate_size_mlp, dtype=torch.bfloat16
|
||||
)
|
||||
print(f"Dense feed-forward weights created for layer {layer_idx}.")
|
||||
|
||||
# Layer norms
|
||||
weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
weights[
|
||||
f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
print("Layer norms created.")
|
||||
|
||||
# Final layer norm and output projection
|
||||
weights["language_model.model.norm.weight"] = torch.ones(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights["language_model.lm_head.weight"] = torch.randn(
|
||||
vocab_size, hidden_size, dtype=torch.bfloat16)
|
||||
vocab_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
return weights
|
||||
|
||||
|
||||
def create_vision_model_weights(
|
||||
vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
|
||||
vision_config: dict[str, Any],
|
||||
) -> dict[str, torch.Tensor]:
|
||||
"""Create synthetic weights for the vision model."""
|
||||
|
||||
weights = {}
|
||||
@@ -414,47 +415,62 @@ def create_vision_model_weights(
|
||||
layer_prefix = f"vision_model.model.layers.{layer_idx}"
|
||||
|
||||
weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
|
||||
hidden_size, hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.q_proj.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
|
||||
hidden_size, hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.k_proj.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
|
||||
hidden_size, hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.v_proj.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
|
||||
hidden_size, hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.o_proj.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
weights[f"{layer_prefix}.mlp.fc1.weight"] = torch.randn(
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16)
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.mlp.fc1.bias"] = torch.zeros(
|
||||
intermediate_size, dtype=torch.bfloat16)
|
||||
intermediate_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.mlp.fc2.weight"] = torch.randn(
|
||||
hidden_size, intermediate_size, dtype=torch.bfloat16)
|
||||
hidden_size, intermediate_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.mlp.fc2.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.input_layernorm.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
weights[
|
||||
f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.post_attention_layernorm.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
return weights
|
||||
|
||||
|
||||
def create_shared_weights(
|
||||
text_config: dict[str, Any],
|
||||
vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
|
||||
text_config: dict[str, Any], vision_config: dict[str, Any]
|
||||
) -> dict[str, torch.Tensor]:
|
||||
"""Create weights for shared components (vision-language connector)"""
|
||||
|
||||
weights = {}
|
||||
@@ -464,13 +480,15 @@ def create_shared_weights(
|
||||
|
||||
# Vision-language connector (projects vision features to text space)
|
||||
weights["multi_modal_projector.linear_1.weight"] = torch.randn(
|
||||
text_hidden_size, projector_input_dim, dtype=torch.bfloat16)
|
||||
text_hidden_size, projector_input_dim, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
return weights
|
||||
|
||||
|
||||
def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
|
||||
output_path: Path) -> None:
|
||||
def save_weights_to_safetensors(
|
||||
weights: dict[str, torch.Tensor], output_path: Path
|
||||
) -> None:
|
||||
"""Save weights to safetensors files and create index."""
|
||||
|
||||
# Determine how to shard the weights
|
||||
@@ -507,18 +525,18 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
|
||||
else:
|
||||
# Multiple shards
|
||||
for i, shard in enumerate(shards):
|
||||
filename = f"model-{i+1:05d}-of-{len(shards):05d}.safetensors"
|
||||
filename = f"model-{i + 1:05d}-of-{len(shards):05d}.safetensors"
|
||||
save_file(shard, output_path / filename)
|
||||
for name in shard:
|
||||
weight_map[name] = filename
|
||||
print(f"Saved shard {i+1}/{len(shards)}: {filename}")
|
||||
print(f"Saved shard {i + 1}/{len(shards)}: {filename}")
|
||||
|
||||
# Create index file
|
||||
index_data = {
|
||||
"metadata": {
|
||||
"total_size":
|
||||
sum(tensor.numel() * tensor.element_size()
|
||||
for tensor in weights.values())
|
||||
"total_size": sum(
|
||||
tensor.numel() * tensor.element_size() for tensor in weights.values()
|
||||
)
|
||||
},
|
||||
"weight_map": weight_map,
|
||||
}
|
||||
@@ -528,8 +546,9 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
|
||||
json.dump(index_data, f, indent=2)
|
||||
|
||||
print(f"Created index file: {index_path}")
|
||||
print(f"Total model size: "
|
||||
f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB")
|
||||
print(
|
||||
f"Total model size: {index_data['metadata']['total_size'] / (1024**3):.2f} GB"
|
||||
)
|
||||
|
||||
|
||||
def check_attention_spec_interleaved_rope(
|
||||
@@ -540,8 +559,7 @@ def check_attention_spec_interleaved_rope(
|
||||
):
|
||||
"""Check that the attention spec is correct."""
|
||||
assert isinstance(llm.llm_engine.model_executor, Executor)
|
||||
kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs(
|
||||
)
|
||||
kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs()
|
||||
for rank in range(num_ranks):
|
||||
kv_cache_specs = kv_cache_specs_per_rank[rank]
|
||||
assert len(kv_cache_specs.keys()) == num_attention_layers
|
||||
@@ -551,16 +569,14 @@ def check_attention_spec_interleaved_rope(
|
||||
else:
|
||||
expected_spec = ChunkedLocalAttentionSpec
|
||||
assert isinstance(
|
||||
kv_cache_specs[
|
||||
f"language_model.model.layers.{i}.self_attn.attn"],
|
||||
expected_spec)
|
||||
kv_cache_specs[f"language_model.model.layers.{i}.self_attn.attn"],
|
||||
expected_spec,
|
||||
)
|
||||
|
||||
|
||||
def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
|
||||
"""Test the created reduced model with vLLM."""
|
||||
sampling_params = SamplingParams(temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=50)
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50)
|
||||
|
||||
if should_profile:
|
||||
llm.start_profile()
|
||||
@@ -571,15 +587,15 @@ def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
|
||||
print("Test generation successful!")
|
||||
for output in outputs:
|
||||
print(f"Prompt: {output.prompt}")
|
||||
print(f"Output: "
|
||||
f"{output.outputs[0].text}")
|
||||
print(f"Output: {output.outputs[0].text}")
|
||||
print("-" * 40)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize(
|
||||
"original_model_name,text_layers,num_experts,vision_layers,",
|
||||
[("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)])
|
||||
[("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)],
|
||||
)
|
||||
@pytest.mark.parametrize("enforce_eager", [True, False])
|
||||
@pytest.mark.parametrize("tp,ep", [(2, True)])
|
||||
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
|
||||
@@ -640,7 +656,8 @@ def main():
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Create a reduced-layer Maverick model")
|
||||
description="Create a reduced-layer Maverick model"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
default="/tmp/reduced_maverick",
|
||||
@@ -652,10 +669,7 @@ def main():
|
||||
default=4,
|
||||
help="Number of text transformer layers",
|
||||
)
|
||||
parser.add_argument("--num-experts",
|
||||
type=int,
|
||||
default=4,
|
||||
help="Number of experts")
|
||||
parser.add_argument("--num-experts", type=int, default=4, help="Number of experts")
|
||||
parser.add_argument(
|
||||
"--vision-layers",
|
||||
type=int,
|
||||
@@ -667,12 +681,12 @@ def main():
|
||||
action="store_true",
|
||||
help="Force recreation if output directory exists",
|
||||
)
|
||||
parser.add_argument("--test",
|
||||
action="store_true",
|
||||
help="Test the created model with vLLM")
|
||||
parser.add_argument("--profile",
|
||||
action="store_true",
|
||||
help="Profile the created model with vLLM")
|
||||
parser.add_argument(
|
||||
"--test", action="store_true", help="Test the created model with vLLM"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--profile", action="store_true", help="Profile the created model with vLLM"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--test-original",
|
||||
action="store_true",
|
||||
@@ -687,16 +701,18 @@ def main():
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.test:
|
||||
test_dummy_maverick(original_model_name=args.original_model,
|
||||
output_dir=args.output_dir,
|
||||
text_layers=args.text_layers,
|
||||
num_experts=args.num_experts,
|
||||
vision_layers=args.vision_layers,
|
||||
force_recreate=args.force_recreate,
|
||||
tp=2,
|
||||
ep=True,
|
||||
enforce_eager=True,
|
||||
profile=args.profile)
|
||||
test_dummy_maverick(
|
||||
original_model_name=args.original_model,
|
||||
output_dir=args.output_dir,
|
||||
text_layers=args.text_layers,
|
||||
num_experts=args.num_experts,
|
||||
vision_layers=args.vision_layers,
|
||||
force_recreate=args.force_recreate,
|
||||
tp=2,
|
||||
ep=True,
|
||||
enforce_eager=True,
|
||||
profile=args.profile,
|
||||
)
|
||||
|
||||
if args.test_original:
|
||||
run_maverick_serving(args.original_model)
|
||||
|
||||
@@ -14,26 +14,35 @@ from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
|
||||
PromptImageInput, VllmRunner)
|
||||
from ....conftest import (
|
||||
IMAGE_ASSETS,
|
||||
HfRunner,
|
||||
PromptAudioInput,
|
||||
PromptImageInput,
|
||||
VllmRunner,
|
||||
)
|
||||
from ....utils import large_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
"<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
"cherry_blossom":
|
||||
"<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
})
|
||||
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
"cherry_blossom": "<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
}
|
||||
)
|
||||
HF_MULTIIMAGE_IMAGE_PROMPT = (
|
||||
"<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
|
||||
)
|
||||
|
||||
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct",
|
||||
revision="refs/pr/70")
|
||||
model_path = snapshot_download(
|
||||
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
|
||||
)
|
||||
# Since the vision-lora and speech-lora co-exist with the base model,
|
||||
# we have to manually specify the path of the lora weights.
|
||||
vision_lora_path = os.path.join(model_path, "vision-lora")
|
||||
speech_question = os.path.join(model_path, "examples",
|
||||
"what_is_shown_in_this_image.wav")
|
||||
speech_question = os.path.join(
|
||||
model_path, "examples", "what_is_shown_in_this_image.wav"
|
||||
)
|
||||
models = [model_path]
|
||||
|
||||
target_dtype = "half"
|
||||
@@ -48,8 +57,7 @@ if current_platform.is_rocm():
|
||||
def run_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
inputs: Sequence[tuple[list[str], PromptImageInput,
|
||||
Optional[PromptAudioInput]]],
|
||||
inputs: Sequence[tuple[list[str], PromptImageInput, Optional[PromptAudioInput]]],
|
||||
model: str,
|
||||
*,
|
||||
max_model_len: int,
|
||||
@@ -75,28 +83,30 @@ def run_test(
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(
|
||||
model,
|
||||
task="generate",
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=2,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": mm_limit},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enable_lora=True,
|
||||
max_lora_rank=320,
|
||||
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
|
||||
enforce_eager=True,
|
||||
trust_remote_code=False,
|
||||
model,
|
||||
task="generate",
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=2,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": mm_limit},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enable_lora=True,
|
||||
max_lora_rank=320,
|
||||
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
|
||||
enforce_eager=True,
|
||||
trust_remote_code=False,
|
||||
) as vllm_model:
|
||||
lora_request = LoRARequest("vision", 1, vision_lora_path)
|
||||
vllm_outputs_per_case = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
lora_request=lora_request)
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
for prompts, images, audios in inputs
|
||||
]
|
||||
|
||||
@@ -108,17 +118,18 @@ def run_test(
|
||||
hf_processor = hf_model.processor
|
||||
eos_token_id = hf_processor.tokenizer.eos_token_id
|
||||
hf_outputs_per_case = [
|
||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
eos_token_id=eos_token_id)
|
||||
hf_model.generate_greedy_logprobs_limit(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
eos_token_id=eos_token_id,
|
||||
)
|
||||
for prompts, images, audios in inputs
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
|
||||
vllm_outputs_per_case):
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
@@ -145,16 +156,27 @@ def run_test(
|
||||
@pytest.mark.parametrize("max_model_len", [12800])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
dtype: str, max_model_len: int, max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_image = [(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
None,
|
||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||
inputs_per_image = [
|
||||
(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
None,
|
||||
)
|
||||
for image, prompt in zip(images, HF_IMAGE_PROMPTS)
|
||||
]
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
@@ -189,16 +211,26 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
@pytest.mark.parametrize("max_model_len", [25600])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
|
||||
size_factors, dtype: str, max_model_len: int,
|
||||
max_tokens: int, num_logprobs: int) -> None:
|
||||
def test_multi_images_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_case = [
|
||||
(
|
||||
[HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
|
||||
[[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors],
|
||||
[
|
||||
[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors
|
||||
],
|
||||
None,
|
||||
),
|
||||
]
|
||||
@@ -222,10 +254,15 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
|
||||
@pytest.mark.parametrize("max_model_len", [12800])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
|
||||
max_model_len: int, max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
|
||||
def test_vision_speech_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
# use the example speech question so that the model outputs are reasonable
|
||||
audio = librosa.load(speech_question, sr=16000)
|
||||
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
|
||||
|
||||
@@ -17,31 +17,39 @@ from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal.image import convert_image_mode, rescale_image_size
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
|
||||
PromptImageInput, VllmRunner)
|
||||
from ....conftest import (
|
||||
IMAGE_ASSETS,
|
||||
HfRunner,
|
||||
PromptAudioInput,
|
||||
PromptImageInput,
|
||||
VllmRunner,
|
||||
)
|
||||
from ....utils import large_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
"<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
"cherry_blossom":
|
||||
"<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
})
|
||||
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
"cherry_blossom": "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
}
|
||||
)
|
||||
HF_MULTIIMAGE_IMAGE_PROMPT = (
|
||||
"<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
|
||||
)
|
||||
|
||||
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
|
||||
# Since the vision-lora and speech-lora co-exist with the base model,
|
||||
# we have to manually specify the path of the lora weights.
|
||||
vision_lora_path = os.path.join(model_path, "vision-lora")
|
||||
speech_question = os.path.join(model_path, "examples",
|
||||
"what_is_shown_in_this_image.wav")
|
||||
speech_question = os.path.join(
|
||||
model_path, "examples", "what_is_shown_in_this_image.wav"
|
||||
)
|
||||
models = [model_path]
|
||||
|
||||
|
||||
def vllm_to_hf_output(vllm_output: tuple[list[int], str,
|
||||
Optional[SampleLogprobs]],
|
||||
model: str):
|
||||
def vllm_to_hf_output(
|
||||
vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], model: str
|
||||
):
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
_, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -71,8 +79,7 @@ if current_platform.is_rocm():
|
||||
def run_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
inputs: Sequence[tuple[list[str], PromptImageInput,
|
||||
Optional[PromptAudioInput]]],
|
||||
inputs: Sequence[tuple[list[str], PromptImageInput, Optional[PromptAudioInput]]],
|
||||
model: str,
|
||||
*,
|
||||
max_model_len: int,
|
||||
@@ -98,27 +105,29 @@ def run_test(
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=2,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": mm_limit},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enable_lora=True,
|
||||
max_lora_rank=320,
|
||||
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
|
||||
enforce_eager=True,
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=2,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": mm_limit},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enable_lora=True,
|
||||
max_lora_rank=320,
|
||||
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
|
||||
enforce_eager=True,
|
||||
) as vllm_model:
|
||||
lora_request = LoRARequest("vision", 1, vision_lora_path)
|
||||
vllm_outputs_per_case = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
lora_request=lora_request)
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
for prompts, images, audios in inputs
|
||||
]
|
||||
|
||||
@@ -127,42 +136,36 @@ def run_test(
|
||||
pytest.skip("HF impl is not compatible with current transformers")
|
||||
|
||||
hf_model_kwargs = {"_attn_implementation": "sdpa"}
|
||||
with hf_runner(model, dtype=dtype,
|
||||
model_kwargs=hf_model_kwargs) as hf_model:
|
||||
|
||||
with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model:
|
||||
hf_processor = hf_model.processor
|
||||
eos_token_id = hf_processor.tokenizer.eos_token_id
|
||||
|
||||
def patch_hf_processor(*args,
|
||||
text="",
|
||||
images=None,
|
||||
audio=None,
|
||||
sampling_rate=None,
|
||||
**kwargs):
|
||||
def patch_hf_processor(
|
||||
*args, text="", images=None, audio=None, sampling_rate=None, **kwargs
|
||||
):
|
||||
audios = None
|
||||
if audio is not None and sampling_rate is not None:
|
||||
audios = [(audio, sampling_rate)]
|
||||
return hf_processor(*args,
|
||||
text=text,
|
||||
images=images,
|
||||
audios=audios,
|
||||
**kwargs)
|
||||
return hf_processor(
|
||||
*args, text=text, images=images, audios=audios, **kwargs
|
||||
)
|
||||
|
||||
hf_model.processor = patch_hf_processor
|
||||
|
||||
hf_outputs_per_case = [
|
||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
eos_token_id=eos_token_id,
|
||||
num_logits_to_keep=0)
|
||||
hf_model.generate_greedy_logprobs_limit(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
eos_token_id=eos_token_id,
|
||||
num_logits_to_keep=0,
|
||||
)
|
||||
for prompts, images, audios in inputs
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
|
||||
vllm_outputs_per_case):
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
@@ -189,16 +192,27 @@ def run_test(
|
||||
@pytest.mark.parametrize("max_model_len", [12800])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
dtype: str, max_model_len: int, max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_image = [(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
None,
|
||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||
inputs_per_image = [
|
||||
(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
None,
|
||||
)
|
||||
for image, prompt in zip(images, HF_IMAGE_PROMPTS)
|
||||
]
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
@@ -233,16 +247,26 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
@pytest.mark.parametrize("max_model_len", [25600])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
|
||||
size_factors, dtype: str, max_model_len: int,
|
||||
max_tokens: int, num_logprobs: int) -> None:
|
||||
def test_multi_images_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_case = [
|
||||
(
|
||||
[HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
|
||||
[[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors],
|
||||
[
|
||||
[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors
|
||||
],
|
||||
None,
|
||||
),
|
||||
]
|
||||
@@ -266,10 +290,15 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
|
||||
@pytest.mark.parametrize("max_model_len", [12800])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
|
||||
max_model_len: int, max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
|
||||
def test_vision_speech_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
# use the example speech question so that the model outputs are reasonable
|
||||
audio = librosa.load(speech_question, sr=None)
|
||||
image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
|
||||
|
||||
@@ -37,33 +37,33 @@ PROMPT = "Describe each image in one short sentence."
|
||||
|
||||
|
||||
def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]:
|
||||
return [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": PROMPT,
|
||||
}] + [{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": url
|
||||
}
|
||||
} for url in urls],
|
||||
}]
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": PROMPT,
|
||||
}
|
||||
]
|
||||
+ [{"type": "image_url", "image_url": {"url": url}} for url in urls],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]:
|
||||
return [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"content": PROMPT,
|
||||
}, *({
|
||||
"type": "image",
|
||||
"image": download_image(url)
|
||||
} for url in urls)],
|
||||
}]
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"content": PROMPT,
|
||||
},
|
||||
*({"type": "image", "image": download_image(url)} for url in urls),
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def _create_engine_inputs(urls: list[str]) -> TokensPrompt:
|
||||
@@ -125,11 +125,17 @@ def _dump_outputs_w_logprobs(
|
||||
outputs: OutputsLogprobs,
|
||||
filename: "StrPath",
|
||||
) -> None:
|
||||
json_data = [(tokens, text, [{
|
||||
k: asdict(v)
|
||||
for k, v in token_logprobs.items()
|
||||
} for token_logprobs in (logprobs or [])])
|
||||
for tokens, text, logprobs in outputs]
|
||||
json_data = [
|
||||
(
|
||||
tokens,
|
||||
text,
|
||||
[
|
||||
{k: asdict(v) for k, v in token_logprobs.items()}
|
||||
for token_logprobs in (logprobs or [])
|
||||
],
|
||||
)
|
||||
for tokens, text, logprobs in outputs
|
||||
]
|
||||
|
||||
with open(filename, "w") as f:
|
||||
json.dump(json_data, f)
|
||||
@@ -139,28 +145,35 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
|
||||
with open(filename, "rb") as f:
|
||||
json_data = json.load(f)
|
||||
|
||||
return [(tokens, text, [{
|
||||
int(k): Logprob(**v)
|
||||
for k, v in token_logprobs.items()
|
||||
} for token_logprobs in logprobs]) for tokens, text, logprobs in json_data]
|
||||
return [
|
||||
(
|
||||
tokens,
|
||||
text,
|
||||
[
|
||||
{int(k): Logprob(**v) for k, v in token_logprobs.items()}
|
||||
for token_logprobs in logprobs
|
||||
],
|
||||
)
|
||||
for tokens, text, logprobs in json_data
|
||||
]
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=80)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
|
||||
local_asset_server) -> None:
|
||||
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(
|
||||
FIXTURE_LOGPROBS_CHAT[model])
|
||||
def test_chat(
|
||||
vllm_runner, max_model_len: int, model: str, dtype: str, local_asset_server
|
||||
) -> None:
|
||||
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="mistral",
|
||||
load_format="mistral",
|
||||
config_format="mistral",
|
||||
max_model_len=max_model_len,
|
||||
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="mistral",
|
||||
load_format="mistral",
|
||||
config_format="mistral",
|
||||
max_model_len=max_model_len,
|
||||
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
||||
) as vllm_model:
|
||||
outputs = []
|
||||
|
||||
@@ -180,7 +193,9 @@ def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
|
||||
for i in range(len(logprobs)):
|
||||
assert logprobs[i][-1] is None
|
||||
logprobs[i] = logprobs[i][:-1]
|
||||
check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
|
||||
outputs_1_lst=logprobs,
|
||||
name_0="h100_ref",
|
||||
name_1="output")
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
|
||||
outputs_1_lst=logprobs,
|
||||
name_0="h100_ref",
|
||||
name_1="output",
|
||||
)
|
||||
|
||||
@@ -17,14 +17,15 @@ def qwen2_5_vl_chat_template(*query):
|
||||
return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501
|
||||
|
||||
|
||||
VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
|
||||
"baby_reading":
|
||||
qwen2_5_vl_chat_template(
|
||||
VIDEO_PLACEHOLDER,
|
||||
"Describe this video with a short sentence ",
|
||||
"(no more than 20 words)",
|
||||
),
|
||||
})
|
||||
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
|
||||
{
|
||||
"baby_reading": qwen2_5_vl_chat_template(
|
||||
VIDEO_PLACEHOLDER,
|
||||
"Describe this video with a short sentence ",
|
||||
"(no more than 20 words)",
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@@ -33,10 +34,15 @@ VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
|
||||
@pytest.mark.parametrize("num_frames", [16])
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
|
||||
video_pruning_rate: float,
|
||||
num_frames: int, dtype: str,
|
||||
max_tokens: int) -> None:
|
||||
def test_qwen2_5_vl_evs_functionality(
|
||||
vllm_runner,
|
||||
video_assets,
|
||||
model,
|
||||
video_pruning_rate: float,
|
||||
num_frames: int,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
"""Test EVS (Efficient Video Sampling) functionality with different
|
||||
pruning rates.
|
||||
"""
|
||||
@@ -51,19 +57,18 @@ def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
|
||||
videos = [sampled_vids[0]]
|
||||
|
||||
# Initialize model with EVS configuration
|
||||
with vllm_runner(model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=1,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"video": 1},
|
||||
tensor_parallel_size=1,
|
||||
video_pruning_rate=video_pruning_rate) as vllm_model:
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=1,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"video": 1},
|
||||
tensor_parallel_size=1,
|
||||
video_pruning_rate=video_pruning_rate,
|
||||
) as vllm_model:
|
||||
# Generate output - this should not crash
|
||||
outputs = vllm_model.generate_greedy(prompts,
|
||||
max_tokens,
|
||||
videos=videos)
|
||||
outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
|
||||
|
||||
# Basic validation that we got a response
|
||||
assert len(outputs) == 1
|
||||
@@ -83,10 +88,15 @@ def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
|
||||
@pytest.mark.parametrize("num_frames", [16])
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
def test_qwen2_5_vl_evs_batched_videos(vllm_runner, video_assets, model,
|
||||
video_pruning_rate: float,
|
||||
num_frames: int, dtype: str,
|
||||
max_tokens: int) -> None:
|
||||
def test_qwen2_5_vl_evs_batched_videos(
|
||||
vllm_runner,
|
||||
video_assets,
|
||||
model,
|
||||
video_pruning_rate: float,
|
||||
num_frames: int,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
"""Test EVS functionality with batched videos.
|
||||
|
||||
This test validates that:
|
||||
@@ -102,23 +112,21 @@ def test_qwen2_5_vl_evs_batched_videos(vllm_runner, video_assets, model,
|
||||
|
||||
# Test batched videos
|
||||
prompts = [VIDEO_PROMPTS[0], VIDEO_PROMPTS[0]]
|
||||
videos = [sampled_vids[0],
|
||||
sampled_vids[0]] # Use same video twice for testing
|
||||
videos = [sampled_vids[0], sampled_vids[0]] # Use same video twice for testing
|
||||
|
||||
# Initialize model with EVS configuration
|
||||
with vllm_runner(model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=2,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"video": 2},
|
||||
tensor_parallel_size=1,
|
||||
video_pruning_rate=video_pruning_rate) as vllm_model:
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=2,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"video": 2},
|
||||
tensor_parallel_size=1,
|
||||
video_pruning_rate=video_pruning_rate,
|
||||
) as vllm_model:
|
||||
# Generate output - this should not crash
|
||||
outputs = vllm_model.generate_greedy(prompts,
|
||||
max_tokens,
|
||||
videos=videos)
|
||||
outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
|
||||
|
||||
# Basic validation that we got responses for both videos
|
||||
assert len(outputs) == 2
|
||||
|
||||
@@ -11,8 +11,13 @@ from PIL import Image
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
|
||||
PromptVideoInput, VllmRunner)
|
||||
from ....conftest import (
|
||||
IMAGE_ASSETS,
|
||||
VIDEO_ASSETS,
|
||||
PromptImageInput,
|
||||
PromptVideoInput,
|
||||
VllmRunner,
|
||||
)
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
@@ -34,28 +39,29 @@ def qwen2_vl_chat_template(*query):
|
||||
return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501
|
||||
|
||||
|
||||
IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
qwen2_vl_chat_template(
|
||||
IMAGE_PLACEHOLDER,
|
||||
"What is the biggest text's content in this image?",
|
||||
),
|
||||
"cherry_blossom":
|
||||
qwen2_vl_chat_template(
|
||||
IMAGE_PLACEHOLDER,
|
||||
"What is the season shown in this image? ",
|
||||
"Reply with a short sentence (no more than 20 words)",
|
||||
),
|
||||
})
|
||||
IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": qwen2_vl_chat_template(
|
||||
IMAGE_PLACEHOLDER,
|
||||
"What is the biggest text's content in this image?",
|
||||
),
|
||||
"cherry_blossom": qwen2_vl_chat_template(
|
||||
IMAGE_PLACEHOLDER,
|
||||
"What is the season shown in this image? ",
|
||||
"Reply with a short sentence (no more than 20 words)",
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
|
||||
"baby_reading":
|
||||
qwen2_vl_chat_template(
|
||||
VIDEO_PLACEHOLDER,
|
||||
"Describe this video with a short sentence ",
|
||||
"(no more than 20 words)",
|
||||
),
|
||||
})
|
||||
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
|
||||
{
|
||||
"baby_reading": qwen2_vl_chat_template(
|
||||
VIDEO_PLACEHOLDER,
|
||||
"Describe this video with a short sentence ",
|
||||
"(no more than 20 words)",
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
MULTIIMAGE_PROMPT = qwen2_vl_chat_template(
|
||||
IMAGE_PLACEHOLDER,
|
||||
@@ -77,17 +83,19 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
|
||||
|
||||
|
||||
def batch_make_image_embeddings(
|
||||
image_batches: list[Union[Image.Image, list[Image.Image]]], processor,
|
||||
llm: VllmRunner) -> list[Qwen2VLPromptImageEmbeddingInput]:
|
||||
image_batches: list[Union[Image.Image, list[Image.Image]]],
|
||||
processor,
|
||||
llm: VllmRunner,
|
||||
) -> list[Qwen2VLPromptImageEmbeddingInput]:
|
||||
"""batched image embeddings for Qwen2-VL
|
||||
|
||||
This will infer all images' embeddings in a single batch,
|
||||
This will infer all images' embeddings in a single batch,
|
||||
and split the result according to input batches.
|
||||
|
||||
image_batches:
|
||||
- Single-image batches: `list[Image.Image]`
|
||||
- Multiple-image batches: `list[list[Image.Image]]]`
|
||||
|
||||
|
||||
returns: `list[Qwen2VLPromptImageEmbeddingInput]`
|
||||
"""
|
||||
|
||||
@@ -108,9 +116,9 @@ def batch_make_image_embeddings(
|
||||
# image to pixel values
|
||||
image_processor = processor.image_processor
|
||||
|
||||
preprocess_result = image_processor \
|
||||
.preprocess(images=images, return_tensors="pt") \
|
||||
.data
|
||||
preprocess_result = image_processor.preprocess(
|
||||
images=images, return_tensors="pt"
|
||||
).data
|
||||
pixel_values = preprocess_result["pixel_values"]
|
||||
image_grid_thw = preprocess_result["image_grid_thw"]
|
||||
|
||||
@@ -119,12 +127,13 @@ def batch_make_image_embeddings(
|
||||
with torch.no_grad():
|
||||
visual = model.visual
|
||||
|
||||
pixel_values_on_device = pixel_values.to(visual.device,
|
||||
dtype=visual.dtype)
|
||||
image_grid_thw_on_device = image_grid_thw.to(visual.device,
|
||||
dtype=torch.int64)
|
||||
return visual(pixel_values_on_device,
|
||||
grid_thw=image_grid_thw_on_device).cpu()
|
||||
pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
|
||||
image_grid_thw_on_device = image_grid_thw.to(
|
||||
visual.device, dtype=torch.int64
|
||||
)
|
||||
return visual(
|
||||
pixel_values_on_device, grid_thw=image_grid_thw_on_device
|
||||
).cpu()
|
||||
|
||||
image_embeds = torch.concat(llm.apply_model(get_image_embeds))
|
||||
|
||||
@@ -137,16 +146,21 @@ def batch_make_image_embeddings(
|
||||
merge_size = image_processor.merge_size
|
||||
cur_batch_embed_len = sum(
|
||||
grid_thw.prod(-1) // merge_size // merge_size
|
||||
for grid_thw in image_grid_thw[image_counter:image_counter +
|
||||
cur_batch_image_count])
|
||||
for grid_thw in image_grid_thw[
|
||||
image_counter : image_counter + cur_batch_image_count
|
||||
]
|
||||
)
|
||||
|
||||
result.append({
|
||||
"image_embeds":
|
||||
image_embeds[embed_counter:embed_counter + cur_batch_embed_len],
|
||||
"image_grid_thw":
|
||||
image_grid_thw[image_counter:image_counter +
|
||||
cur_batch_image_count],
|
||||
})
|
||||
result.append(
|
||||
{
|
||||
"image_embeds": image_embeds[
|
||||
embed_counter : embed_counter + cur_batch_embed_len
|
||||
],
|
||||
"image_grid_thw": image_grid_thw[
|
||||
image_counter : image_counter + cur_batch_image_count
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
embed_counter += cur_batch_embed_len
|
||||
image_counter += cur_batch_image_count
|
||||
@@ -160,13 +174,13 @@ def batch_make_image_embeddings(
|
||||
|
||||
|
||||
def batch_make_video_embeddings(
|
||||
video_batches: PromptVideoInput, processor,
|
||||
llm: VllmRunner) -> list[Qwen2VLPromptVideoEmbeddingInput]:
|
||||
video_batches: PromptVideoInput, processor, llm: VllmRunner
|
||||
) -> list[Qwen2VLPromptVideoEmbeddingInput]:
|
||||
"""batched video embeddings for Qwen2-VL
|
||||
|
||||
A NDArray represents a single video's all frames.
|
||||
|
||||
This will infer all videos' embeddings in a single batch,
|
||||
This will infer all videos' embeddings in a single batch,
|
||||
and split the result according to input batches.
|
||||
|
||||
video_batches:
|
||||
@@ -191,9 +205,9 @@ def batch_make_video_embeddings(
|
||||
# video to pixel values
|
||||
image_processor = processor.image_processor
|
||||
|
||||
preprocess_result = image_processor \
|
||||
.preprocess(images=None, videos=videos, return_tensors="pt") \
|
||||
.data
|
||||
preprocess_result = image_processor.preprocess(
|
||||
images=None, videos=videos, return_tensors="pt"
|
||||
).data
|
||||
pixel_values = preprocess_result["pixel_values_videos"]
|
||||
video_grid_thw = preprocess_result["video_grid_thw"]
|
||||
|
||||
@@ -202,12 +216,13 @@ def batch_make_video_embeddings(
|
||||
with torch.no_grad():
|
||||
visual = model.visual
|
||||
|
||||
pixel_values_on_device = pixel_values.to(visual.device,
|
||||
dtype=visual.dtype)
|
||||
video_grid_thw_on_device = video_grid_thw.to(visual.device,
|
||||
dtype=torch.int64)
|
||||
return visual(pixel_values_on_device,
|
||||
grid_thw=video_grid_thw_on_device).cpu()
|
||||
pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
|
||||
video_grid_thw_on_device = video_grid_thw.to(
|
||||
visual.device, dtype=torch.int64
|
||||
)
|
||||
return visual(
|
||||
pixel_values_on_device, grid_thw=video_grid_thw_on_device
|
||||
).cpu()
|
||||
|
||||
video_embeds = torch.concat(llm.apply_model(get_image_embeds))
|
||||
|
||||
@@ -220,16 +235,21 @@ def batch_make_video_embeddings(
|
||||
merge_size = image_processor.merge_size
|
||||
cur_batch_embed_len = sum(
|
||||
grid_thw.prod(-1) // merge_size // merge_size
|
||||
for grid_thw in video_grid_thw[video_counter:video_counter +
|
||||
cur_batch_video_count])
|
||||
for grid_thw in video_grid_thw[
|
||||
video_counter : video_counter + cur_batch_video_count
|
||||
]
|
||||
)
|
||||
|
||||
result.append({
|
||||
"video_embeds":
|
||||
video_embeds[embed_counter:embed_counter + cur_batch_embed_len],
|
||||
"video_grid_thw":
|
||||
video_grid_thw[video_counter:video_counter +
|
||||
cur_batch_video_count],
|
||||
})
|
||||
result.append(
|
||||
{
|
||||
"video_embeds": video_embeds[
|
||||
embed_counter : embed_counter + cur_batch_embed_len
|
||||
],
|
||||
"video_grid_thw": video_grid_thw[
|
||||
video_counter : video_counter + cur_batch_video_count
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
embed_counter += cur_batch_embed_len
|
||||
video_counter += cur_batch_video_count
|
||||
@@ -263,25 +283,24 @@ def run_embedding_input_test(
|
||||
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=3,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={
|
||||
"image": mm_limit,
|
||||
"video": mm_limit
|
||||
},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
default_torch_num_threads=1,
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=3,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": mm_limit, "video": mm_limit},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
outputs_per_case_for_original_input = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images or None,
|
||||
videos=videos or None)
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images or None,
|
||||
videos=videos or None,
|
||||
)
|
||||
for prompts, images, videos in inputs
|
||||
]
|
||||
|
||||
@@ -290,17 +309,19 @@ def run_embedding_input_test(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=batch_make_image_embeddings(
|
||||
images, processor, vllm_model) if images else None,
|
||||
videos=batch_make_video_embeddings(
|
||||
videos, processor, vllm_model) if videos else None)
|
||||
images=batch_make_image_embeddings(images, processor, vllm_model)
|
||||
if images
|
||||
else None,
|
||||
videos=batch_make_video_embeddings(videos, processor, vllm_model)
|
||||
if videos
|
||||
else None,
|
||||
)
|
||||
for prompts, images, videos in inputs
|
||||
]
|
||||
|
||||
for outputs_for_original_input, \
|
||||
outputs_for_embeddings_input \
|
||||
in zip(outputs_per_case_for_original_input,
|
||||
outputs_per_case_for_embeddings_input):
|
||||
for outputs_for_original_input, outputs_for_embeddings_input in zip(
|
||||
outputs_per_case_for_original_input, outputs_per_case_for_embeddings_input
|
||||
):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=outputs_for_original_input,
|
||||
outputs_1_lst=outputs_for_embeddings_input,
|
||||
@@ -325,17 +346,26 @@ def run_embedding_input_test(
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
|
||||
size_factors, dtype, max_tokens,
|
||||
num_logprobs, monkeypatch) -> None:
|
||||
def test_qwen2_vl_image_embeddings_input(
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_case: list[tuple[
|
||||
list[str], PromptImageInput, PromptVideoInput]] = [(
|
||||
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
|
||||
(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
[],
|
||||
) for image, prompt in zip(images, IMAGE_PROMPTS)]
|
||||
)
|
||||
for image, prompt in zip(images, IMAGE_PROMPTS)
|
||||
]
|
||||
|
||||
run_embedding_input_test(
|
||||
vllm_runner,
|
||||
@@ -366,21 +396,27 @@ def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
|
||||
model, size_factors,
|
||||
dtype: str, max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
def test_qwen2_vl_multiple_image_embeddings_input(
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_case: list[tuple[list[str], PromptImageInput,
|
||||
PromptVideoInput]] = [(
|
||||
[MULTIIMAGE_PROMPT for _ in size_factors],
|
||||
[[
|
||||
rescale_image_size(image, factor)
|
||||
for image in images
|
||||
] for factor in size_factors],
|
||||
[],
|
||||
)]
|
||||
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
|
||||
(
|
||||
[MULTIIMAGE_PROMPT for _ in size_factors],
|
||||
[
|
||||
[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors
|
||||
],
|
||||
[],
|
||||
)
|
||||
]
|
||||
|
||||
run_embedding_input_test(
|
||||
vllm_runner,
|
||||
@@ -410,22 +446,29 @@ def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model,
|
||||
size_factors, dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
def test_qwen2_vl_video_embeddings_input(
|
||||
vllm_runner,
|
||||
video_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
num_frames = 4
|
||||
sampled_vids = [
|
||||
sample_frames_from_video(asset.np_ndarrays, num_frames)
|
||||
for asset in video_assets
|
||||
]
|
||||
|
||||
inputs_per_case: list[tuple[
|
||||
list[str], PromptImageInput, PromptVideoInput]] = [(
|
||||
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
|
||||
(
|
||||
[prompt for _ in size_factors],
|
||||
[],
|
||||
[rescale_video_size(video, factor) for factor in size_factors],
|
||||
) for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)]
|
||||
)
|
||||
for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)
|
||||
]
|
||||
|
||||
run_embedding_input_test(
|
||||
vllm_runner,
|
||||
|
||||
@@ -15,12 +15,12 @@ from ...registry import HF_EXAMPLE_MODELS
|
||||
|
||||
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
|
||||
|
||||
AUDIO_PROMPTS = AUDIO_ASSETS.prompts({
|
||||
"mary_had_lamb":
|
||||
"Transcribe this into English.",
|
||||
"winning_call":
|
||||
"What is happening in this audio clip?",
|
||||
})
|
||||
AUDIO_PROMPTS = AUDIO_ASSETS.prompts(
|
||||
{
|
||||
"mary_had_lamb": "Transcribe this into English.",
|
||||
"winning_call": "What is happening in this audio clip?",
|
||||
}
|
||||
)
|
||||
|
||||
MULTI_AUDIO_PROMPT = "Describe each of the audios above."
|
||||
|
||||
@@ -33,7 +33,7 @@ CHUNKED_PREFILL_KWARGS = {
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_seqs": 2,
|
||||
# Use a very small limit to exercise chunked prefill.
|
||||
"max_num_batched_tokens": 16
|
||||
"max_num_batched_tokens": 16,
|
||||
}
|
||||
|
||||
|
||||
@@ -43,27 +43,33 @@ def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
|
||||
for key, value in params_kwargs.items():
|
||||
if isinstance(value, bool):
|
||||
if value:
|
||||
args.append(f"--{key.replace('_','-')}")
|
||||
args.append(f"--{key.replace('_', '-')}")
|
||||
else:
|
||||
args.append(f"--{key.replace('_','-')}={value}")
|
||||
args.append(f"--{key.replace('_', '-')}={value}")
|
||||
return args
|
||||
|
||||
|
||||
@pytest.fixture(params=[
|
||||
pytest.param({}, marks=pytest.mark.cpu_model),
|
||||
pytest.param(CHUNKED_PREFILL_KWARGS),
|
||||
])
|
||||
@pytest.fixture(
|
||||
params=[
|
||||
pytest.param({}, marks=pytest.mark.cpu_model),
|
||||
pytest.param(CHUNKED_PREFILL_KWARGS),
|
||||
]
|
||||
)
|
||||
def server(request, audio_assets: AudioTestAssets):
|
||||
args = [
|
||||
"--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager",
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"4096",
|
||||
"--enforce-eager",
|
||||
"--limit-mm-per-prompt",
|
||||
json.dumps({"audio": len(audio_assets)}), "--trust-remote-code"
|
||||
json.dumps({"audio": len(audio_assets)}),
|
||||
"--trust-remote-code",
|
||||
] + params_kwargs_to_cli_args(request.param)
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME,
|
||||
args,
|
||||
env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
|
||||
"30"}) as remote_server:
|
||||
with RemoteOpenAIServer(
|
||||
MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
|
||||
) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@@ -77,12 +83,11 @@ def _get_prompt(audio_count, question, placeholder):
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||
placeholder = f"{placeholder}\n" * audio_count
|
||||
|
||||
return tokenizer.apply_chat_template([{
|
||||
'role': 'user',
|
||||
'content': f"{placeholder}{question}"
|
||||
}],
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
return tokenizer.apply_chat_template(
|
||||
[{"role": "user", "content": f"{placeholder}{question}"}],
|
||||
tokenize=False,
|
||||
add_generation_prompt=True,
|
||||
)
|
||||
|
||||
|
||||
def run_multi_audio_test(
|
||||
@@ -99,19 +104,21 @@ def run_multi_audio_test(
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
enforce_eager=True,
|
||||
limit_mm_per_prompt={
|
||||
"audio":
|
||||
max((len(audio) for _, audio in prompts_and_audios))
|
||||
},
|
||||
**kwargs) as vllm_model:
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
enforce_eager=True,
|
||||
limit_mm_per_prompt={
|
||||
"audio": max((len(audio) for _, audio in prompts_and_audios))
|
||||
},
|
||||
**kwargs,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
[prompt for prompt, _ in prompts_and_audios],
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
audios=[audios for _, audios in prompts_and_audios])
|
||||
audios=[audios for _, audios in prompts_and_audios],
|
||||
)
|
||||
|
||||
# The HuggingFace model doesn't support multiple audios yet, so
|
||||
# just assert that some tokens were generated.
|
||||
@@ -122,21 +129,25 @@ def run_multi_audio_test(
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("vllm_kwargs", [
|
||||
pytest.param({}, marks=pytest.mark.cpu_model),
|
||||
pytest.param(CHUNKED_PREFILL_KWARGS),
|
||||
])
|
||||
def test_models_with_multiple_audios(vllm_runner,
|
||||
audio_assets: AudioTestAssets, dtype: str,
|
||||
max_tokens: int, num_logprobs: int,
|
||||
vllm_kwargs: dict) -> None:
|
||||
|
||||
vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT,
|
||||
VLLM_PLACEHOLDER)
|
||||
@pytest.mark.parametrize(
|
||||
"vllm_kwargs",
|
||||
[
|
||||
pytest.param({}, marks=pytest.mark.cpu_model),
|
||||
pytest.param(CHUNKED_PREFILL_KWARGS),
|
||||
],
|
||||
)
|
||||
def test_models_with_multiple_audios(
|
||||
vllm_runner,
|
||||
audio_assets: AudioTestAssets,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
vllm_kwargs: dict,
|
||||
) -> None:
|
||||
vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT, VLLM_PLACEHOLDER)
|
||||
run_multi_audio_test(
|
||||
vllm_runner,
|
||||
[(vllm_prompt, [audio.audio_and_sample_rate
|
||||
for audio in audio_assets])],
|
||||
[(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
|
||||
MODEL_NAME,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
@@ -149,28 +160,25 @@ def test_models_with_multiple_audios(vllm_runner,
|
||||
async def test_online_serving(client, audio_assets: AudioTestAssets):
|
||||
"""Exercises online serving with/without chunked prefill enabled."""
|
||||
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
*[{
|
||||
"type": "audio_url",
|
||||
"audio_url": {
|
||||
"url": audio.url
|
||||
}
|
||||
} for audio in audio_assets],
|
||||
{
|
||||
"type":
|
||||
"text",
|
||||
"text":
|
||||
f"What's happening in these {len(audio_assets)} audio clips?"
|
||||
},
|
||||
],
|
||||
}]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*[
|
||||
{"type": "audio_url", "audio_url": {"url": audio.url}}
|
||||
for audio in audio_assets
|
||||
],
|
||||
{
|
||||
"type": "text",
|
||||
"text": f"What's happening in these {len(audio_assets)} audio clips?",
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
chat_completion = await client.chat.completions.create(model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_tokens=10)
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME, messages=messages, max_tokens=10
|
||||
)
|
||||
|
||||
assert len(chat_completion.choices) == 1
|
||||
choice = chat_completion.choices[0]
|
||||
|
||||
@@ -6,8 +6,12 @@ import json
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from mistral_common.audio import Audio
|
||||
from mistral_common.protocol.instruct.messages import (AudioChunk, RawAudio,
|
||||
TextChunk, UserMessage)
|
||||
from mistral_common.protocol.instruct.messages import (
|
||||
AudioChunk,
|
||||
RawAudio,
|
||||
TextChunk,
|
||||
UserMessage,
|
||||
)
|
||||
|
||||
from vllm.transformers_utils.tokenizer import MistralTokenizer
|
||||
|
||||
@@ -17,8 +21,12 @@ from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test
|
||||
|
||||
MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"
|
||||
MISTRAL_FORMAT_ARGS = [
|
||||
"--tokenizer_mode", "mistral", "--config_format", "mistral",
|
||||
"--load_format", "mistral"
|
||||
"--tokenizer_mode",
|
||||
"mistral",
|
||||
"--config_format",
|
||||
"mistral",
|
||||
"--load_format",
|
||||
"mistral",
|
||||
]
|
||||
|
||||
|
||||
@@ -30,10 +38,9 @@ def server(request, audio_assets: AudioTestAssets):
|
||||
json.dumps({"audio": len(audio_assets)}),
|
||||
] + MISTRAL_FORMAT_ARGS
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME,
|
||||
args,
|
||||
env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
|
||||
"30"}) as remote_server:
|
||||
with RemoteOpenAIServer(
|
||||
MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
|
||||
) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@@ -64,15 +71,17 @@ def _get_prompt(audio_assets, question):
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models_with_multiple_audios(vllm_runner,
|
||||
audio_assets: AudioTestAssets, dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
def test_models_with_multiple_audios(
|
||||
vllm_runner,
|
||||
audio_assets: AudioTestAssets,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
|
||||
run_multi_audio_test(
|
||||
vllm_runner,
|
||||
[(vllm_prompt, [audio.audio_and_sample_rate
|
||||
for audio in audio_assets])],
|
||||
[(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
|
||||
MODEL_NAME,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
@@ -92,23 +101,22 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
|
||||
return audio_dict
|
||||
|
||||
audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
*audio_chunks,
|
||||
{
|
||||
"type":
|
||||
"text",
|
||||
"text":
|
||||
f"What's happening in these {len(audio_assets)} audio clips?"
|
||||
},
|
||||
],
|
||||
}]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*audio_chunks,
|
||||
{
|
||||
"type": "text",
|
||||
"text": f"What's happening in these {len(audio_assets)} audio clips?",
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
chat_completion = await client.chat.completions.create(model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_tokens=10)
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME, messages=messages, max_tokens=10
|
||||
)
|
||||
|
||||
assert len(chat_completion.choices) == 1
|
||||
choice = chat_completion.choices[0]
|
||||
|
||||
@@ -12,8 +12,7 @@ from ....utils import create_new_process_for_each_test, multi_gpu_test
|
||||
|
||||
PROMPTS = [
|
||||
{
|
||||
"prompt":
|
||||
"<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
|
||||
"prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
|
||||
"multi_modal_data": {
|
||||
"audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
|
||||
},
|
||||
@@ -25,9 +24,8 @@ PROMPTS = [
|
||||
"audio": AudioAsset("winning_call").audio_and_sample_rate,
|
||||
},
|
||||
},
|
||||
"decoder_prompt":
|
||||
"<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
|
||||
}
|
||||
"decoder_prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
|
||||
},
|
||||
]
|
||||
|
||||
EXPECTED = {
|
||||
@@ -41,7 +39,7 @@ EXPECTED = {
|
||||
" is June and the third base. They're going to wave him in. The throw"
|
||||
" to the plate will be late. The Mariners are going to play for the"
|
||||
" American League Championship. I don't believe it. It just continues"
|
||||
" by all five."
|
||||
" by all five.",
|
||||
],
|
||||
"openai/whisper-small": [
|
||||
" The first words I spoke in the original pornograph. A little piece"
|
||||
@@ -51,7 +49,7 @@ EXPECTED = {
|
||||
" comes joy. Here is Junior to third base. They're gonna wave him"
|
||||
" in. The throw to the plate will be late. The Mariners are going to"
|
||||
" play for the American League Championship. I don't believe it. It"
|
||||
" just continues. My, oh my."
|
||||
" just continues. My, oh my.",
|
||||
],
|
||||
"openai/whisper-medium": [
|
||||
" The first words I spoke in the original phonograph, a little piece"
|
||||
@@ -62,7 +60,7 @@ EXPECTED = {
|
||||
" Jorgen at third base. They're going to wave him in. The throw to the"
|
||||
" plate will be late. The Mariners are going to play for the American"
|
||||
" League Championship. I don't believe it. It just continues. My, oh"
|
||||
" my."
|
||||
" my.",
|
||||
],
|
||||
"openai/whisper-large-v3": [
|
||||
" The first words I spoke in the original phonograph, a little piece"
|
||||
@@ -73,7 +71,7 @@ EXPECTED = {
|
||||
" Junior to third base. They're going to wave him in. The throw to the"
|
||||
" plate will be late. The Mariners are going to play for the American"
|
||||
" League Championship. I don't believe it. It just continues. My, oh,"
|
||||
" my."
|
||||
" my.",
|
||||
],
|
||||
"openai/whisper-large-v3-turbo": [
|
||||
" The first words I spoke in the original phonograph, a little piece"
|
||||
@@ -84,8 +82,8 @@ EXPECTED = {
|
||||
" Junior to third base. They're going to wave him in. The throw to the"
|
||||
" plate will be late. The Mariners are going to play for the American"
|
||||
" League Championship. I don't believe it. It just continues. My, oh,"
|
||||
" my."
|
||||
]
|
||||
" my.",
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
@@ -100,11 +98,11 @@ def run_test(
|
||||
expected_list = EXPECTED[model] * 10
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype="half",
|
||||
max_model_len=448,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
model,
|
||||
dtype="half",
|
||||
max_model_len=448,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
) as vllm_model:
|
||||
llm = vllm_model.llm
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Helpers for building inputs that can be leveraged for different test types.
|
||||
"""
|
||||
"""Helpers for building inputs that can be leveraged for different test types."""
|
||||
|
||||
from collections.abc import Iterable
|
||||
from pathlib import PosixPath
|
||||
from typing import Callable, Optional, Union
|
||||
@@ -10,20 +10,30 @@ import torch
|
||||
|
||||
from vllm.multimodal.audio import AudioResampler
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.video import (rescale_video_size, resize_video,
|
||||
sample_frames_from_video)
|
||||
from vllm.multimodal.video import (
|
||||
rescale_video_size,
|
||||
resize_video,
|
||||
sample_frames_from_video,
|
||||
)
|
||||
|
||||
from .....conftest import AudioTestAssets, ImageTestAssets, VideoTestAssets
|
||||
from .types import (SINGLE_AUDIO_BASE_PROMPT, SINGLE_IMAGE_BASE_PROMPTS,
|
||||
TEST_AUDIO_PLACEHOLDER, TEST_IMG_PLACEHOLDER,
|
||||
TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT,
|
||||
ImageSizeWrapper, PromptWithMultiModalInput, SizeType,
|
||||
VLMTestInfo)
|
||||
from .types import (
|
||||
SINGLE_AUDIO_BASE_PROMPT,
|
||||
SINGLE_IMAGE_BASE_PROMPTS,
|
||||
TEST_AUDIO_PLACEHOLDER,
|
||||
TEST_IMG_PLACEHOLDER,
|
||||
TEST_VIDEO_PLACEHOLDER,
|
||||
VIDEO_BASE_PROMPT,
|
||||
ImageSizeWrapper,
|
||||
PromptWithMultiModalInput,
|
||||
SizeType,
|
||||
VLMTestInfo,
|
||||
)
|
||||
|
||||
|
||||
def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int],
|
||||
str],
|
||||
test_placeholder: str) -> str:
|
||||
def replace_test_placeholder(
|
||||
prompt: str, mm_idx_to_prompt: Callable[[int], str], test_placeholder: str
|
||||
) -> str:
|
||||
"""Given a prompt, replaces each test placeholder with the
|
||||
model-specific tag.
|
||||
"""
|
||||
@@ -35,11 +45,13 @@ def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int],
|
||||
return img_prompt
|
||||
|
||||
|
||||
def get_model_prompts(base_prompts: Iterable[str],
|
||||
img_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
video_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
audio_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
prompt_formatter: Callable[[str], str]) -> list[str]:
|
||||
def get_model_prompts(
|
||||
base_prompts: Iterable[str],
|
||||
img_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
video_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
audio_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
prompt_formatter: Callable[[str], str],
|
||||
) -> list[str]:
|
||||
"""Given a model-agnostic base prompt and test configuration for a model(s)
|
||||
to be tested, update the media placeholders and apply the prompt formatting
|
||||
to get the test prompt string for this model.
|
||||
@@ -56,19 +68,19 @@ def get_model_prompts(base_prompts: Iterable[str],
|
||||
# Replace the multimodal placeholders in the base prompt with
|
||||
# the correct ones for the model that we are testing
|
||||
if img_idx_to_prompt:
|
||||
base_prompt = replace_test_placeholder(base_prompt,
|
||||
img_idx_to_prompt,
|
||||
TEST_IMG_PLACEHOLDER)
|
||||
base_prompt = replace_test_placeholder(
|
||||
base_prompt, img_idx_to_prompt, TEST_IMG_PLACEHOLDER
|
||||
)
|
||||
|
||||
if video_idx_to_prompt:
|
||||
base_prompt = replace_test_placeholder(base_prompt,
|
||||
video_idx_to_prompt,
|
||||
TEST_VIDEO_PLACEHOLDER)
|
||||
base_prompt = replace_test_placeholder(
|
||||
base_prompt, video_idx_to_prompt, TEST_VIDEO_PLACEHOLDER
|
||||
)
|
||||
|
||||
if audio_idx_to_prompt:
|
||||
base_prompt = replace_test_placeholder(base_prompt,
|
||||
audio_idx_to_prompt,
|
||||
TEST_AUDIO_PLACEHOLDER)
|
||||
base_prompt = replace_test_placeholder(
|
||||
base_prompt, audio_idx_to_prompt, TEST_AUDIO_PLACEHOLDER
|
||||
)
|
||||
|
||||
# Apply the prompt formatter to wrap the base prompt with
|
||||
# the correct media placeholders to get the model test prompt
|
||||
@@ -84,14 +96,15 @@ def build_single_image_inputs_from_test_info(
|
||||
tmp_path: Optional[PosixPath] = None,
|
||||
) -> list[PromptWithMultiModalInput]:
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError(
|
||||
"Prompt formatter must be set to build single image inputs")
|
||||
raise ValueError("Prompt formatter must be set to build single image inputs")
|
||||
|
||||
model_prompts = get_model_prompts(test_info.single_image_prompts,
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.audio_idx_to_prompt,
|
||||
test_info.prompt_formatter)
|
||||
model_prompts = get_model_prompts(
|
||||
test_info.single_image_prompts,
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.audio_idx_to_prompt,
|
||||
test_info.prompt_formatter,
|
||||
)
|
||||
|
||||
# For models that require a local path / URL encoded in the image; export
|
||||
# assets and encode into tmp_path for this test. This should be avoided
|
||||
@@ -110,8 +123,8 @@ def build_single_image_inputs_from_test_info(
|
||||
|
||||
|
||||
def build_single_image_inputs(
|
||||
images, model_prompts,
|
||||
size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
|
||||
images, model_prompts, size_wrapper: ImageSizeWrapper
|
||||
) -> list[PromptWithMultiModalInput]:
|
||||
# For every image / prompt pair, get a pair containing two lists of
|
||||
# length size_factors, where the first contains duplicates of the model
|
||||
# prompt [str], and the second contains copies of the image after being
|
||||
@@ -125,7 +138,8 @@ def build_single_image_inputs(
|
||||
apply_image_size_scaling(image, size, size_wrapper.type)
|
||||
for size in size_wrapper.data
|
||||
],
|
||||
) for image, prompt in zip(images, model_prompts)
|
||||
)
|
||||
for image, prompt in zip(images, model_prompts)
|
||||
]
|
||||
|
||||
|
||||
@@ -136,14 +150,15 @@ def build_multi_image_inputs_from_test_info(
|
||||
tmp_path: Optional[PosixPath] = None,
|
||||
) -> list[PromptWithMultiModalInput]:
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError(
|
||||
"Prompt formatter must be set to build multi image inputs")
|
||||
raise ValueError("Prompt formatter must be set to build multi image inputs")
|
||||
|
||||
model_prompts = get_model_prompts([test_info.multi_image_prompt],
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.audio_idx_to_prompt,
|
||||
test_info.prompt_formatter)
|
||||
model_prompts = get_model_prompts(
|
||||
[test_info.multi_image_prompt],
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.audio_idx_to_prompt,
|
||||
test_info.prompt_formatter,
|
||||
)
|
||||
|
||||
if test_info.prompt_path_encoder is not None:
|
||||
if tmp_path is None:
|
||||
@@ -164,16 +179,20 @@ def build_multi_image_inputs_from_test_info(
|
||||
|
||||
|
||||
def build_multi_image_inputs(
|
||||
image_lists, model_prompts,
|
||||
size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
|
||||
image_lists, model_prompts, size_wrapper: ImageSizeWrapper
|
||||
) -> list[PromptWithMultiModalInput]:
|
||||
return [
|
||||
PromptWithMultiModalInput(
|
||||
prompts=[prompt for _ in size_wrapper.data],
|
||||
image_data=[[
|
||||
apply_image_size_scaling(image, size, size_wrapper.type)
|
||||
for image in images
|
||||
] for size in size_wrapper.data],
|
||||
) for images, prompt in zip(image_lists, model_prompts)
|
||||
image_data=[
|
||||
[
|
||||
apply_image_size_scaling(image, size, size_wrapper.type)
|
||||
for image in images
|
||||
]
|
||||
for size in size_wrapper.data
|
||||
],
|
||||
)
|
||||
for images, prompt in zip(image_lists, model_prompts)
|
||||
]
|
||||
|
||||
|
||||
@@ -185,10 +204,10 @@ def build_embedding_inputs_from_test_info(
|
||||
# These conditions will always be true if invoked through filtering,
|
||||
# but we still check them in case this is ever called directly
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError(
|
||||
"Prompt formatter must be set to build image embedding inputs")
|
||||
if size_wrapper.type != SizeType.SIZE_FACTOR or not \
|
||||
all(factor == 1.0 for factor in size_wrapper.data):
|
||||
raise ValueError("Prompt formatter must be set to build image embedding inputs")
|
||||
if size_wrapper.type != SizeType.SIZE_FACTOR or not all(
|
||||
factor == 1.0 for factor in size_wrapper.data
|
||||
):
|
||||
raise ValueError("Embedding tests require constant (1.0) size factors")
|
||||
if test_info.convert_assets_to_embeddings is None:
|
||||
raise ValueError("No conversion func for getting embeddings found")
|
||||
@@ -209,8 +228,7 @@ def build_embedding_inputs_from_test_info(
|
||||
assert len(images) == len(model_prompts)
|
||||
|
||||
inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
|
||||
vllm_embeddings = build_single_image_inputs(embeds, model_prompts,
|
||||
size_wrapper)
|
||||
vllm_embeddings = build_single_image_inputs(embeds, model_prompts, size_wrapper)
|
||||
return inputs, vllm_embeddings
|
||||
|
||||
|
||||
@@ -235,21 +253,22 @@ def build_video_inputs_from_test_info(
|
||||
for asset in video_assets
|
||||
]
|
||||
|
||||
video_scaler = (resize_video if size_wrapper.type == SizeType.FIXED_SIZE
|
||||
else rescale_video_size)
|
||||
video_scaler = (
|
||||
resize_video if size_wrapper.type == SizeType.FIXED_SIZE else rescale_video_size
|
||||
)
|
||||
|
||||
return [
|
||||
PromptWithMultiModalInput(
|
||||
prompts=[prompt for _ in size_wrapper.data],
|
||||
video_data=[
|
||||
video_scaler(video, size) for size in size_wrapper.data
|
||||
],
|
||||
) for video, prompt in zip(sampled_vids, model_prompts)
|
||||
video_data=[video_scaler(video, size) for size in size_wrapper.data],
|
||||
)
|
||||
for video, prompt in zip(sampled_vids, model_prompts)
|
||||
]
|
||||
|
||||
|
||||
def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
|
||||
size_type: SizeType):
|
||||
def apply_image_size_scaling(
|
||||
image, size: Union[float, tuple[int, int]], size_type: SizeType
|
||||
):
|
||||
"""Applies a size scaler to one image; this can be an image size factor,
|
||||
which scales the image while maintaining the aspect ratio"""
|
||||
# Special case for embeddings; if it's a tensor, it's only valid if we
|
||||
@@ -285,13 +304,16 @@ def build_audio_inputs_from_test_info(
|
||||
method="librosa",
|
||||
)
|
||||
audios = [asset.audio_and_sample_rate for asset in audio_assets]
|
||||
resampled_audios = [(
|
||||
resampler.resample(
|
||||
audio,
|
||||
orig_sr=sr,
|
||||
),
|
||||
int(resampler.target_sr),
|
||||
) for audio, sr in audios]
|
||||
resampled_audios = [
|
||||
(
|
||||
resampler.resample(
|
||||
audio,
|
||||
orig_sr=sr,
|
||||
),
|
||||
int(resampler.target_sr),
|
||||
)
|
||||
for audio, sr in audios
|
||||
]
|
||||
|
||||
return [
|
||||
PromptWithMultiModalInput(
|
||||
|
||||
@@ -4,19 +4,28 @@
|
||||
modality, getting all combinations (similar to pytest's parametrization),
|
||||
handling multimodal placeholder substitution, and so on.
|
||||
"""
|
||||
|
||||
import itertools
|
||||
from collections import OrderedDict
|
||||
from collections.abc import Iterable
|
||||
|
||||
import pytest
|
||||
|
||||
from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
|
||||
ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
|
||||
from .types import (
|
||||
EMBEDDING_SIZE_FACTORS,
|
||||
ExpandableVLMTestArgs,
|
||||
ImageSizeWrapper,
|
||||
SizeType,
|
||||
VLMTestInfo,
|
||||
VLMTestType,
|
||||
)
|
||||
|
||||
|
||||
def get_filtered_test_settings(
|
||||
test_settings: dict[str, VLMTestInfo], test_type: VLMTestType,
|
||||
new_proc_per_test: bool) -> dict[str, VLMTestInfo]:
|
||||
test_settings: dict[str, VLMTestInfo],
|
||||
test_type: VLMTestType,
|
||||
new_proc_per_test: bool,
|
||||
) -> dict[str, VLMTestInfo]:
|
||||
"""Given the dict of potential test settings to run, return a subdict
|
||||
of tests who have the current test type enabled with the matching val for
|
||||
fork_per_test.
|
||||
@@ -25,7 +34,8 @@ def get_filtered_test_settings(
|
||||
def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType):
|
||||
return test_info.test_type == test_type or (
|
||||
isinstance(test_info.test_type, Iterable)
|
||||
and test_type in test_info.test_type)
|
||||
and test_type in test_info.test_type
|
||||
)
|
||||
|
||||
matching_tests = {}
|
||||
for test_name, test_info in test_settings.items():
|
||||
@@ -36,62 +46,69 @@ def get_filtered_test_settings(
|
||||
assert test_info.convert_assets_to_embeddings is not None
|
||||
# Custom test inputs need to explicitly define the mm limit/inputs
|
||||
if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS):
|
||||
assert (test_info.custom_test_opts is not None
|
||||
and isinstance(test_info.custom_test_opts, Iterable))
|
||||
assert test_info.custom_test_opts is not None and isinstance(
|
||||
test_info.custom_test_opts, Iterable
|
||||
)
|
||||
# For all types besides custom inputs, we need a prompt formatter
|
||||
else:
|
||||
assert test_info.prompt_formatter is not None
|
||||
|
||||
# Everything looks okay; keep if this is correct proc handling
|
||||
if (test_info.distributed_executor_backend
|
||||
is not None) == new_proc_per_test:
|
||||
if (
|
||||
test_info.distributed_executor_backend is not None
|
||||
) == new_proc_per_test:
|
||||
matching_tests[test_name] = test_info
|
||||
|
||||
return matching_tests
|
||||
|
||||
|
||||
def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
|
||||
test_type: VLMTestType,
|
||||
create_new_process_for_each_test: bool):
|
||||
def get_parametrized_options(
|
||||
test_settings: dict[str, VLMTestInfo],
|
||||
test_type: VLMTestType,
|
||||
create_new_process_for_each_test: bool,
|
||||
):
|
||||
"""Converts all of our VLMTestInfo into an expanded list of parameters.
|
||||
This is similar to nesting pytest parametrize calls, but done directly
|
||||
through an itertools product so that each test can set things like
|
||||
size factors etc, while still running in isolated test cases.
|
||||
"""
|
||||
matching_tests = get_filtered_test_settings(
|
||||
test_settings, test_type, create_new_process_for_each_test)
|
||||
test_settings, test_type, create_new_process_for_each_test
|
||||
)
|
||||
|
||||
# Ensure that something is wrapped as an iterable it's not already
|
||||
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )
|
||||
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,)
|
||||
|
||||
def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
|
||||
# This is essentially the same as nesting a bunch of mark.parametrize
|
||||
# decorators, but we do it programmatically to allow overrides for on
|
||||
# a per-model basis, while still being able to execute each of these
|
||||
# as individual test cases in pytest.
|
||||
iter_kwargs = OrderedDict([
|
||||
("model", ensure_wrapped(test_info.models)),
|
||||
("max_tokens", ensure_wrapped(test_info.max_tokens)),
|
||||
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
|
||||
("dtype", ensure_wrapped(test_info.dtype)),
|
||||
("distributed_executor_backend",
|
||||
ensure_wrapped(test_info.distributed_executor_backend)),
|
||||
])
|
||||
iter_kwargs = OrderedDict(
|
||||
[
|
||||
("model", ensure_wrapped(test_info.models)),
|
||||
("max_tokens", ensure_wrapped(test_info.max_tokens)),
|
||||
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
|
||||
("dtype", ensure_wrapped(test_info.dtype)),
|
||||
(
|
||||
"distributed_executor_backend",
|
||||
ensure_wrapped(test_info.distributed_executor_backend),
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
# num_frames is video only
|
||||
if test_type == VLMTestType.VIDEO:
|
||||
iter_kwargs["num_video_frames"] = ensure_wrapped(
|
||||
test_info.num_video_frames)
|
||||
iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
|
||||
|
||||
# No sizes passed for custom inputs, since inputs are directly provided
|
||||
if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO):
|
||||
wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
|
||||
if wrapped_sizes is None:
|
||||
raise ValueError(
|
||||
f"Sizes must be set for test type {test_type}")
|
||||
raise ValueError(f"Sizes must be set for test type {test_type}")
|
||||
iter_kwargs["size_wrapper"] = wrapped_sizes
|
||||
|
||||
#Otherwise expand the custom test options instead
|
||||
# Otherwise expand the custom test options instead
|
||||
elif test_type == VLMTestType.CUSTOM_INPUTS:
|
||||
if test_info.custom_test_opts is None:
|
||||
raise ValueError("Test has type CUSTOM_INPUTS, but none given")
|
||||
@@ -121,8 +138,8 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
|
||||
|
||||
|
||||
def get_wrapped_test_sizes(
|
||||
test_info: VLMTestInfo,
|
||||
test_type: VLMTestType) -> tuple[ImageSizeWrapper, ...]:
|
||||
test_info: VLMTestInfo, test_type: VLMTestType
|
||||
) -> tuple[ImageSizeWrapper, ...]:
|
||||
"""Given a test info which may have size factors or fixed sizes, wrap them
|
||||
and combine them into an iterable, each of which will be used in parameter
|
||||
expansion.
|
||||
@@ -133,18 +150,18 @@ def get_wrapped_test_sizes(
|
||||
"""
|
||||
# If it is an embedding test, we always use the EMBEDDING_SIZE_FACTORS
|
||||
if test_type == VLMTestType.EMBEDDING:
|
||||
return tuple([
|
||||
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
|
||||
for factor in EMBEDDING_SIZE_FACTORS
|
||||
])
|
||||
return tuple(
|
||||
[
|
||||
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
|
||||
for factor in EMBEDDING_SIZE_FACTORS
|
||||
]
|
||||
)
|
||||
# Audio and Custom inputs have preprocessed inputs
|
||||
elif test_type in (VLMTestType.AUDIO, VLMTestType.CUSTOM_INPUTS):
|
||||
return tuple()
|
||||
|
||||
size_factors = test_info.image_size_factors \
|
||||
if test_info.image_size_factors else []
|
||||
fixed_sizes = test_info.image_sizes \
|
||||
if test_info.image_sizes else []
|
||||
size_factors = test_info.image_size_factors if test_info.image_size_factors else []
|
||||
fixed_sizes = test_info.image_sizes if test_info.image_sizes else []
|
||||
|
||||
wrapped_factors = [
|
||||
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
|
||||
@@ -152,8 +169,7 @@ def get_wrapped_test_sizes(
|
||||
]
|
||||
|
||||
wrapped_sizes = [
|
||||
ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size)
|
||||
for size in fixed_sizes
|
||||
ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size) for size in fixed_sizes
|
||||
]
|
||||
|
||||
return tuple(wrapped_factors + wrapped_sizes)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Core test implementation to be shared across modalities."""
|
||||
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
import torch
|
||||
@@ -70,22 +71,23 @@ def run_test(
|
||||
if model_info.hf_overrides:
|
||||
vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
|
||||
if model_info.skip_tokenizer_init:
|
||||
vllm_runner_kwargs_[
|
||||
"skip_tokenizer_init"] = model_info.skip_tokenizer_init
|
||||
vllm_runner_kwargs_["skip_tokenizer_init"] = model_info.skip_tokenizer_init
|
||||
|
||||
if vllm_runner_kwargs:
|
||||
vllm_runner_kwargs_.update(vllm_runner_kwargs)
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=max_num_seqs,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=enforce_eager,
|
||||
runner=runner,
|
||||
**vllm_runner_kwargs_) as vllm_model:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=max_num_seqs,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=enforce_eager,
|
||||
runner=runner,
|
||||
**vllm_runner_kwargs_,
|
||||
) as vllm_model:
|
||||
tokenizer = vllm_model.llm.get_tokenizer()
|
||||
|
||||
vllm_kwargs: dict[str, Any] = {}
|
||||
@@ -95,21 +97,19 @@ def run_test(
|
||||
vllm_kwargs["stop"] = stop_str
|
||||
|
||||
for prompts, image_data, video_data, audio_data in vllm_inputs:
|
||||
mm_data = dict(images=image_data,
|
||||
videos=video_data,
|
||||
audios=audio_data)
|
||||
mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
|
||||
vllm_kwargs_with_mm_data = vllm_kwargs | mm_data
|
||||
vllm_output = vllm_model.generate_greedy_logprobs(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
**vllm_kwargs_with_mm_data)
|
||||
**vllm_kwargs_with_mm_data,
|
||||
)
|
||||
vllm_outputs_per_mm.append(vllm_output)
|
||||
|
||||
hf_model = hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=auto_cls,
|
||||
model_kwargs=hf_model_kwargs)
|
||||
hf_model = hf_runner(
|
||||
model, dtype=dtype, auto_cls=auto_cls, model_kwargs=hf_model_kwargs
|
||||
)
|
||||
|
||||
# Some models need to patch things like the model processor, e.g., internvl
|
||||
if patch_hf_runner is not None:
|
||||
@@ -129,16 +129,15 @@ def run_test(
|
||||
hf_kwargs["stop_strings"] = stop_str
|
||||
|
||||
for prompts, image_data, video_data, audio_data in inputs:
|
||||
mm_data = dict(images=image_data,
|
||||
videos=video_data,
|
||||
audios=audio_data)
|
||||
mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
|
||||
hf_kwargs_with_mm_data = hf_kwargs | mm_data
|
||||
hf_output = hf_model.generate_greedy_logprobs_limit(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tokenizer=tokenizer,
|
||||
**hf_kwargs_with_mm_data)
|
||||
**hf_kwargs_with_mm_data,
|
||||
)
|
||||
hf_outputs_per_mm.append(hf_output)
|
||||
|
||||
# Apply output processing / sanitation to the vLLM and HF runner results
|
||||
@@ -150,8 +149,7 @@ def run_test(
|
||||
second_runner_processor=vllm_output_post_proc,
|
||||
)
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm,
|
||||
vllm_outputs_per_mm):
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm, vllm_outputs_per_mm):
|
||||
# This is usually check_logprobs_close, but it's passed through to
|
||||
# allow things like check_outputs_equal where needed
|
||||
comparator(
|
||||
@@ -171,15 +169,19 @@ def process_runner_outputs(
|
||||
):
|
||||
"""Applies the runner processor(s) to the runner outputs, if any."""
|
||||
if first_runner_processor is not None:
|
||||
first_runner_outputs = process_outputs(first_runner_processor, model,
|
||||
first_runner_outputs)
|
||||
first_runner_outputs = process_outputs(
|
||||
first_runner_processor, model, first_runner_outputs
|
||||
)
|
||||
if second_runner_processor is not None:
|
||||
second_runner_outputs = process_outputs(second_runner_processor, model,
|
||||
second_runner_outputs)
|
||||
second_runner_outputs = process_outputs(
|
||||
second_runner_processor, model, second_runner_outputs
|
||||
)
|
||||
return first_runner_outputs, second_runner_outputs
|
||||
|
||||
|
||||
def process_outputs(output_processor, model, outputs_per_image):
|
||||
"""Applies a model specific post-processor function to a runner's output"""
|
||||
return [[output_processor(res, model) for res in outputs]
|
||||
for outputs in outputs_per_image]
|
||||
return [
|
||||
[output_processor(res, model) for res in outputs]
|
||||
for outputs in outputs_per_image
|
||||
]
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Custom input builders for edge-cases in different models."""
|
||||
|
||||
from typing import Callable
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.video import (rescale_video_size, resize_video,
|
||||
sample_frames_from_video)
|
||||
from vllm.multimodal.video import (
|
||||
rescale_video_size,
|
||||
resize_video,
|
||||
sample_frames_from_video,
|
||||
)
|
||||
|
||||
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
|
||||
from .builders import build_multi_image_inputs, build_single_image_inputs
|
||||
@@ -15,7 +19,7 @@ from .types import ImageSizeWrapper, PromptWithMultiModalInput, SizeType
|
||||
|
||||
def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
|
||||
"""Builds inputs for multi-image (varied sizes/aspect ratio) testing.
|
||||
|
||||
|
||||
Args:
|
||||
formatter: model-specific prompt formatter.
|
||||
"""
|
||||
@@ -41,7 +45,7 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
|
||||
stop_sign,
|
||||
rescale_image_size(stop_sign, 0.25),
|
||||
cherry_blossom.resize((183, 488)),
|
||||
cherry_blossom.resize((488, 183))
|
||||
cherry_blossom.resize((488, 183)),
|
||||
],
|
||||
cherry_blossom,
|
||||
]
|
||||
@@ -54,10 +58,11 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
|
||||
]
|
||||
|
||||
|
||||
def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
|
||||
num_frames: int = 16):
|
||||
def multi_video_multi_aspect_ratio_inputs(
|
||||
formatter: Callable[[str], str], num_frames: int = 16
|
||||
):
|
||||
"""Builds inputs for multi-video (varied sizes/aspect ratio) testing.
|
||||
|
||||
|
||||
Args:
|
||||
formatter: model-specific prompt formatter.
|
||||
"""
|
||||
@@ -81,7 +86,7 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
|
||||
video,
|
||||
rescale_video_size(video, 0.25),
|
||||
resize_video(video, (183, 488)),
|
||||
resize_video(video, (488, 183))
|
||||
resize_video(video, (488, 183)),
|
||||
],
|
||||
video,
|
||||
]
|
||||
@@ -96,7 +101,9 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
|
||||
|
||||
def different_patch_input_cases_internvl():
|
||||
images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
|
||||
formatter = lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
|
||||
formatter = (
|
||||
lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n"
|
||||
) # noqa: E501
|
||||
single_img_prompts = [
|
||||
"<image>\nWhat's the content in the center of the image?",
|
||||
"<image>\nWhat is the season?",
|
||||
@@ -115,14 +122,14 @@ def different_patch_input_cases_internvl():
|
||||
|
||||
|
||||
def windows_attention_image_qwen2_5_vl():
|
||||
|
||||
# image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501
|
||||
image = ImageAsset("hato").pil_image
|
||||
|
||||
question = "Describe the image."
|
||||
img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
|
||||
prompt = (f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n")
|
||||
prompt = (
|
||||
f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n<|im_start|>assistant\n"
|
||||
)
|
||||
|
||||
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
|
||||
return build_single_image_inputs([image], [prompt], wrapped_sf)
|
||||
@@ -136,8 +143,9 @@ def video_with_metadata_glm4_1v():
|
||||
formatted_prompt = f"<|user|>\n{video_prompt}{question}<|assistant|>\n"
|
||||
|
||||
scales = [0.1, 0.2, 0.25]
|
||||
video_input = [[(rescale_video_size(video_array, scale), metadata)]
|
||||
for scale in scales]
|
||||
video_input = [
|
||||
[(rescale_video_size(video_array, scale), metadata)] for scale in scales
|
||||
]
|
||||
prompts = [formatted_prompt] * len(video_input)
|
||||
|
||||
return [
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
for manipulating the input / output of HF & vLLM test runners, which are
|
||||
typically specific to a small subset of models.
|
||||
"""
|
||||
|
||||
import types
|
||||
from pathlib import PosixPath
|
||||
from typing import Optional, Union
|
||||
@@ -15,8 +16,13 @@ import pytest
|
||||
import regex as re
|
||||
import torch
|
||||
from PIL.Image import Image
|
||||
from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
|
||||
GenerationConfig, GenerationMixin)
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
BatchFeature,
|
||||
GenerationConfig,
|
||||
GenerationMixin,
|
||||
)
|
||||
from transformers.video_utils import VideoMetadata
|
||||
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
@@ -27,8 +33,7 @@ from .types import RunnerOutput
|
||||
|
||||
|
||||
####### vLLM output processors functions
|
||||
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [blip2 models] to be comparable with hf output."""
|
||||
_, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -42,8 +47,7 @@ def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [fuyu models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -53,8 +57,8 @@ def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
|
||||
|
||||
def qwen_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
"""Sanitize vllm output [qwen models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -64,8 +68,8 @@ def qwen_vllm_to_hf_output(
|
||||
|
||||
|
||||
def qwen2_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
"""Sanitize vllm output [qwen2 models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -75,8 +79,8 @@ def qwen2_vllm_to_hf_output(
|
||||
|
||||
|
||||
def kimiv_vl_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
"""Sanitize vllm output [kimi_vl models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -85,23 +89,25 @@ def kimiv_vl_vllm_to_hf_output(
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def llava_image_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> RunnerOutput:
|
||||
config = AutoConfig.from_pretrained(model)
|
||||
mm_token_id = config.image_token_index
|
||||
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
|
||||
|
||||
|
||||
def llava_video_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
config = AutoConfig.from_pretrained(model)
|
||||
mm_token_id = config.video_token_index
|
||||
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
|
||||
|
||||
|
||||
def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
|
||||
mm_token_id: int) -> RunnerOutput:
|
||||
def _llava_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput, model: str, mm_token_id: int
|
||||
) -> RunnerOutput:
|
||||
"""Sanitize vllm output [Llava models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -109,7 +115,8 @@ def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
|
||||
eos_token_id = tokenizer.eos_token_id
|
||||
|
||||
hf_output_ids = [
|
||||
token_id for idx, token_id in enumerate(output_ids)
|
||||
token_id
|
||||
for idx, token_id in enumerate(output_ids)
|
||||
if token_id != mm_token_id or output_ids[idx - 1] != mm_token_id
|
||||
]
|
||||
|
||||
@@ -128,8 +135,9 @@ def llava_onevision_hf_model_kwargs(model: str) -> dict:
|
||||
return config.to_dict()
|
||||
|
||||
|
||||
def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def llava_onevision_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> RunnerOutput:
|
||||
"""Sanitize vllm output [llava-onevision] to compare with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -140,7 +148,8 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
eos_token_id = tokenizer.eos_token_id
|
||||
|
||||
hf_output_ids = [
|
||||
token_id for idx, token_id in enumerate(output_ids)
|
||||
token_id
|
||||
for idx, token_id in enumerate(output_ids)
|
||||
if token_id != video_token_id or output_ids[idx - 1] != video_token_id
|
||||
]
|
||||
|
||||
@@ -151,8 +160,7 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def mantis_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [mantis] to compare with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -161,8 +169,7 @@ def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [phi3v] to be comparable with hf output."""
|
||||
_, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -180,8 +187,7 @@ def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -192,7 +198,8 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
eos_token_id = tokenizer.eos_token_id
|
||||
|
||||
hf_output_ids = [
|
||||
token_id for idx, token_id in enumerate(output_ids)
|
||||
token_id
|
||||
for idx, token_id in enumerate(output_ids)
|
||||
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
|
||||
]
|
||||
|
||||
@@ -205,46 +212,40 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
|
||||
|
||||
####### Post-processors for HF outputs
|
||||
def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<|end▁of▁sentence|>"):
|
||||
output_str = output_str.split("<|end▁of▁sentence|>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def idefics3_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def idefics3_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<end_of_utterance>"):
|
||||
output_str = output_str.split("<end_of_utterance>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def smolvlm_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def smolvlm_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
# Based on Idefics3
|
||||
return idefics3_trunc_hf_output(hf_output, model)
|
||||
|
||||
|
||||
def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def minicpmv_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<|eot_id|>"):
|
||||
output_str = output_str.split("<|eot_id|>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def minimax_vl_01_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def minimax_vl_01_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<end_of_sentence>"):
|
||||
output_str = output_str.split("<end_of_sentence>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def ultravox_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def ultravox_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
@@ -262,8 +263,8 @@ def get_llava_embeddings(image_assets: ImageTestAssets):
|
||||
|
||||
####### Prompt path encoders for models that need models on disk
|
||||
def qwen_prompt_path_encoder(
|
||||
tmp_path: PosixPath, prompt: str,
|
||||
assets: Union[list[ImageAsset], ImageTestAssets]) -> str:
|
||||
tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset], ImageTestAssets]
|
||||
) -> str:
|
||||
"""Given a temporary dir path, export one or more image assets into the
|
||||
tempdir & replace its contents with the local path to the string so that
|
||||
the HF version of Qwen-VL can resolve the path and load the image in its
|
||||
@@ -313,8 +314,9 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
return BatchFeature(data=inputs, tensor_type="pt")
|
||||
|
||||
hf_model.processor = processor
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language.model.embed_tokens
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.language.model.embed_tokens
|
||||
)
|
||||
return hf_model
|
||||
|
||||
|
||||
@@ -357,11 +359,10 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
assert len(contents) == len(images)
|
||||
|
||||
return hf_processor.apply_chat_template(
|
||||
[{
|
||||
"role": "user",
|
||||
"image": image,
|
||||
"content": content
|
||||
} for image, content in zip(images, contents)],
|
||||
[
|
||||
{"role": "user", "image": image, "content": content}
|
||||
for image, content in zip(images, contents)
|
||||
],
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
@@ -369,8 +370,9 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
)
|
||||
|
||||
hf_model.processor = processor
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.transformer.output_layer
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.transformer.output_layer
|
||||
)
|
||||
return hf_model
|
||||
|
||||
|
||||
@@ -387,10 +389,9 @@ def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
else:
|
||||
video_metadata = None
|
||||
|
||||
return hf_processor(*args,
|
||||
videos=videos,
|
||||
video_metadata=video_metadata,
|
||||
**kwargs)
|
||||
return hf_processor(
|
||||
*args, videos=videos, video_metadata=video_metadata, **kwargs
|
||||
)
|
||||
|
||||
hf_model.processor = processor
|
||||
return hf_model
|
||||
@@ -406,8 +407,9 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
self.num_image_token = hf_runner.model.num_image_token
|
||||
self.tokenizer = hf_runner.tokenizer
|
||||
|
||||
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
|
||||
trust_remote_code=True)
|
||||
self.config = AutoConfig.from_pretrained(
|
||||
hf_runner.model_name, trust_remote_code=True
|
||||
)
|
||||
self.vision_config = self.config.vision_config
|
||||
self.use_thumbnail = self.config.use_thumbnail
|
||||
self.use_msac = self.config.use_msac
|
||||
@@ -415,11 +417,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
self.max_num = self.config.max_dynamic_patch
|
||||
self.image_size = self.vision_config.image_size
|
||||
|
||||
def __call__(self, text: str, images: Union[Image, list[Image]],
|
||||
**kwargs):
|
||||
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
|
||||
# yapf: disable
|
||||
from vllm.model_executor.models.h2ovl import (
|
||||
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl)
|
||||
IMG_CONTEXT,
|
||||
IMG_END,
|
||||
IMG_START,
|
||||
image_to_pixel_values_h2ovl,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
@@ -431,29 +436,26 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
max_num=self.max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
use_msac=self.use_msac,
|
||||
) for image in images
|
||||
]
|
||||
num_patches_list = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
|
||||
pixel_values = torch.cat(pixel_values, dim=0)
|
||||
for num_patches in num_patches_list:
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token \
|
||||
* num_patches
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
|
||||
image_tokens = IMG_START + context_tokens + IMG_END
|
||||
text = text.replace('<image>', image_tokens, 1)
|
||||
text = text.replace("<image>", image_tokens, 1)
|
||||
prompt = self.tokenizer(text, return_tensors="pt")
|
||||
prompt.update({"pixel_values": pixel_values})
|
||||
return prompt
|
||||
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
|
||||
"<IMG_CONTEXT>")
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
|
||||
hf_model.model.img_context_token_id = img_context_token_id
|
||||
hf_model.processor = H2OVLProcessor(hf_model)
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language_model.get_output_embeddings()
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate,
|
||||
hf_model.model)
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.language_model.get_output_embeddings()
|
||||
)
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
|
||||
return hf_model
|
||||
|
||||
|
||||
@@ -467,19 +469,23 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
self.num_image_token = hf_runner.model.num_image_token
|
||||
self.tokenizer = hf_runner.tokenizer
|
||||
|
||||
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
|
||||
trust_remote_code=True)
|
||||
self.config = AutoConfig.from_pretrained(
|
||||
hf_runner.model_name, trust_remote_code=True
|
||||
)
|
||||
self.vision_config = self.config.vision_config
|
||||
self.use_thumbnail = self.config.use_thumbnail
|
||||
self.min_num = self.config.min_dynamic_patch
|
||||
self.max_num = self.config.max_dynamic_patch
|
||||
self.image_size = self.vision_config.image_size
|
||||
|
||||
def __call__(self, text: str, images: Union[Image, list[Image]],
|
||||
**kwargs):
|
||||
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
|
||||
from vllm.model_executor.models.skyworkr1v import (
|
||||
IMG_CONTEXT, IMG_END, IMG_START,
|
||||
image_to_pixel_values_skyworkr1v)
|
||||
IMG_CONTEXT,
|
||||
IMG_END,
|
||||
IMG_START,
|
||||
image_to_pixel_values_skyworkr1v,
|
||||
)
|
||||
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
pixel_values = [
|
||||
image_to_pixel_values_skyworkr1v(
|
||||
@@ -488,29 +494,26 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
min_num=self.min_num,
|
||||
max_num=self.max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
) for image in images
|
||||
]
|
||||
num_patches_list = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
|
||||
pixel_values = torch.cat(pixel_values, dim=0)
|
||||
for num_patches in num_patches_list:
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token \
|
||||
* num_patches
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
|
||||
image_tokens = IMG_START + context_tokens + IMG_END
|
||||
text = text.replace('<image>', image_tokens, 1)
|
||||
text = text.replace("<image>", image_tokens, 1)
|
||||
prompt = self.tokenizer(text, return_tensors="pt")
|
||||
prompt.update({"pixel_values": pixel_values})
|
||||
return prompt
|
||||
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
|
||||
"<IMG_CONTEXT>")
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
|
||||
hf_model.model.img_context_token_id = img_context_token_id
|
||||
hf_model.processor = SkyworkR1VProcessor(hf_model)
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language_model.get_output_embeddings()
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate,
|
||||
hf_model.model)
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.language_model.get_output_embeddings()
|
||||
)
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
|
||||
return hf_model
|
||||
|
||||
|
||||
@@ -524,8 +527,9 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
self.num_image_token = hf_runner.model.num_image_token
|
||||
self.tokenizer = hf_runner.tokenizer
|
||||
|
||||
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
|
||||
trust_remote_code=True)
|
||||
self.config = AutoConfig.from_pretrained(
|
||||
hf_runner.model_name, trust_remote_code=True
|
||||
)
|
||||
self.vision_config = self.config.vision_config
|
||||
self.use_thumbnail = self.config.use_thumbnail
|
||||
self.min_num = self.config.min_dynamic_patch
|
||||
@@ -540,8 +544,13 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
**kwargs,
|
||||
):
|
||||
from vllm.model_executor.models.internvl import (
|
||||
IMG_CONTEXT, IMG_END, IMG_START,
|
||||
image_to_pixel_values_internvl, video_to_pixel_values_internvl)
|
||||
IMG_CONTEXT,
|
||||
IMG_END,
|
||||
IMG_START,
|
||||
image_to_pixel_values_internvl,
|
||||
video_to_pixel_values_internvl,
|
||||
)
|
||||
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
videos = [videos] if isinstance(videos, np.ndarray) else videos
|
||||
if images is not None:
|
||||
@@ -552,7 +561,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
min_num=self.min_num,
|
||||
max_num=self.max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
) for image in images
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
num_patches_images = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values_images
|
||||
@@ -568,7 +578,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
min_num=1,
|
||||
max_num=1,
|
||||
use_thumbnail=False,
|
||||
) for video in videos
|
||||
)
|
||||
for video in videos
|
||||
]
|
||||
num_patches_videos = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values_videos
|
||||
@@ -580,38 +591,37 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
while ("<image>" in text) or ("<video>" in text):
|
||||
image_index = text.find("<image>")
|
||||
video_index = text.find("<video>")
|
||||
if image_index == -1 or (video_index > -1
|
||||
and video_index < image_index):
|
||||
if image_index == -1 or (
|
||||
video_index > -1 and video_index < image_index
|
||||
):
|
||||
num_patches = num_patches_videos.pop(0)
|
||||
pixel_values.append(pixel_values_videos.pop(0))
|
||||
context_tokens = IMG_START + \
|
||||
IMG_CONTEXT * self.num_image_token + IMG_END
|
||||
video_tokens = ''.join([
|
||||
f'Frame{i+1}: {context_tokens}'
|
||||
for i in range(num_patches)
|
||||
])
|
||||
text = text.replace('<video>', video_tokens, 1)
|
||||
context_tokens = (
|
||||
IMG_START + IMG_CONTEXT * self.num_image_token + IMG_END
|
||||
)
|
||||
video_tokens = "".join(
|
||||
[f"Frame{i + 1}: {context_tokens}" for i in range(num_patches)]
|
||||
)
|
||||
text = text.replace("<video>", video_tokens, 1)
|
||||
else:
|
||||
num_patches = num_patches_images.pop(0)
|
||||
pixel_values.append(pixel_values_images.pop(0))
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token \
|
||||
* num_patches
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
|
||||
image_tokens = IMG_START + context_tokens + IMG_END
|
||||
text = text.replace('<image>', image_tokens, 1)
|
||||
text = text.replace("<image>", image_tokens, 1)
|
||||
pixel_values = torch.cat(pixel_values, dim=0)
|
||||
|
||||
prompt = self.tokenizer(text, return_tensors="pt")
|
||||
prompt.update({"pixel_values": pixel_values})
|
||||
return prompt
|
||||
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
|
||||
"<IMG_CONTEXT>")
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
|
||||
hf_model.model.img_context_token_id = img_context_token_id
|
||||
hf_model.processor = InternVLProcessor(hf_model)
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language_model.get_output_embeddings()
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate,
|
||||
hf_model.model)
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.language_model.get_output_embeddings()
|
||||
)
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
|
||||
return hf_model
|
||||
|
||||
|
||||
@@ -631,7 +641,7 @@ def _internvl_generate(
|
||||
input_embeds = input_embeds.reshape(B * N, C)
|
||||
|
||||
input_ids = input_ids.reshape(B * N)
|
||||
selected = (input_ids == self.img_context_token_id)
|
||||
selected = input_ids == self.img_context_token_id
|
||||
assert selected.sum() != 0
|
||||
input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
|
||||
|
||||
@@ -778,8 +788,9 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
|
||||
def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for Ovis2."""
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.llm.get_output_embeddings()
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.llm.get_output_embeddings()
|
||||
)
|
||||
|
||||
def processor(*args, text="", images=None, **kwargs):
|
||||
text_tokenizer = hf_model.model.get_text_tokenizer()
|
||||
@@ -787,8 +798,7 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
|
||||
prompt_start_and_end = {
|
||||
"qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
|
||||
"llama":
|
||||
("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
|
||||
"llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
|
||||
"gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
|
||||
}
|
||||
for start, end in prompt_start_and_end.values():
|
||||
@@ -797,7 +807,8 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
break
|
||||
|
||||
prompt, input_ids, pixel_values = hf_model.model.preprocess_inputs(
|
||||
text_or_conversations=text, images=images)
|
||||
text_or_conversations=text, images=images
|
||||
)
|
||||
attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)
|
||||
|
||||
inputs = {
|
||||
@@ -813,8 +824,9 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
|
||||
def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for Ovis2."""
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.llm.get_output_embeddings()
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.llm.get_output_embeddings()
|
||||
)
|
||||
|
||||
def processor(*args, text="", images=None, videos=None, **kwargs):
|
||||
if images is None:
|
||||
@@ -825,13 +837,11 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
videos = []
|
||||
else:
|
||||
videos = [videos] if isinstance(videos, np.ndarray) else videos
|
||||
videos = [[PIL.Image.fromarray(frame) for frame in vid]
|
||||
for vid in videos]
|
||||
videos = [[PIL.Image.fromarray(frame) for frame in vid] for vid in videos]
|
||||
|
||||
prompt_start_and_end = {
|
||||
"qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
|
||||
"llama":
|
||||
("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
|
||||
"llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
|
||||
"gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
|
||||
}
|
||||
for start, end in prompt_start_and_end.values():
|
||||
@@ -842,21 +852,20 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
images_message = [{"type": "image", "image": img} for img in images]
|
||||
videos_message = [{"type": "video", "video": vid} for vid in videos]
|
||||
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
*images_message,
|
||||
*videos_message,
|
||||
{
|
||||
"type": "text",
|
||||
"text": text
|
||||
},
|
||||
],
|
||||
}]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*images_message,
|
||||
*videos_message,
|
||||
{"type": "text", "text": text},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
input_ids, pixel_values, grid_thws = hf_model.model.preprocess_inputs(
|
||||
messages=messages, enable_thinking=True)
|
||||
messages=messages, enable_thinking=True
|
||||
)
|
||||
inputs = {
|
||||
"inputs": input_ids,
|
||||
"pixel_values": pixel_values,
|
||||
|
||||
@@ -3,23 +3,34 @@
|
||||
"""Entrypoints for wrapping the core run_test implementation for specific test
|
||||
types / modalities.
|
||||
"""
|
||||
|
||||
from pathlib import PosixPath
|
||||
|
||||
from .....conftest import (AudioTestAssets, HfRunner, ImageTestAssets,
|
||||
VideoTestAssets, VllmRunner)
|
||||
from .....conftest import (
|
||||
AudioTestAssets,
|
||||
HfRunner,
|
||||
ImageTestAssets,
|
||||
VideoTestAssets,
|
||||
VllmRunner,
|
||||
)
|
||||
from . import builders, core
|
||||
from .types import ExpandableVLMTestArgs, VLMTestInfo
|
||||
|
||||
|
||||
####### Entrypoints for running different test types
|
||||
def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets):
|
||||
def run_single_image_test(
|
||||
*,
|
||||
tmp_path: PosixPath,
|
||||
model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
assert test_case.size_wrapper is not None
|
||||
inputs = builders.build_single_image_inputs_from_test_info(
|
||||
model_test_info, image_assets, test_case.size_wrapper, tmp_path)
|
||||
model_test_info, image_assets, test_case.size_wrapper, tmp_path
|
||||
)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
@@ -31,17 +42,23 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets):
|
||||
def run_multi_image_test(
|
||||
*,
|
||||
tmp_path: PosixPath,
|
||||
model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
assert test_case.size_wrapper is not None
|
||||
inputs = builders.build_multi_image_inputs_from_test_info(
|
||||
model_test_info, image_assets, test_case.size_wrapper, tmp_path)
|
||||
model_test_info, image_assets, test_case.size_wrapper, tmp_path
|
||||
)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
@@ -53,17 +70,22 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"image": len(image_assets)},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
def run_embedding_test(*, model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets):
|
||||
def run_embedding_test(
|
||||
*,
|
||||
model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
assert test_case.size_wrapper is not None
|
||||
inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
|
||||
model_test_info, image_assets, test_case.size_wrapper)
|
||||
model_test_info, image_assets, test_case.size_wrapper
|
||||
)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
@@ -76,7 +98,8 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
vllm_embeddings=vllm_embeddings,
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
def run_video_test(
|
||||
@@ -90,8 +113,11 @@ def run_video_test(
|
||||
assert test_case.size_wrapper is not None
|
||||
assert test_case.num_video_frames is not None
|
||||
inputs = builders.build_video_inputs_from_test_info(
|
||||
model_test_info, video_assets, test_case.size_wrapper,
|
||||
test_case.num_video_frames)
|
||||
model_test_info,
|
||||
video_assets,
|
||||
test_case.size_wrapper,
|
||||
test_case.num_video_frames,
|
||||
)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
@@ -103,7 +129,8 @@ def run_video_test(
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"video": len(video_assets)},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
def run_audio_test(
|
||||
@@ -114,8 +141,7 @@ def run_audio_test(
|
||||
vllm_runner: type[VllmRunner],
|
||||
audio_assets: AudioTestAssets,
|
||||
):
|
||||
inputs = builders.build_audio_inputs_from_test_info(
|
||||
model_test_info, audio_assets)
|
||||
inputs = builders.build_audio_inputs_from_test_info(model_test_info, audio_assets)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
@@ -127,13 +153,17 @@ def run_audio_test(
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"audio": 1},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner]):
|
||||
def run_custom_inputs_test(
|
||||
*,
|
||||
model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
):
|
||||
# Custom test cases can provide inputs directly, but they need to
|
||||
# explicitly provided a CustomTestConfig, which wraps the inputs and
|
||||
# the limit_mm_per_prompt
|
||||
@@ -155,4 +185,5 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Types for writing multimodal model tests."""
|
||||
|
||||
from collections.abc import Iterable
|
||||
from enum import Enum
|
||||
from pathlib import PosixPath
|
||||
@@ -15,9 +16,16 @@ from vllm.config import RunnerOption
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
|
||||
from .....conftest import (AUDIO_ASSETS, IMAGE_ASSETS, HfRunner, ImageAsset,
|
||||
ImageTestAssets, PromptAudioInput, PromptImageInput,
|
||||
PromptVideoInput)
|
||||
from .....conftest import (
|
||||
AUDIO_ASSETS,
|
||||
IMAGE_ASSETS,
|
||||
HfRunner,
|
||||
ImageAsset,
|
||||
ImageTestAssets,
|
||||
PromptAudioInput,
|
||||
PromptImageInput,
|
||||
PromptVideoInput,
|
||||
)
|
||||
from ....utils import check_logprobs_close
|
||||
|
||||
# meta image tag; will be replaced by the appropriate tag for the model
|
||||
@@ -47,6 +55,7 @@ RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
|
||||
|
||||
class PromptWithMultiModalInput(NamedTuple):
|
||||
"""Holds the multimodal input for a single test case."""
|
||||
|
||||
prompts: list[str]
|
||||
image_data: Optional[PromptImageInput] = None
|
||||
video_data: Optional[PromptVideoInput] = None
|
||||
@@ -100,8 +109,9 @@ class VLMTestInfo(NamedTuple):
|
||||
|
||||
# Function for converting ImageAssets to image embeddings;
|
||||
# We need to define this explicitly for embedding tests
|
||||
convert_assets_to_embeddings: Optional[Callable[[ImageTestAssets],
|
||||
list[torch.Tensor]]] = None
|
||||
convert_assets_to_embeddings: Optional[
|
||||
Callable[[ImageTestAssets], list[torch.Tensor]]
|
||||
] = None
|
||||
|
||||
# Exposed options for vLLM runner; we change these in a several tests,
|
||||
# but the defaults are derived from VllmRunner & the engine defaults
|
||||
@@ -156,8 +166,8 @@ class VLMTestInfo(NamedTuple):
|
||||
# for Qwen-VL, which requires encoding the image path / url into the prompt
|
||||
# for HF runner
|
||||
prompt_path_encoder: Optional[
|
||||
Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]],
|
||||
str]] = None # noqa: E501
|
||||
Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]], str]
|
||||
] = None # noqa: E501
|
||||
|
||||
# Allows configuring a test to run with custom inputs
|
||||
custom_test_opts: Optional[list[CustomTestOptions]] = None
|
||||
@@ -190,6 +200,7 @@ class VLMTestInfo(NamedTuple):
|
||||
|
||||
class ExpandableVLMTestArgs(NamedTuple):
|
||||
"""The expanded kwargs which correspond to a single test case."""
|
||||
|
||||
model: str
|
||||
max_tokens: int
|
||||
num_logprobs: int
|
||||
|
||||
@@ -12,10 +12,12 @@ HF_TEXT_PROMPTS = [
|
||||
"a photo of a cherry blossom",
|
||||
]
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign": "",
|
||||
"cherry_blossom": "",
|
||||
})
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "",
|
||||
"cherry_blossom": "",
|
||||
}
|
||||
)
|
||||
|
||||
MODELS = ["openai/clip-vit-base-patch32"]
|
||||
|
||||
@@ -33,11 +35,9 @@ def _run_test(
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
with vllm_runner(model,
|
||||
runner="pooling",
|
||||
dtype=dtype,
|
||||
enforce_eager=True,
|
||||
max_model_len=77) as vllm_model:
|
||||
with vllm_runner(
|
||||
model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=77
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.embed(input_texts, images=input_images)
|
||||
|
||||
with hf_runner(model, dtype=dtype, auto_cls=CLIPModel) as hf_model:
|
||||
@@ -48,10 +48,12 @@ def _run_test(
|
||||
if "pixel_values" in inputs:
|
||||
inputs.pop("input_ids")
|
||||
pooled_output = hf_model.model.get_image_features(
|
||||
**hf_model.wrap_device(inputs)).squeeze(0)
|
||||
**hf_model.wrap_device(inputs)
|
||||
).squeeze(0)
|
||||
else:
|
||||
pooled_output = hf_model.model.get_text_features(
|
||||
**hf_model.wrap_device(inputs)).squeeze(0)
|
||||
**hf_model.wrap_device(inputs)
|
||||
).squeeze(0)
|
||||
|
||||
all_outputs.append(pooled_output.tolist())
|
||||
|
||||
@@ -98,8 +100,7 @@ def test_models_image(
|
||||
dtype: str,
|
||||
) -> None:
|
||||
input_texts_images = [
|
||||
(text, asset.pil_image)
|
||||
for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
|
||||
(text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
|
||||
]
|
||||
input_texts = [text for text, _ in input_texts_images]
|
||||
input_images = [image for _, image in input_texts_images]
|
||||
@@ -125,11 +126,9 @@ def test_models_text_image_no_crash(
|
||||
texts = [HF_TEXT_PROMPTS[0]]
|
||||
images = [image_assets[0].pil_image]
|
||||
|
||||
with vllm_runner(model,
|
||||
runner="pooling",
|
||||
dtype=dtype,
|
||||
enforce_eager=True,
|
||||
max_model_len=77) as vllm_model:
|
||||
with vllm_runner(
|
||||
model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=77
|
||||
) as vllm_model:
|
||||
with pytest.raises(ValueError, match="not both"):
|
||||
vllm_model.embed(texts, images=images)
|
||||
|
||||
|
||||
@@ -17,18 +17,21 @@ HF_TEXT_PROMPTS = [
|
||||
# T -> X
|
||||
(
|
||||
"Query: Find me an everyday image that matches the given caption: The label of the object is stop sign", # noqa: E501,
|
||||
Image.new("RGB", (56, 56))),
|
||||
Image.new("RGB", (56, 56)),
|
||||
),
|
||||
# T -> X
|
||||
("Query: Retrieve an image of this caption: cherry blossom",
|
||||
Image.new("RGB", (56, 56))),
|
||||
(
|
||||
"Query: Retrieve an image of this caption: cherry blossom",
|
||||
Image.new("RGB", (56, 56)),
|
||||
),
|
||||
]
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
"What is shown in this image?",
|
||||
"cherry_blossom":
|
||||
"What is shown in this image?"
|
||||
})
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "What is shown in this image?",
|
||||
"cherry_blossom": "What is shown in this image?",
|
||||
}
|
||||
)
|
||||
|
||||
MODELS = ["MrLight/dse-qwen2-2b-mrl-v1"]
|
||||
|
||||
@@ -36,34 +39,30 @@ MODELS = ["MrLight/dse-qwen2-2b-mrl-v1"]
|
||||
def get_messages(image: Image.Image, text: str, embed_text: bool):
|
||||
# assert False, 'remember to use outer [] as required'
|
||||
if embed_text:
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image",
|
||||
"image": Image.new("RGB", (56, 56)),
|
||||
"resized_height": 1,
|
||||
"resized_width": 1
|
||||
}, # need a dummy image here for an easier process.
|
||||
{
|
||||
"type": "text",
|
||||
"text": text
|
||||
},
|
||||
]
|
||||
}]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "image",
|
||||
"image": Image.new("RGB", (56, 56)),
|
||||
"resized_height": 1,
|
||||
"resized_width": 1,
|
||||
}, # need a dummy image here for an easier process.
|
||||
{"type": "text", "text": text},
|
||||
],
|
||||
}
|
||||
]
|
||||
else:
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type": "image",
|
||||
"image": image
|
||||
}, {
|
||||
"type": "text",
|
||||
"text": text
|
||||
}]
|
||||
}]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{"type": "image", "image": image},
|
||||
{"type": "text", "text": text},
|
||||
],
|
||||
}
|
||||
]
|
||||
return messages
|
||||
|
||||
|
||||
@@ -71,8 +70,10 @@ def apply_chat_template_and_add_eos(
|
||||
messages: list[dict],
|
||||
apply_chat_template_fn: Callable,
|
||||
):
|
||||
prompt = apply_chat_template_fn(
|
||||
messages, tokenize=False, add_generation_prompt=True) + "<|endoftext|>"
|
||||
prompt = (
|
||||
apply_chat_template_fn(messages, tokenize=False, add_generation_prompt=True)
|
||||
+ "<|endoftext|>"
|
||||
)
|
||||
return prompt
|
||||
|
||||
|
||||
@@ -86,16 +87,14 @@ def _run_test(
|
||||
*,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
'''SET PYTHONPATH'''
|
||||
"""SET PYTHONPATH"""
|
||||
# NOTE: take care of the order. run vLLM first, and then run HF.
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
with vllm_runner(model,
|
||||
runner="pooling",
|
||||
dtype=dtype,
|
||||
enforce_eager=True,
|
||||
max_model_len=8192) as vllm_model:
|
||||
with vllm_runner(
|
||||
model, runner="pooling", dtype=dtype, enforce_eager=True, max_model_len=8192
|
||||
) as vllm_model:
|
||||
tokenizer = vllm_model.llm.get_tokenizer()
|
||||
texts = [
|
||||
# this is necessary because vllm_model.embed will not apply any
|
||||
@@ -105,25 +104,25 @@ def _run_test(
|
||||
apply_chat_template_and_add_eos(
|
||||
get_messages(image, text, False),
|
||||
apply_chat_template_fn=tokenizer.apply_chat_template,
|
||||
) for text, image in zip(input_texts, input_images)
|
||||
)
|
||||
for text, image in zip(input_texts, input_images)
|
||||
# vllm will replace the pad token with the actual image,
|
||||
# which may be a placeholder image, later.
|
||||
]
|
||||
vllm_outputs = vllm_model.embed(texts, images=input_images)
|
||||
|
||||
hf_outputs = []
|
||||
with hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=Qwen2VLForConditionalGeneration) as hf_model:
|
||||
|
||||
with hf_runner(
|
||||
model, dtype=dtype, auto_cls=Qwen2VLForConditionalGeneration
|
||||
) as hf_model:
|
||||
prompts = []
|
||||
for text, image, embed_text in zip(input_texts, input_images,
|
||||
embed_texts):
|
||||
for text, image, embed_text in zip(input_texts, input_images, embed_texts):
|
||||
# dse requires non-standard input processing
|
||||
# because it needs an image_pad token
|
||||
messages = get_messages(image, text, embed_text)
|
||||
prompt = apply_chat_template_and_add_eos(
|
||||
messages, hf_model.processor.apply_chat_template)
|
||||
messages, hf_model.processor.apply_chat_template
|
||||
)
|
||||
|
||||
prompts.append(prompt)
|
||||
|
||||
@@ -145,9 +144,9 @@ def _run_test(
|
||||
return_dict=True,
|
||||
output_hidden_states=True,
|
||||
)
|
||||
pooled_output = F.normalize(outputs.hidden_states[-1][0, -1],
|
||||
p=2,
|
||||
dim=-1)
|
||||
pooled_output = F.normalize(
|
||||
outputs.hidden_states[-1][0, -1], p=2, dim=-1
|
||||
)
|
||||
|
||||
all_outputs.append(pooled_output.tolist())
|
||||
|
||||
@@ -170,8 +169,9 @@ def test_models_text(
|
||||
model: str,
|
||||
dtype: str,
|
||||
) -> None:
|
||||
input_texts_images = [(text, image_placeholder)
|
||||
for text, image_placeholder in HF_TEXT_PROMPTS]
|
||||
input_texts_images = [
|
||||
(text, image_placeholder) for text, image_placeholder in HF_TEXT_PROMPTS
|
||||
]
|
||||
input_texts = [text for text, _ in input_texts_images]
|
||||
input_images = [image for _, image in input_texts_images]
|
||||
embed_texts = [True] * len(input_texts)
|
||||
@@ -198,8 +198,7 @@ def test_models_image(
|
||||
dtype: str,
|
||||
) -> None:
|
||||
input_texts_images = [
|
||||
(text, asset.pil_image)
|
||||
for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
|
||||
(text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
|
||||
]
|
||||
input_texts = [text for text, _ in input_texts_images]
|
||||
input_images = [image for _, image in input_texts_images]
|
||||
|
||||
@@ -29,7 +29,7 @@ def run_intern_vit_test(
|
||||
img_processor = CLIPImageProcessor.from_pretrained(model)
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
pixel_values = [
|
||||
img_processor(images, return_tensors='pt').pixel_values.to(torch_dtype)
|
||||
img_processor(images, return_tensors="pt").pixel_values.to(torch_dtype)
|
||||
for images in images
|
||||
]
|
||||
|
||||
@@ -37,15 +37,16 @@ def run_intern_vit_test(
|
||||
if not getattr(config, "norm_type", None):
|
||||
config.norm_type = "rms_norm"
|
||||
|
||||
hf_model = AutoModel.from_pretrained(model,
|
||||
torch_dtype=torch_dtype,
|
||||
trust_remote_code=True).to("cuda")
|
||||
hf_model = AutoModel.from_pretrained(
|
||||
model, torch_dtype=torch_dtype, trust_remote_code=True
|
||||
).to("cuda")
|
||||
hf_outputs_per_image = [
|
||||
hf_model(pixel_value.to("cuda")).last_hidden_state
|
||||
for pixel_value in pixel_values
|
||||
]
|
||||
|
||||
from vllm.model_executor.models.intern_vit import InternVisionModel
|
||||
|
||||
vllm_model = InternVisionModel(config)
|
||||
vllm_model.load_weights(hf_model.state_dict().items())
|
||||
|
||||
@@ -54,22 +55,23 @@ def run_intern_vit_test(
|
||||
|
||||
vllm_model = vllm_model.to("cuda", torch_dtype)
|
||||
vllm_outputs_per_image = [
|
||||
vllm_model(pixel_values=pixel_value.to("cuda"))
|
||||
for pixel_value in pixel_values
|
||||
vllm_model(pixel_values=pixel_value.to("cuda")) for pixel_value in pixel_values
|
||||
]
|
||||
del vllm_model
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
cos_similar = nn.CosineSimilarity(dim=-1)
|
||||
for vllm_output, hf_output in zip(vllm_outputs_per_image,
|
||||
hf_outputs_per_image):
|
||||
for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image):
|
||||
assert cos_similar(vllm_output, hf_output).mean() > 0.99
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", [
|
||||
"OpenGVLab/InternViT-300M-448px",
|
||||
"OpenGVLab/InternViT-6B-448px-V1-5",
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"model_id",
|
||||
[
|
||||
"OpenGVLab/InternViT-300M-448px",
|
||||
"OpenGVLab/InternViT-6B-448px-V1-5",
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_models(dist_init, image_assets, model_id, dtype: str) -> None:
|
||||
run_intern_vit_test(
|
||||
|
||||
@@ -29,7 +29,6 @@ def vllm_reranker(
|
||||
query_type: str = "text",
|
||||
doc_type: str = "text",
|
||||
):
|
||||
|
||||
def create_image_param(url: str) -> ChatCompletionContentPartImageParam:
|
||||
return {"type": "image_url", "image_url": {"url": f"{url}"}}
|
||||
|
||||
@@ -38,23 +37,25 @@ def vllm_reranker(
|
||||
query = query_strs
|
||||
elif query_type == "image":
|
||||
query = ScoreMultiModalParam(
|
||||
content=[create_image_param(url) for url in query_strs])
|
||||
content=[create_image_param(url) for url in query_strs]
|
||||
)
|
||||
|
||||
documents: Union[list[str], ScoreMultiModalParam]
|
||||
if doc_type == "text":
|
||||
documents = document_strs
|
||||
elif doc_type == "image":
|
||||
documents = ScoreMultiModalParam(
|
||||
content=[create_image_param(url) for url in document_strs])
|
||||
content=[create_image_param(url) for url in document_strs]
|
||||
)
|
||||
|
||||
with vllm_runner(
|
||||
model_name,
|
||||
runner="pooling",
|
||||
dtype=dtype,
|
||||
max_num_seqs=2,
|
||||
max_model_len=2048,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
model_name,
|
||||
runner="pooling",
|
||||
dtype=dtype,
|
||||
max_num_seqs=2,
|
||||
max_model_len=2048,
|
||||
mm_processor_kwargs=mm_processor_kwargs,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
) as vllm_model:
|
||||
outputs = vllm_model.llm.score(query, documents)
|
||||
|
||||
@@ -78,16 +79,15 @@ def hf_reranker(
|
||||
data_pairs = [[query_strs[0], d] for d in document_strs]
|
||||
|
||||
with hf_runner(
|
||||
model_name,
|
||||
dtype=dtype,
|
||||
trust_remote_code=True,
|
||||
auto_cls=AutoModel,
|
||||
model_kwargs={"key_mapping": checkpoint_to_hf_mapper},
|
||||
model_name,
|
||||
dtype=dtype,
|
||||
trust_remote_code=True,
|
||||
auto_cls=AutoModel,
|
||||
model_kwargs={"key_mapping": checkpoint_to_hf_mapper},
|
||||
) as hf_model:
|
||||
return hf_model.model.compute_score(data_pairs,
|
||||
max_length=2048,
|
||||
query_type=query_type,
|
||||
doc_type=doc_type)
|
||||
return hf_model.model.compute_score(
|
||||
data_pairs, max_length=2048, query_type=query_type, doc_type=doc_type
|
||||
)
|
||||
|
||||
|
||||
# Visual Documents Reranking
|
||||
@@ -100,10 +100,12 @@ def test_model_text_image(hf_runner, vllm_runner, model_name, dtype):
|
||||
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
|
||||
]
|
||||
|
||||
hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
|
||||
"text", "image")
|
||||
vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
|
||||
documents, "text", "image")
|
||||
hf_outputs = hf_reranker(
|
||||
hf_runner, model_name, dtype, query, documents, "text", "image"
|
||||
)
|
||||
vllm_outputs = vllm_reranker(
|
||||
vllm_runner, model_name, dtype, query, documents, "text", "image"
|
||||
)
|
||||
|
||||
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
|
||||
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
|
||||
@@ -127,10 +129,12 @@ def test_model_text_text(hf_runner, vllm_runner, model_name, dtype):
|
||||
lower computational requirements.""", # noqa: E501
|
||||
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
|
||||
]
|
||||
hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
|
||||
"text", "text")
|
||||
vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
|
||||
documents, "text", "text")
|
||||
hf_outputs = hf_reranker(
|
||||
hf_runner, model_name, dtype, query, documents, "text", "text"
|
||||
)
|
||||
vllm_outputs = vllm_reranker(
|
||||
vllm_runner, model_name, dtype, query, documents, "text", "text"
|
||||
)
|
||||
|
||||
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
|
||||
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
|
||||
@@ -157,10 +161,12 @@ def test_model_image_text(hf_runner, vllm_runner, model_name, dtype):
|
||||
"数据提取么?为什么不用正则啊,你用正则不就全解决了么?",
|
||||
]
|
||||
|
||||
hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
|
||||
"image", "text")
|
||||
vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
|
||||
documents, "image", "text")
|
||||
hf_outputs = hf_reranker(
|
||||
hf_runner, model_name, dtype, query, documents, "image", "text"
|
||||
)
|
||||
vllm_outputs = vllm_reranker(
|
||||
vllm_runner, model_name, dtype, query, documents, "image", "text"
|
||||
)
|
||||
|
||||
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
|
||||
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
|
||||
@@ -178,10 +184,12 @@ def test_model_image_image(hf_runner, vllm_runner, model_name, dtype):
|
||||
"https://raw.githubusercontent.com/jina-ai/multimodal-reranker-test/main/paper-11.png",
|
||||
]
|
||||
|
||||
hf_outputs = hf_reranker(hf_runner, model_name, dtype, query, documents,
|
||||
"image", "image")
|
||||
vllm_outputs = vllm_reranker(vllm_runner, model_name, dtype, query,
|
||||
documents, "image", "image")
|
||||
hf_outputs = hf_reranker(
|
||||
hf_runner, model_name, dtype, query, documents, "image", "image"
|
||||
)
|
||||
vllm_outputs = vllm_reranker(
|
||||
vllm_runner, model_name, dtype, query, documents, "image", "image"
|
||||
)
|
||||
|
||||
assert hf_outputs[0] == pytest.approx(vllm_outputs[0], rel=0.02)
|
||||
assert hf_outputs[1] == pytest.approx(vllm_outputs[1], rel=0.02)
|
||||
|
||||
@@ -24,9 +24,10 @@ from ...utils import check_embeddings_close
|
||||
# built with LAPACK support.
|
||||
pytestmark = pytest.mark.skipif(
|
||||
not current_platform.is_cuda(),
|
||||
reason="Llava Next model uses op that is only supported in CUDA")
|
||||
reason="Llava Next model uses op that is only supported in CUDA",
|
||||
)
|
||||
|
||||
llama3_template = '<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n' # noqa: E501
|
||||
llama3_template = "<|start_header_id|>user<|end_header_id|>\n\n{}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n \n" # noqa: E501
|
||||
|
||||
HF_TEXT_PROMPTS = [
|
||||
# T -> X
|
||||
@@ -34,18 +35,21 @@ HF_TEXT_PROMPTS = [
|
||||
"The label of the object is stop sign\nSummary above sentence in one word: " # noqa: E501
|
||||
),
|
||||
# T -> X
|
||||
llama3_template.format(
|
||||
"cherry blossom\nSummary above sentence in one word: "),
|
||||
llama3_template.format("cherry blossom\nSummary above sentence in one word: "),
|
||||
]
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
# I -> X
|
||||
"stop_sign":
|
||||
llama3_template.format("<image>\nSummary above image in one word: "),
|
||||
# I -> X
|
||||
"cherry_blossom":
|
||||
llama3_template.format("<image>\nSummary above image in one word: "),
|
||||
})
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
# I -> X
|
||||
"stop_sign": llama3_template.format(
|
||||
"<image>\nSummary above image in one word: "
|
||||
),
|
||||
# I -> X
|
||||
"cherry_blossom": llama3_template.format(
|
||||
"<image>\nSummary above image in one word: "
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
MODELS = ["royokong/e5-v"]
|
||||
|
||||
@@ -63,23 +67,22 @@ def _run_test(
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
with vllm_runner(model,
|
||||
runner="pooling",
|
||||
dtype=dtype,
|
||||
max_model_len=4096,
|
||||
enforce_eager=True) as vllm_model:
|
||||
with vllm_runner(
|
||||
model, runner="pooling", dtype=dtype, max_model_len=4096, enforce_eager=True
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.embed(input_texts, images=input_images)
|
||||
|
||||
with hf_runner(model, dtype=dtype,
|
||||
auto_cls=AutoModelForImageTextToText) as hf_model:
|
||||
with hf_runner(
|
||||
model, dtype=dtype, auto_cls=AutoModelForImageTextToText
|
||||
) as hf_model:
|
||||
# Patch the issue where generation_config.json is missing
|
||||
hf_model.processor.patch_size = \
|
||||
hf_model.model.config.vision_config.patch_size
|
||||
hf_model.processor.patch_size = hf_model.model.config.vision_config.patch_size
|
||||
|
||||
# Patch the issue where image_token_id
|
||||
# exceeds the maximum allowed vocab size
|
||||
hf_model.model.resize_token_embeddings(
|
||||
hf_model.model.language_model.vocab_size + 1)
|
||||
hf_model.model.language_model.vocab_size + 1
|
||||
)
|
||||
|
||||
all_inputs = hf_model.get_inputs(input_texts, images=input_images)
|
||||
|
||||
@@ -91,8 +94,7 @@ def _run_test(
|
||||
return_dict=True,
|
||||
output_hidden_states=True,
|
||||
)
|
||||
pooled_output = F.normalize(outputs.hidden_states[-1][0, -1, :],
|
||||
dim=-1)
|
||||
pooled_output = F.normalize(outputs.hidden_states[-1][0, -1, :], dim=-1)
|
||||
|
||||
all_outputs.append(pooled_output.tolist())
|
||||
|
||||
@@ -142,8 +144,7 @@ def test_models_image(
|
||||
dtype: str,
|
||||
) -> None:
|
||||
input_texts_images = [
|
||||
(text, asset.pil_image)
|
||||
for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
|
||||
(text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
|
||||
]
|
||||
input_texts = [text for text, _ in input_texts_images]
|
||||
input_images = [image for _, image in input_texts_images]
|
||||
|
||||
@@ -19,14 +19,14 @@ HF_TEXT_PROMPTS = [
|
||||
"Retrieve an image of this caption: cherry blossom",
|
||||
]
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
# T + I -> X
|
||||
"stop_sign":
|
||||
"<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign", # noqa: E501
|
||||
# I -> X
|
||||
"cherry_blossom":
|
||||
"<|image_1|> Represent the given image for classification", # noqa: E501
|
||||
})
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
# T + I -> X
|
||||
"stop_sign": "<|image_1|> Select the portion of the image that isolates the object of the given label: The label of the object is stop sign", # noqa: E501
|
||||
# I -> X
|
||||
"cherry_blossom": "<|image_1|> Represent the given image for classification", # noqa: E501
|
||||
}
|
||||
)
|
||||
|
||||
MODELS = ["TIGER-Lab/VLM2Vec-Full"]
|
||||
|
||||
@@ -44,14 +44,14 @@ def _run_test(
|
||||
# vLLM needs a fresh new process without cuda initialization.
|
||||
# if we run HF first, the cuda initialization will be done and it
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
with vllm_runner(model, runner="pooling", dtype=dtype,
|
||||
enforce_eager=True) as vllm_model:
|
||||
with vllm_runner(
|
||||
model, runner="pooling", dtype=dtype, enforce_eager=True
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.embed(input_texts, images=input_images)
|
||||
|
||||
# use eager mode for hf runner, since phi3_v didn't work with flash_attn
|
||||
hf_model_kwargs = {"_attn_implementation": "eager"}
|
||||
with hf_runner(model, dtype=dtype,
|
||||
model_kwargs=hf_model_kwargs) as hf_model:
|
||||
with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model:
|
||||
all_inputs = hf_model.get_inputs(input_texts, images=input_images)
|
||||
|
||||
all_outputs = []
|
||||
@@ -114,18 +114,21 @@ def test_models_image(
|
||||
dtype: str,
|
||||
) -> None:
|
||||
input_texts_images = [
|
||||
(text, asset.pil_image)
|
||||
for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
|
||||
(text, asset.pil_image) for text, asset in zip(HF_IMAGE_PROMPTS, image_assets)
|
||||
]
|
||||
# add cases for special_tokens
|
||||
input_texts_images.append((
|
||||
"\n<s><|user|>\n <|image_1|>\n\t <s>"
|
||||
"Represent the given image for classification<|end|>"
|
||||
"\n<|assistant|>\n",
|
||||
Image.open(
|
||||
get_vllm_public_assets(filename="cherry_blossom.jpg",
|
||||
s3_prefix=VLM_IMAGES_DIR)),
|
||||
))
|
||||
input_texts_images.append(
|
||||
(
|
||||
"\n<s><|user|>\n <|image_1|>\n\t <s>"
|
||||
"Represent the given image for classification<|end|>"
|
||||
"\n<|assistant|>\n",
|
||||
Image.open(
|
||||
get_vllm_public_assets(
|
||||
filename="cherry_blossom.jpg", s3_prefix=VLM_IMAGES_DIR
|
||||
)
|
||||
),
|
||||
)
|
||||
)
|
||||
input_texts = [text for text, _ in input_texts_images]
|
||||
input_images = [image for _, image in input_texts_images]
|
||||
|
||||
|
||||
@@ -19,25 +19,25 @@ def _run_test(
|
||||
vllm_runner: type[VllmRunner],
|
||||
model: str,
|
||||
) -> None:
|
||||
|
||||
prompt = [
|
||||
{
|
||||
# This model deals with no text input
|
||||
"prompt_token_ids": [1],
|
||||
"multi_modal_data": generate_test_mm_data(),
|
||||
} for _ in range(10)
|
||||
}
|
||||
for _ in range(10)
|
||||
]
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="pooling",
|
||||
dtype="half",
|
||||
enforce_eager=True,
|
||||
skip_tokenizer_init=True,
|
||||
# Limit the maximum number of sequences to avoid the
|
||||
# test going OOM during the warmup run
|
||||
max_num_seqs=32,
|
||||
default_torch_num_threads=1,
|
||||
model,
|
||||
runner="pooling",
|
||||
dtype="half",
|
||||
enforce_eager=True,
|
||||
skip_tokenizer_init=True,
|
||||
# Limit the maximum number of sequences to avoid the
|
||||
# test going OOM during the warmup run
|
||||
max_num_seqs=32,
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
vllm_model.encode(prompt)
|
||||
|
||||
|
||||
@@ -34,9 +34,9 @@ def run_radio_test(
|
||||
# Using `self.get_nearest_supported_resolution`, for assets 432x642 the
|
||||
# nearest supported resolution is 432x640.
|
||||
pixel_values = [
|
||||
img_processor(
|
||||
image,
|
||||
return_tensors='pt').pixel_values.to(torch_dtype)[:, :, :, :640]
|
||||
img_processor(image, return_tensors="pt").pixel_values.to(torch_dtype)[
|
||||
:, :, :, :640
|
||||
]
|
||||
for image in images
|
||||
]
|
||||
|
||||
@@ -51,32 +51,33 @@ def run_radio_test(
|
||||
hf_model.eval()
|
||||
|
||||
hf_outputs_per_image = [
|
||||
hf_model(pixel_value.to("cuda")).features
|
||||
for pixel_value in pixel_values
|
||||
hf_model(pixel_value.to("cuda")).features for pixel_value in pixel_values
|
||||
]
|
||||
|
||||
radio_config = RadioConfig(model_name=config.args["model"],
|
||||
reg_tokens=config.args["register_multiple"])
|
||||
radio_config = RadioConfig(
|
||||
model_name=config.args["model"], reg_tokens=config.args["register_multiple"]
|
||||
)
|
||||
vllm_model = RadioModel(radio_config)
|
||||
vllm_model.load_weights(hf_model.state_dict())
|
||||
vllm_model = vllm_model.to("cuda", torch_dtype)
|
||||
|
||||
vllm_outputs_per_image = [
|
||||
vllm_model(pixel_values=pixel_value.to("cuda"))
|
||||
for pixel_value in pixel_values
|
||||
vllm_model(pixel_values=pixel_value.to("cuda")) for pixel_value in pixel_values
|
||||
]
|
||||
del vllm_model, hf_model
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
cos_similar = nn.CosineSimilarity(dim=-1)
|
||||
for vllm_output, hf_output in zip(vllm_outputs_per_image,
|
||||
hf_outputs_per_image):
|
||||
for vllm_output, hf_output in zip(vllm_outputs_per_image, hf_outputs_per_image):
|
||||
assert cos_similar(vllm_output, hf_output).mean() > 0.99
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", [
|
||||
"nvidia/C-RADIOv2-H",
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"model_id",
|
||||
[
|
||||
"nvidia/C-RADIOv2-H",
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
def test_radio(dist_init, image_assets, model_id, dtype: str) -> None:
|
||||
run_radio_test(
|
||||
|
||||
@@ -6,22 +6,27 @@ from typing import Optional, Union
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
|
||||
UserMessage)
|
||||
from mistral_common.protocol.instruct.messages import ImageChunk, TextChunk, UserMessage
|
||||
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.config import ModelConfig
|
||||
from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions,
|
||||
ImageDummyOptions, VideoDummyOptions)
|
||||
from vllm.config.multimodal import (
|
||||
AudioDummyOptions,
|
||||
BaseDummyOptions,
|
||||
ImageDummyOptions,
|
||||
VideoDummyOptions,
|
||||
)
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, MultiModalDataDict
|
||||
from vllm.multimodal.cache import MultiModalProcessorOnlyCache
|
||||
from vllm.multimodal.inputs import MultiModalInputs
|
||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||
InputProcessingContext)
|
||||
from vllm.transformers_utils.tokenizer import (AnyTokenizer, MistralTokenizer,
|
||||
cached_tokenizer_from_config,
|
||||
encode_tokens)
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
|
||||
from vllm.transformers_utils.tokenizer import (
|
||||
AnyTokenizer,
|
||||
MistralTokenizer,
|
||||
cached_tokenizer_from_config,
|
||||
encode_tokens,
|
||||
)
|
||||
|
||||
from ....multimodal.utils import random_audio, random_image, random_video
|
||||
from ...registry import HF_EXAMPLE_MODELS
|
||||
@@ -36,14 +41,17 @@ def glm4_1v_patch_mm_data(mm_data: MultiModalDataDict) -> MultiModalDataDict:
|
||||
# GLM4.1V doesn't support multiple videos
|
||||
video = mm_data["video"]
|
||||
num_frames = len(video)
|
||||
mm_data["video"] = (video, {
|
||||
"total_num_frames": num_frames,
|
||||
"fps": num_frames,
|
||||
"duration": 1,
|
||||
"frames_indices": [i for i in range(num_frames)],
|
||||
"video_backend": "opencv",
|
||||
"do_sample_frames": True,
|
||||
})
|
||||
mm_data["video"] = (
|
||||
video,
|
||||
{
|
||||
"total_num_frames": num_frames,
|
||||
"fps": num_frames,
|
||||
"duration": 1,
|
||||
"frames_indices": [i for i in range(num_frames)],
|
||||
"video_backend": "opencv",
|
||||
"do_sample_frames": True,
|
||||
},
|
||||
)
|
||||
return mm_data
|
||||
|
||||
|
||||
@@ -102,7 +110,8 @@ def _test_processing_correctness(
|
||||
mm_processor_cache_gb=2048,
|
||||
skip_tokenizer_init=model_info.skip_tokenizer_init,
|
||||
enforce_eager=model_info.enforce_eager,
|
||||
dtype=model_info.dtype)
|
||||
dtype=model_info.dtype,
|
||||
)
|
||||
|
||||
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
|
||||
factories = MULTIMODAL_REGISTRY._processor_factories[model_cls]
|
||||
@@ -145,27 +154,22 @@ def _test_processing_correctness(
|
||||
input_to_hit = {
|
||||
"image": Image.new("RGB", size=(128, 128)),
|
||||
"video": np.zeros((4, 128, 128, 3), dtype=np.uint8),
|
||||
"audio": (np.zeros((512, )), 16000),
|
||||
"audio": (np.zeros((512,)), 16000),
|
||||
}
|
||||
input_factory = {
|
||||
"image":
|
||||
partial(random_image, rng, min_wh=128, max_wh=256),
|
||||
"video":
|
||||
partial(random_video,
|
||||
rng,
|
||||
min_frames=2,
|
||||
max_frames=16,
|
||||
min_wh=128,
|
||||
max_wh=256),
|
||||
"audio":
|
||||
partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
|
||||
"image": partial(random_image, rng, min_wh=128, max_wh=256),
|
||||
"video": partial(
|
||||
random_video, rng, min_frames=2, max_frames=16, min_wh=128, max_wh=256
|
||||
),
|
||||
"audio": partial(random_audio, rng, min_len=512, max_len=1024, sr=16000),
|
||||
}
|
||||
|
||||
for batch_idx in range(num_batches):
|
||||
mm_data = {
|
||||
k:
|
||||
[(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
|
||||
for _ in range(rng.randint(limit + 1))]
|
||||
k: [
|
||||
(input_to_hit[k] if rng.rand() < hit_rate else input_factory[k]())
|
||||
for _ in range(rng.randint(limit + 1))
|
||||
]
|
||||
for k, limit in limit_mm_per_prompt_ints.items()
|
||||
}
|
||||
|
||||
@@ -174,12 +178,16 @@ def _test_processing_correctness(
|
||||
# Mistral chat outputs tokens directly, rather than text prompts
|
||||
if isinstance(tokenizer, MistralTokenizer):
|
||||
images = mm_data.get("image", [])
|
||||
request = ChatCompletionRequest(messages=[
|
||||
UserMessage(content=[
|
||||
TextChunk(text=""),
|
||||
*(ImageChunk(image=image) for image in images),
|
||||
]),
|
||||
])
|
||||
request = ChatCompletionRequest(
|
||||
messages=[
|
||||
UserMessage(
|
||||
content=[
|
||||
TextChunk(text=""),
|
||||
*(ImageChunk(image=image) for image in images),
|
||||
]
|
||||
),
|
||||
]
|
||||
)
|
||||
res = tokenizer.mistral.encode_chat_completion(request)
|
||||
prompt = res.tokens
|
||||
else:
|
||||
@@ -303,16 +311,14 @@ def _test_processing_correctness_one(
|
||||
baseline_text_result,
|
||||
baseline_tokenized_result,
|
||||
ignore_mm_keys=ignore_mm_keys,
|
||||
msg=f"Failed ({batch_idx=}, {text_prompt=}, "
|
||||
f"{token_prompt=}, {mm_data=})",
|
||||
msg=f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})",
|
||||
)
|
||||
|
||||
_assert_inputs_equal(
|
||||
cached_text_result,
|
||||
cached_tokenized_result,
|
||||
ignore_mm_keys=ignore_mm_keys,
|
||||
msg=f"Failed ({batch_idx=}, {text_prompt=}, "
|
||||
f"{token_prompt=}, {mm_data=})",
|
||||
msg=f"Failed ({batch_idx=}, {text_prompt=}, {token_prompt=}, {mm_data=})",
|
||||
)
|
||||
|
||||
|
||||
|
||||
@@ -24,7 +24,8 @@ from ...utils import build_model_context
|
||||
# post-sampled frames (expected behavior)
|
||||
(-1, 1, 5),
|
||||
(-1, 2, 10),
|
||||
])
|
||||
],
|
||||
)
|
||||
def test_processor_override(
|
||||
model_id: str,
|
||||
expected_toks_per_frame: int,
|
||||
@@ -55,10 +56,8 @@ def test_processor_override(
|
||||
# Ensure we have the right number of placeholders per num_crops size
|
||||
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||
video_token_id = tokenizer.convert_tokens_to_ids(hf_processor.video_token)
|
||||
video_tok_count = processed_inputs["prompt_token_ids"].count(
|
||||
video_token_id)
|
||||
grid_t, _, _ = processed_inputs["mm_kwargs"].get_data(
|
||||
)["video_grid_thw"][0]
|
||||
video_tok_count = processed_inputs["prompt_token_ids"].count(video_token_id)
|
||||
grid_t, _, _ = processed_inputs["mm_kwargs"].get_data()["video_grid_thw"][0]
|
||||
|
||||
assert grid_t == expected_grid_t
|
||||
assert video_tok_count == expected_toks_per_frame * grid_t
|
||||
@@ -71,7 +70,7 @@ def test_video_loader_consistency(
|
||||
fps: int,
|
||||
):
|
||||
"""
|
||||
Ensure dynamic video loader (pre-sampled by loader) and normal video
|
||||
Ensure dynamic video loader (pre-sampled by loader) and normal video
|
||||
loader (post-sampled by processor) produce same video processing outputs.
|
||||
"""
|
||||
ctx = build_model_context(
|
||||
@@ -91,7 +90,8 @@ def test_video_loader_consistency(
|
||||
|
||||
static_video, static_metadata = OpenCVVideoBackend.load_bytes(video_bytes)
|
||||
dynamic_video, dynamic_metadata = OpenCVDynamicVideoBackend.load_bytes(
|
||||
video_bytes, fps=fps)
|
||||
video_bytes, fps=fps
|
||||
)
|
||||
|
||||
# pre-sampled loader shouldn't read all frames
|
||||
assert len(dynamic_video) < len(static_video)
|
||||
@@ -99,12 +99,11 @@ def test_video_loader_consistency(
|
||||
static_mm_data = {"video": [(static_video, static_metadata)]}
|
||||
dynamic_mm_data = {"video": [(dynamic_video, dynamic_metadata)]}
|
||||
|
||||
static_outputs = processor.apply(prompt, static_mm_data,
|
||||
hf_processor_mm_kwargs)
|
||||
dynamic_outputs = processor.apply(prompt, dynamic_mm_data,
|
||||
hf_processor_mm_kwargs)
|
||||
static_outputs = processor.apply(prompt, static_mm_data, hf_processor_mm_kwargs)
|
||||
dynamic_outputs = processor.apply(prompt, dynamic_mm_data, hf_processor_mm_kwargs)
|
||||
|
||||
assert static_outputs["prompt_token_ids"] == dynamic_outputs[
|
||||
"prompt_token_ids"]
|
||||
assert static_outputs["mm_kwargs"].get_data(
|
||||
) == dynamic_outputs["mm_kwargs"].get_data()
|
||||
assert static_outputs["prompt_token_ids"] == dynamic_outputs["prompt_token_ids"]
|
||||
assert (
|
||||
static_outputs["mm_kwargs"].get_data()
|
||||
== dynamic_outputs["mm_kwargs"].get_data()
|
||||
)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for H2OVL's multimodal preprocessing kwargs."""
|
||||
|
||||
from collections.abc import Mapping
|
||||
from typing import Optional
|
||||
|
||||
@@ -23,8 +24,10 @@ def _get_expected_num_patches(
|
||||
min_num: int,
|
||||
max_num: int,
|
||||
):
|
||||
from vllm.model_executor.models.h2ovl import (calculate_h2ovl_targets,
|
||||
get_h2ovl_target_ratios)
|
||||
from vllm.model_executor.models.h2ovl import (
|
||||
calculate_h2ovl_targets,
|
||||
get_h2ovl_target_ratios,
|
||||
)
|
||||
|
||||
width, height = image.size
|
||||
|
||||
@@ -101,24 +104,27 @@ def _run_check(
|
||||
|
||||
total_expected_num_patches = sum(
|
||||
_get_expected_num_patches(config, image, len(images), min_num, max_num)
|
||||
for image in images)
|
||||
for image in images
|
||||
)
|
||||
|
||||
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
|
||||
|
||||
# Ensure we have the right number of placeholders per num_crops size
|
||||
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
|
||||
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
|
||||
pixel_shape = processed_inputs["mm_kwargs"].get_data(
|
||||
)["pixel_values_flat"].shape
|
||||
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
|
||||
|
||||
assert img_tok_count == 256 * total_expected_num_patches
|
||||
assert pixel_shape[0] == total_expected_num_patches
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id", [
|
||||
"h2oai/h2ovl-mississippi-800m",
|
||||
"h2oai/h2ovl-mississippi-2b",
|
||||
])
|
||||
@pytest.mark.parametrize(
|
||||
"model_id",
|
||||
[
|
||||
"h2oai/h2ovl-mississippi-800m",
|
||||
"h2oai/h2ovl-mississippi-2b",
|
||||
],
|
||||
)
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
@@ -165,10 +171,7 @@ def test_processor_override(
|
||||
|
||||
_run_check(
|
||||
processor,
|
||||
[
|
||||
rescale_image_size(image_assets[0].pil_image, f)
|
||||
for f in size_factors
|
||||
],
|
||||
[rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
|
||||
min_num,
|
||||
max_num,
|
||||
hf_processor_mm_kwargs,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for Idefics3's multimodal preprocessing kwargs."""
|
||||
|
||||
import pytest
|
||||
from transformers import Idefics3Config
|
||||
|
||||
@@ -17,7 +18,8 @@ from ...utils import build_model_context
|
||||
[
|
||||
({"size": {"longest_edge": 364}}, 169),
|
||||
({"size": {"longest_edge": 728}}, 169 * (2**2 + 1)),
|
||||
])
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||
@@ -42,8 +44,11 @@ def test_processor_override(
|
||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||
|
||||
# Build the image str / prompt based on the number of images we pass
|
||||
placeholders = "<image>" if num_imgs == 1 else "\n".join(
|
||||
f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
|
||||
placeholders = (
|
||||
"<image>"
|
||||
if num_imgs == 1
|
||||
else "\n".join(f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
|
||||
)
|
||||
prompt = f"<|begin_of_text|>User:{placeholders}\n<end_of_utterance>\nAssistant:" # noqa: E501
|
||||
|
||||
# Build mm_data
|
||||
@@ -57,8 +62,7 @@ def test_processor_override(
|
||||
# Ensure the placeholders format are correct
|
||||
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||
hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
|
||||
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
|
||||
"input_ids"][0]
|
||||
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]
|
||||
|
||||
# Ensure we have the right number of placeholders per num_crops size
|
||||
image_token_id = ctx.get_hf_config().image_token_id
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for InternVL's multimodal preprocessing kwargs."""
|
||||
|
||||
from collections.abc import Mapping
|
||||
from typing import Optional
|
||||
|
||||
@@ -24,7 +25,9 @@ def _get_expected_num_patches(
|
||||
max_num: int,
|
||||
):
|
||||
from vllm.model_executor.models.internvl import (
|
||||
calculate_internvl_targets, get_internvl_target_ratios)
|
||||
calculate_internvl_targets,
|
||||
get_internvl_target_ratios,
|
||||
)
|
||||
|
||||
width, height = image.size
|
||||
|
||||
@@ -61,15 +64,15 @@ def _run_check(
|
||||
|
||||
total_expected_num_patches = sum(
|
||||
_get_expected_num_patches(config, image, len(images), min_num, max_num)
|
||||
for image in images)
|
||||
for image in images
|
||||
)
|
||||
|
||||
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
|
||||
|
||||
# Ensure we have the right number of placeholders per num_crops size
|
||||
image_token_id = tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
|
||||
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
|
||||
pixel_shape = processed_inputs["mm_kwargs"].get_data(
|
||||
)["pixel_values_flat"].shape
|
||||
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
|
||||
|
||||
assert img_tok_count == 256 * total_expected_num_patches
|
||||
assert pixel_shape[0] == total_expected_num_patches
|
||||
@@ -122,10 +125,7 @@ def test_processor_override(
|
||||
|
||||
_run_check(
|
||||
processor,
|
||||
[
|
||||
rescale_image_size(image_assets[0].pil_image, f)
|
||||
for f in size_factors
|
||||
],
|
||||
[rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
|
||||
min_num,
|
||||
max_num,
|
||||
hf_processor_mm_kwargs,
|
||||
|
||||
@@ -11,8 +11,7 @@ from ....conftest import ImageTestAssets
|
||||
from ...utils import build_model_context
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id",
|
||||
["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
|
||||
@pytest.mark.parametrize("model_id", ["meta-llama/Llama-4-Scout-17B-16E-Instruct"])
|
||||
@pytest.mark.parametrize("mm_processor_kwargs", [{}])
|
||||
@pytest.mark.parametrize("num_imgs", [1, 5])
|
||||
@pytest.mark.parametrize("mm_processor_cache_gb", [0, 4])
|
||||
@@ -38,13 +37,14 @@ def test_processor_override(
|
||||
hf_processor = processor.info.get_hf_processor()
|
||||
vocab = tokenizer.get_vocab()
|
||||
|
||||
prompt = "<|begin_of_text|><|header_start|>user<|header_end|>" \
|
||||
+ "<|image|>" * num_imgs \
|
||||
prompt = (
|
||||
"<|begin_of_text|><|header_start|>user<|header_end|>"
|
||||
+ "<|image|>" * num_imgs
|
||||
+ "<|eot|><|header_start|>assistant<|header_end|>"
|
||||
)
|
||||
mm_data = {
|
||||
"image": [
|
||||
image_assets[(i % len(image_assets))].pil_image
|
||||
for i in range(num_imgs)
|
||||
image_assets[(i % len(image_assets))].pil_image for i in range(num_imgs)
|
||||
]
|
||||
}
|
||||
if tokenized_prompt:
|
||||
@@ -64,22 +64,23 @@ def test_processor_override(
|
||||
if tiles_x * tiles_y > 1:
|
||||
num_x_separators += (tiles_x - 1) * tiles_y
|
||||
num_y_separators += tiles_y
|
||||
assert prompt_token_ids.count(vocab[hf_processor.tile_token]) \
|
||||
== num_x_separators
|
||||
assert prompt_token_ids.count(vocab[hf_processor.tile_global_token]) \
|
||||
== num_y_separators
|
||||
assert prompt_token_ids.count(vocab[hf_processor.tile_token]) == num_x_separators
|
||||
assert (
|
||||
prompt_token_ids.count(vocab[hf_processor.tile_global_token])
|
||||
== num_y_separators
|
||||
)
|
||||
|
||||
# image token offsets
|
||||
img_locs = processed_inputs["mm_placeholders"].get("image", [])
|
||||
assert len(img_locs) == num_imgs
|
||||
assert [img_loc.offset for img_loc in img_locs] == \
|
||||
[i for i, v in enumerate(prompt_token_ids) \
|
||||
if v == config.boi_token_index]
|
||||
assert [img_loc.offset for img_loc in img_locs] == [
|
||||
i for i, v in enumerate(prompt_token_ids) if v == config.boi_token_index
|
||||
]
|
||||
|
||||
# patch sizes and masks
|
||||
num_patches_per_chunk = processor.info.get_patch_per_chunk(
|
||||
config.vision_config)
|
||||
assert prompt_token_ids.count(config.image_token_index) \
|
||||
num_patches_per_chunk = processor.info.get_patch_per_chunk(config.vision_config)
|
||||
assert (
|
||||
prompt_token_ids.count(config.image_token_index)
|
||||
== sum(mm_data["patches_per_image"]) * num_patches_per_chunk
|
||||
assert len(mm_data["pixel_values"]) \
|
||||
== sum(mm_data["patches_per_image"])
|
||||
)
|
||||
assert len(mm_data["pixel_values"]) == sum(mm_data["patches_per_image"])
|
||||
|
||||
@@ -22,8 +22,9 @@ def _validate_image_max_tokens_one(
|
||||
image_size: ImageSize,
|
||||
) -> None:
|
||||
info = processor.info
|
||||
feature_size = info.get_num_image_tokens(image_width=image_size.width,
|
||||
image_height=image_size.height)
|
||||
feature_size = info.get_num_image_tokens(
|
||||
image_width=image_size.width, image_height=image_size.height
|
||||
)
|
||||
|
||||
try:
|
||||
assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
|
||||
@@ -31,8 +32,9 @@ def _validate_image_max_tokens_one(
|
||||
failed_size_excs.append((image_size, exc))
|
||||
|
||||
|
||||
@pytest.mark.skip("This test takes around 5 minutes to run. "
|
||||
"Comment this out to run it manually.")
|
||||
@pytest.mark.skip(
|
||||
"This test takes around 5 minutes to run. Comment this out to run it manually."
|
||||
)
|
||||
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
|
||||
def test_processor_max_tokens(model_id):
|
||||
ctx = build_model_context(
|
||||
@@ -66,9 +68,9 @@ def test_processor_max_tokens(model_id):
|
||||
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
|
||||
|
||||
if failed_size_excs:
|
||||
msg = "Found failing image sizes:" \
|
||||
+ "\n========\n".join(f"[{size}]\n{exc}"
|
||||
for size, exc in failed_size_excs)
|
||||
msg = "Found failing image sizes:" + "\n========\n".join(
|
||||
f"[{size}]\n{exc}" for size, exc in failed_size_excs
|
||||
)
|
||||
raise AssertionError(msg)
|
||||
|
||||
|
||||
@@ -94,8 +96,10 @@ def _validate_image_prompt_replacements_one(
|
||||
|
||||
# NOTE: There is a BOS token
|
||||
assert first_placeholder.offset == 1
|
||||
assert first_placeholder.length == (
|
||||
len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
|
||||
assert (
|
||||
first_placeholder.length
|
||||
== (len(processed_inputs["prompt_token_ids"]) - 1) // num_imgs
|
||||
)
|
||||
|
||||
except Exception as exc:
|
||||
failed_size_excs.append((image_size, exc))
|
||||
@@ -122,9 +126,9 @@ def _test_image_prompt_replacements(
|
||||
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
|
||||
|
||||
if failed_size_excs:
|
||||
msg = "Found failing image sizes:" \
|
||||
+ "\n========\n".join(f"[{size}]\n{exc}"
|
||||
for size, exc in failed_size_excs)
|
||||
msg = "Found failing image sizes:" + "\n========\n".join(
|
||||
f"[{size}]\n{exc}" for size, exc in failed_size_excs
|
||||
)
|
||||
raise AssertionError(msg)
|
||||
|
||||
|
||||
@@ -138,11 +142,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
|
||||
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
|
||||
(488, 183), (2560, 1669)]
|
||||
image_ratios = [
|
||||
(171, 152),
|
||||
(184, 161),
|
||||
(198, 176),
|
||||
(333, 296),
|
||||
(369, 328),
|
||||
(488, 183),
|
||||
(2560, 1669),
|
||||
]
|
||||
image_sizes = [
|
||||
size for w, h in image_ratios
|
||||
for size in [ImageSize(w, h), ImageSize(h, w)]
|
||||
size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
|
||||
]
|
||||
|
||||
_test_image_prompt_replacements(
|
||||
@@ -152,8 +162,9 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skip("This test takes around 2 hours to run. "
|
||||
"Comment this out to run it manually.")
|
||||
@pytest.mark.skip(
|
||||
"This test takes around 2 hours to run. Comment this out to run it manually."
|
||||
)
|
||||
@pytest.mark.parametrize("model_id", ["llava-hf/llava-v1.6-mistral-7b-hf"])
|
||||
@pytest.mark.parametrize("num_imgs", [1])
|
||||
def test_processor_prompt_replacements_all(model_id, num_imgs):
|
||||
|
||||
@@ -22,8 +22,9 @@ def _validate_image_max_tokens_one(
|
||||
image_size: ImageSize,
|
||||
) -> None:
|
||||
info = processor.info
|
||||
feature_size = info.get_num_image_tokens(image_width=image_size.width,
|
||||
image_height=image_size.height)
|
||||
feature_size = info.get_num_image_tokens(
|
||||
image_width=image_size.width, image_height=image_size.height
|
||||
)
|
||||
|
||||
try:
|
||||
assert feature_size <= max_tokens, f"{feature_size} <= {max_tokens}"
|
||||
@@ -31,10 +32,10 @@ def _validate_image_max_tokens_one(
|
||||
failed_size_excs.append((image_size, exc))
|
||||
|
||||
|
||||
@pytest.mark.skip("This test takes around 5 minutes to run. "
|
||||
"Comment this out to run it manually.")
|
||||
@pytest.mark.parametrize("model_id",
|
||||
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
|
||||
@pytest.mark.skip(
|
||||
"This test takes around 5 minutes to run. Comment this out to run it manually."
|
||||
)
|
||||
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
|
||||
def test_processor_max_tokens(model_id):
|
||||
ctx = build_model_context(
|
||||
model_id,
|
||||
@@ -67,9 +68,9 @@ def test_processor_max_tokens(model_id):
|
||||
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
|
||||
|
||||
if failed_size_excs:
|
||||
msg = "Found failing image sizes:" \
|
||||
+ "\n========\n".join(f"[{size}]\n{exc}"
|
||||
for size, exc in failed_size_excs)
|
||||
msg = "Found failing image sizes:" + "\n========\n".join(
|
||||
f"[{size}]\n{exc}" for size, exc in failed_size_excs
|
||||
)
|
||||
raise AssertionError(msg)
|
||||
|
||||
|
||||
@@ -94,8 +95,10 @@ def _validate_image_prompt_replacements_one(
|
||||
first_placeholder = image_placeholders[0]
|
||||
|
||||
assert first_placeholder.offset == 0
|
||||
assert first_placeholder.length == len(
|
||||
processed_inputs["prompt_token_ids"]) // num_imgs
|
||||
assert (
|
||||
first_placeholder.length
|
||||
== len(processed_inputs["prompt_token_ids"]) // num_imgs
|
||||
)
|
||||
except Exception as exc:
|
||||
failed_size_excs.append((image_size, exc))
|
||||
|
||||
@@ -121,14 +124,13 @@ def _test_image_prompt_replacements(
|
||||
pqdm(image_sizes, validate_one, n_jobs=8, desc="Validating image sizes")
|
||||
|
||||
if failed_size_excs:
|
||||
msg = "Found failing image sizes:" \
|
||||
+ "\n========\n".join(f"[{size}]\n{exc}"
|
||||
for size, exc in failed_size_excs)
|
||||
msg = "Found failing image sizes:" + "\n========\n".join(
|
||||
f"[{size}]\n{exc}" for size, exc in failed_size_excs
|
||||
)
|
||||
raise AssertionError(msg)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id",
|
||||
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
|
||||
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
||||
ctx = build_model_context(
|
||||
@@ -138,11 +140,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
|
||||
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
|
||||
(488, 183), (2560, 1669)]
|
||||
image_ratios = [
|
||||
(171, 152),
|
||||
(184, 161),
|
||||
(198, 176),
|
||||
(333, 296),
|
||||
(369, 328),
|
||||
(488, 183),
|
||||
(2560, 1669),
|
||||
]
|
||||
image_sizes = [
|
||||
size for w, h in image_ratios
|
||||
for size in [ImageSize(w, h), ImageSize(h, w)]
|
||||
size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
|
||||
]
|
||||
|
||||
_test_image_prompt_replacements(
|
||||
@@ -152,10 +160,10 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.skip("This test takes around 2 hours to run. "
|
||||
"Comment this out to run it manually.")
|
||||
@pytest.mark.parametrize("model_id",
|
||||
["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
|
||||
@pytest.mark.skip(
|
||||
"This test takes around 2 hours to run. Comment this out to run it manually."
|
||||
)
|
||||
@pytest.mark.parametrize("model_id", ["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"])
|
||||
@pytest.mark.parametrize("num_imgs", [1])
|
||||
def test_processor_prompt_replacements_all(model_id, num_imgs):
|
||||
ctx = build_model_context(
|
||||
|
||||
@@ -61,17 +61,17 @@ def _test_image_prompt_replacements(
|
||||
num_imgs: int,
|
||||
image_sizes: list[ImageSize],
|
||||
) -> None:
|
||||
|
||||
failed_size_excs = list[tuple[ImageSize, Exception]]()
|
||||
|
||||
for size in image_sizes:
|
||||
_validate_image_prompt_replacements_one(processor, num_imgs,
|
||||
failed_size_excs, size)
|
||||
_validate_image_prompt_replacements_one(
|
||||
processor, num_imgs, failed_size_excs, size
|
||||
)
|
||||
|
||||
if failed_size_excs:
|
||||
msg = "Found failing image sizes:" \
|
||||
+ "\n========\n".join(f"[{size}]\n{exc}"
|
||||
for size, exc in failed_size_excs)
|
||||
msg = "Found failing image sizes:" + "\n========\n".join(
|
||||
f"[{size}]\n{exc}" for size, exc in failed_size_excs
|
||||
)
|
||||
raise AssertionError(msg)
|
||||
|
||||
|
||||
@@ -85,11 +85,17 @@ def test_processor_prompt_replacements_regression(model_id, num_imgs):
|
||||
)
|
||||
processor = MULTIMODAL_REGISTRY.create_processor(ctx.model_config)
|
||||
|
||||
image_ratios = [(171, 152), (184, 161), (198, 176), (333, 296), (369, 328),
|
||||
(488, 183), (2560, 1669)]
|
||||
image_ratios = [
|
||||
(171, 152),
|
||||
(184, 161),
|
||||
(198, 176),
|
||||
(333, 296),
|
||||
(369, 328),
|
||||
(488, 183),
|
||||
(2560, 1669),
|
||||
]
|
||||
image_sizes = [
|
||||
size for w, h in image_ratios
|
||||
for size in [ImageSize(w, h), ImageSize(h, w)]
|
||||
size for w, h in image_ratios for size in [ImageSize(w, h), ImageSize(h, w)]
|
||||
]
|
||||
|
||||
_test_image_prompt_replacements(
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for mllama's multimodal preprocessing and profiling."""
|
||||
|
||||
import pytest
|
||||
from torch import prod
|
||||
from transformers import Llama4Config
|
||||
@@ -47,14 +48,17 @@ def test_profiling(model_id: str, max_model_len: int):
|
||||
image_size = hf_config.vision_config.image_size
|
||||
patch_size = hf_config.vision_config.patch_size
|
||||
downsample_ratio = int(
|
||||
round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2)))
|
||||
tokens_per_patch = ((image_size // patch_size)**2) // downsample_ratio
|
||||
round(1.0 / (hf_config.vision_config.pixel_shuffle_ratio**2))
|
||||
)
|
||||
tokens_per_patch = ((image_size // patch_size) ** 2) // downsample_ratio
|
||||
chunks_per_image = prod(mm_data["patches_per_image"])
|
||||
total_num_patches = chunks_per_image * tokens_per_patch
|
||||
num_tiles = mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][
|
||||
1] # x-y separator tokens
|
||||
total_tokens = total_num_patches.item() + num_tiles.item(
|
||||
) + 3 # image start, image, image end
|
||||
num_tiles = (
|
||||
mm_data["aspect_ratios"][0][0] * mm_data["aspect_ratios"][0][1]
|
||||
) # x-y separator tokens
|
||||
total_tokens = (
|
||||
total_num_patches.item() + num_tiles.item() + 3
|
||||
) # image start, image, image end
|
||||
|
||||
profiled_tokens = profiler.get_mm_max_contiguous_tokens(
|
||||
max_model_len,
|
||||
@@ -63,5 +67,6 @@ def test_profiling(model_id: str, max_model_len: int):
|
||||
|
||||
assert total_tokens == profiled_tokens["image"]
|
||||
assert total_tokens == sum(
|
||||
placeholder.length for placeholder in
|
||||
decoder_dummy_data.multi_modal_placeholders["image"])
|
||||
placeholder.length
|
||||
for placeholder in decoder_dummy_data.multi_modal_placeholders["image"]
|
||||
)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for Nemotron-Nano-VL's multimodal preprocessing kwargs."""
|
||||
|
||||
from collections.abc import Mapping
|
||||
from typing import Optional
|
||||
|
||||
@@ -24,7 +25,9 @@ def _get_expected_num_patches(
|
||||
max_num: int,
|
||||
):
|
||||
from vllm.model_executor.models.nemotron_vl import (
|
||||
calculate_nemotron_vl_targets, get_nemotron_vl_target_ratios)
|
||||
calculate_nemotron_vl_targets,
|
||||
get_nemotron_vl_target_ratios,
|
||||
)
|
||||
|
||||
width, height = image.size
|
||||
|
||||
@@ -63,22 +66,21 @@ def _run_check(
|
||||
|
||||
total_expected_num_patches = sum(
|
||||
_get_expected_num_patches(config, image, len(images), min_num, max_num)
|
||||
for image in images)
|
||||
for image in images
|
||||
)
|
||||
print(total_expected_num_patches)
|
||||
processed_inputs = processor.apply(prompt, mm_data, mm_processor_kwargs)
|
||||
|
||||
# Ensure we have the right number of placeholders per num_crops size
|
||||
image_token_id = tokenizer.convert_tokens_to_ids("<image>")
|
||||
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
|
||||
pixel_shape = processed_inputs["mm_kwargs"].get_data(
|
||||
)["pixel_values_flat"].shape
|
||||
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values_flat"].shape
|
||||
print("Image token count:", img_tok_count, "Pixel shape:", pixel_shape)
|
||||
assert img_tok_count == 256 * total_expected_num_patches
|
||||
assert pixel_shape[0] == total_expected_num_patches
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model_id",
|
||||
["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
|
||||
@pytest.mark.parametrize("model_id", ["nvidia/Llama-3.1-Nemotron-Nano-VL-8B-V1"])
|
||||
@pytest.mark.parametrize(
|
||||
"size_factors",
|
||||
[
|
||||
@@ -125,10 +127,7 @@ def test_processor_override(
|
||||
|
||||
_run_check(
|
||||
processor,
|
||||
[
|
||||
rescale_image_size(image_assets[0].pil_image, f)
|
||||
for f in size_factors
|
||||
],
|
||||
[rescale_image_size(image_assets[0].pil_image, f) for f in size_factors],
|
||||
min_num,
|
||||
max_num,
|
||||
hf_processor_mm_kwargs,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for phi3v's multimodal preprocessing kwargs."""
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
@@ -18,7 +19,8 @@ from ...utils import build_model_context
|
||||
({"num_crops": 16}, 1921),
|
||||
# the default num_crops of phi-3.5-vision is 4
|
||||
({}, 757),
|
||||
])
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for phi4mm's multimodal preprocessing kwargs."""
|
||||
|
||||
import pytest
|
||||
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY
|
||||
@@ -18,7 +19,8 @@ from ...utils import build_model_context
|
||||
({"dynamic_hd": 16}, 4433),
|
||||
# the default num_crops of phi-4-multimodal is 36
|
||||
({}, 9585),
|
||||
])
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||
@@ -46,8 +48,7 @@ def test_processor_override(
|
||||
img_str = "".join([f"<|image_{idx}|>\n" for idx in range(1, num_imgs + 1)])
|
||||
prompt = f"<|user|>\n{img_str}<|end|>\n<|assistant|>\n"
|
||||
|
||||
image_size = ctx.get_hf_config(
|
||||
).embd_layer["image_embd_layer"]["crop_size"]
|
||||
image_size = ctx.get_hf_config().embd_layer["image_embd_layer"]["crop_size"]
|
||||
dummy_image_size = (image_size * 7, image_size * 7)
|
||||
dummy_image = image_assets[0].pil_image.resize(dummy_image_size)
|
||||
mm_data = {"image": [dummy_image] * num_imgs}
|
||||
@@ -56,5 +57,6 @@ def test_processor_override(
|
||||
|
||||
# Ensure we have the right number of placeholders per num_crops size
|
||||
img_tok_count = processed_inputs["prompt_token_ids"].count(
|
||||
_IMAGE_PLACEHOLDER_TOKEN_ID)
|
||||
_IMAGE_PLACEHOLDER_TOKEN_ID
|
||||
)
|
||||
assert img_tok_count == expected_toks_per_img * num_imgs
|
||||
|
||||
@@ -12,10 +12,12 @@ from ...utils import build_model_context
|
||||
@pytest.mark.parametrize("model_id", ["Qwen/Qwen2-VL-2B-Instruct"])
|
||||
# yapf: disable
|
||||
@pytest.mark.parametrize(
|
||||
("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"), [
|
||||
("mm_processor_kwargs", "expected_toks_per_img", "expected_pixels_shape"),
|
||||
[
|
||||
({}, 1426, (5704, 1176)),
|
||||
({"min_pixels": 64**2, "max_pixels": 512**2}, 330, (1320, 1176)),
|
||||
])
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||
@@ -48,8 +50,7 @@ def test_processor_override(
|
||||
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||
image_token_id = tokenizer.convert_tokens_to_ids(hf_processor.image_token)
|
||||
img_tok_count = processed_inputs["prompt_token_ids"].count(image_token_id)
|
||||
pixel_shape = processed_inputs["mm_kwargs"].get_data(
|
||||
)["pixel_values"].shape
|
||||
pixel_shape = processed_inputs["mm_kwargs"].get_data()["pixel_values"].shape
|
||||
|
||||
assert img_tok_count == expected_toks_per_img * num_imgs
|
||||
assert pixel_shape[0] == expected_pixels_shape[0] * num_imgs
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Tests for smolvlm's multimodal preprocessing kwargs."""
|
||||
|
||||
import pytest
|
||||
from transformers import SmolVLMConfig
|
||||
|
||||
@@ -17,7 +18,8 @@ from ...utils import build_model_context
|
||||
[
|
||||
({"max_image_size": {"longest_edge": 384}}, 1377),
|
||||
({"max_image_size": {"longest_edge": 768}}, 405),
|
||||
])
|
||||
],
|
||||
)
|
||||
# yapf: enable
|
||||
@pytest.mark.parametrize("num_imgs", [1, 2])
|
||||
@pytest.mark.parametrize("kwargs_on_init", [True, False])
|
||||
@@ -42,8 +44,11 @@ def test_processor_override(
|
||||
hf_processor_mm_kwargs = {} if kwargs_on_init else mm_processor_kwargs
|
||||
|
||||
# Build the image str / prompt based on the number of images we pass
|
||||
placeholders = "<image>" if num_imgs == 1 else "\n".join(
|
||||
f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
|
||||
placeholders = (
|
||||
"<image>"
|
||||
if num_imgs == 1
|
||||
else "\n".join(f"Image-{i}: <image>\n" for i in range(1, num_imgs + 1))
|
||||
)
|
||||
prompt = f"<|im_start|>User:{placeholders}\n<end_of_utterance>\nAssistant:" # noqa: E501
|
||||
|
||||
# Build mm_data
|
||||
@@ -57,8 +62,7 @@ def test_processor_override(
|
||||
# Ensure the placeholders format are correct
|
||||
hf_processor = processor.info.get_hf_processor(**hf_processor_mm_kwargs)
|
||||
hf_processed_inputs = hf_processor(text=prompt, images=mm_data["image"])
|
||||
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs[
|
||||
"input_ids"][0]
|
||||
assert processed_inputs["prompt_token_ids"] == hf_processed_inputs["input_ids"][0]
|
||||
|
||||
# Ensure we have the right number of placeholders per num_crops size
|
||||
image_token_id = ctx.get_hf_config().image_token_id
|
||||
|
||||
@@ -9,23 +9,29 @@ from typing import Any, Union
|
||||
import numpy as np
|
||||
import pytest
|
||||
import torch.nn as nn
|
||||
from mistral_common.protocol.instruct.messages import (ImageChunk, TextChunk,
|
||||
UserMessage)
|
||||
from mistral_common.protocol.instruct.messages import ImageChunk, TextChunk, UserMessage
|
||||
from mistral_common.protocol.instruct.request import ChatCompletionRequest
|
||||
from PIL import Image
|
||||
|
||||
from vllm.config import ModelConfig, VllmConfig, set_current_vllm_config
|
||||
from vllm.config.multimodal import (AudioDummyOptions, BaseDummyOptions,
|
||||
ImageDummyOptions, VideoDummyOptions)
|
||||
from vllm.distributed import (cleanup_dist_env_and_memory,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel)
|
||||
from vllm.config.multimodal import (
|
||||
AudioDummyOptions,
|
||||
BaseDummyOptions,
|
||||
ImageDummyOptions,
|
||||
VideoDummyOptions,
|
||||
)
|
||||
from vllm.distributed import (
|
||||
cleanup_dist_env_and_memory,
|
||||
init_distributed_environment,
|
||||
initialize_model_parallel,
|
||||
)
|
||||
from vllm.model_executor.model_loader.utils import set_default_torch_dtype
|
||||
from vllm.model_executor.models.interfaces import (SupportsMultiModal,
|
||||
supports_multimodal)
|
||||
from vllm.model_executor.models.interfaces import (
|
||||
SupportsMultiModal,
|
||||
supports_multimodal,
|
||||
)
|
||||
from vllm.multimodal import MULTIMODAL_REGISTRY, BatchedTensorInputs
|
||||
from vllm.multimodal.processing import (BaseMultiModalProcessor,
|
||||
InputProcessingContext)
|
||||
from vllm.multimodal.processing import BaseMultiModalProcessor, InputProcessingContext
|
||||
from vllm.multimodal.utils import group_mm_kwargs_by_modality
|
||||
from vllm.transformers_utils.tokenizer import cached_tokenizer_from_config
|
||||
from vllm.utils import is_list_of
|
||||
@@ -48,13 +54,15 @@ REPO_ID_TO_SKIP = {
|
||||
}
|
||||
|
||||
ImageInput = list[Image.Image]
|
||||
VideoInput = Union[list[Image.Image], list[np.ndarray],
|
||||
list[tuple[np.ndarray, dict[str, Any]]]]
|
||||
VideoInput = Union[
|
||||
list[Image.Image], list[np.ndarray], list[tuple[np.ndarray, dict[str, Any]]]
|
||||
]
|
||||
AudioInput = list[tuple[np.ndarray, int]]
|
||||
|
||||
|
||||
def _resize_data(_data: Union[Image.Image, np.ndarray],
|
||||
size_factor: float) -> Union[Image.Image, np.ndarray]:
|
||||
def _resize_data(
|
||||
_data: Union[Image.Image, np.ndarray], size_factor: float
|
||||
) -> Union[Image.Image, np.ndarray]:
|
||||
assert size_factor <= 1, "Size factor must be less than 1"
|
||||
# Image input
|
||||
if isinstance(_data, Image.Image):
|
||||
@@ -74,20 +82,18 @@ def _resize_data(_data: Union[Image.Image, np.ndarray],
|
||||
return _data[..., :T, :H, :W, :C]
|
||||
# Audio input
|
||||
elif isinstance(_data, np.ndarray) and _data.ndim == 1:
|
||||
return _data[:int(len(_data) * size_factor)]
|
||||
return _data[: int(len(_data) * size_factor)]
|
||||
raise AssertionError("This line should be unreachable.")
|
||||
|
||||
|
||||
def resize_mm_data(
|
||||
data: Union[ImageInput, VideoInput, AudioInput],
|
||||
size_factors: tuple[float,
|
||||
...]) -> Union[ImageInput, VideoInput, AudioInput]:
|
||||
size_factors = size_factors[:len(data)]
|
||||
data: Union[ImageInput, VideoInput, AudioInput], size_factors: tuple[float, ...]
|
||||
) -> Union[ImageInput, VideoInput, AudioInput]:
|
||||
size_factors = size_factors[: len(data)]
|
||||
if is_list_of(data, (Image.Image, np.ndarray, list)):
|
||||
return [_resize_data(d, s) for d, s in zip(data, size_factors)]
|
||||
elif is_list_of(data, tuple):
|
||||
return [(_resize_data(d, s), meta)
|
||||
for (d, meta), s in zip(data, size_factors)]
|
||||
return [(_resize_data(d, s), meta) for (d, meta), s in zip(data, size_factors)]
|
||||
raise ValueError("Unsupported multimodal data type.")
|
||||
|
||||
|
||||
@@ -116,12 +122,16 @@ def create_batched_mm_kwargs(
|
||||
# Mistral chat outputs tokens directly, rather than text prompts
|
||||
if model_config.tokenizer_mode == "mistral":
|
||||
images = resized_mm_data.get("image", [])
|
||||
request = ChatCompletionRequest(messages=[
|
||||
UserMessage(content=[
|
||||
TextChunk(text=""),
|
||||
*(ImageChunk(image=image) for image in images),
|
||||
]),
|
||||
])
|
||||
request = ChatCompletionRequest(
|
||||
messages=[
|
||||
UserMessage(
|
||||
content=[
|
||||
TextChunk(text=""),
|
||||
*(ImageChunk(image=image) for image in images),
|
||||
]
|
||||
),
|
||||
]
|
||||
)
|
||||
tokenizer = processing_info.get_tokenizer()
|
||||
res = tokenizer.mistral.encode_chat_completion(request)
|
||||
prompt = res.tokens
|
||||
@@ -133,10 +143,7 @@ def create_batched_mm_kwargs(
|
||||
hf_processor_mm_kwargs=processor_inputs.hf_processor_mm_kwargs,
|
||||
tokenization_kwargs=processor_inputs.tokenization_kwargs,
|
||||
)["mm_kwargs"].require_data()
|
||||
items = [
|
||||
item for modality in supported_mm_limits
|
||||
for item in mm_kwargs[modality]
|
||||
]
|
||||
items = [item for modality in supported_mm_limits for item in mm_kwargs[modality]]
|
||||
return group_mm_kwargs_by_modality(
|
||||
items,
|
||||
merge_by_field_config=model_cls.merge_by_field_config,
|
||||
@@ -167,15 +174,17 @@ def initialize_dummy_model(
|
||||
cleanup_dist_env_and_memory()
|
||||
|
||||
|
||||
def get_model_id_to_test(
|
||||
model_arch_list: Iterable[str]) -> list[tuple[str, str]]:
|
||||
def get_model_id_to_test(model_arch_list: Iterable[str]) -> list[tuple[str, str]]:
|
||||
filtered_results = []
|
||||
for model_arch in model_arch_list:
|
||||
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
|
||||
if model_info.extras and model_arch in ARCH_NEEDS_EXTRAS:
|
||||
available_repos = list(
|
||||
map(lambda model_id: (model_arch, model_id),
|
||||
[model_info.default, *model_info.extras.values()]))
|
||||
map(
|
||||
lambda model_id: (model_arch, model_id),
|
||||
[model_info.default, *model_info.extras.values()],
|
||||
)
|
||||
)
|
||||
filtered_results.extend(available_repos)
|
||||
else:
|
||||
filtered_results.append((model_arch, model_info.default))
|
||||
@@ -183,8 +192,8 @@ def get_model_id_to_test(
|
||||
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"model_arch, model_id",
|
||||
get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys()))
|
||||
"model_arch, model_id", get_model_id_to_test(_MULTIMODAL_EXAMPLE_MODELS.keys())
|
||||
)
|
||||
def test_model_tensor_schema(model_arch: str, model_id: str):
|
||||
if model_arch in ARCH_TO_SKIP:
|
||||
pytest.skip(f"Skipping {model_arch} due to {ARCH_TO_SKIP[model_arch]}")
|
||||
@@ -193,12 +202,13 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
|
||||
|
||||
model_info = HF_EXAMPLE_MODELS.get_hf_info(model_arch)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip",
|
||||
check_max_version=False)
|
||||
model_info.check_transformers_version(on_fail="skip", check_max_version=False)
|
||||
|
||||
hf_overrides_fn = partial(dummy_hf_overrides,
|
||||
model_arch=model_arch,
|
||||
exist_overrides=model_info.hf_overrides)
|
||||
hf_overrides_fn = partial(
|
||||
dummy_hf_overrides,
|
||||
model_arch=model_arch,
|
||||
exist_overrides=model_info.hf_overrides,
|
||||
)
|
||||
|
||||
model_config = ModelConfig(
|
||||
model_id,
|
||||
@@ -256,8 +266,11 @@ def test_model_tensor_schema(model_arch: str, model_id: str):
|
||||
|
||||
with initialize_dummy_model(model_cls, model_config) as model:
|
||||
for modality, _, mm_kwargs in create_batched_mm_kwargs(
|
||||
model_cls, model_config, processor):
|
||||
model_cls, model_config, processor
|
||||
):
|
||||
for method_name in inputs_parse_methods:
|
||||
print(f"Testing `{method_name}` with modality={modality} "
|
||||
f"and mm_kwargs{list(mm_kwargs.keys())}")
|
||||
print(
|
||||
f"Testing `{method_name}` with modality={modality} "
|
||||
f"and mm_kwargs{list(mm_kwargs.keys())}"
|
||||
)
|
||||
getattr(model, method_name)(modality=modality, **mm_kwargs)
|
||||
|
||||
@@ -19,7 +19,7 @@ def create_repo_dummy_weights(repo: str) -> Iterable[tuple[str, torch.Tensor]]:
|
||||
"""Create weights from safetensors checkpoint metadata"""
|
||||
metadata = try_get_safetensors_metadata(repo)
|
||||
weight_names = list(metadata.weight_map.keys())
|
||||
with torch.device('meta'):
|
||||
with torch.device("meta"):
|
||||
return ((name, torch.empty(0)) for name in weight_names)
|
||||
|
||||
|
||||
@@ -61,7 +61,8 @@ def test_hf_model_weights_mapper(model_arch: str):
|
||||
hf_overrides=model_info.hf_overrides,
|
||||
skip_tokenizer_init=model_info.skip_tokenizer_init,
|
||||
enforce_eager=model_info.enforce_eager,
|
||||
dtype=model_info.dtype)
|
||||
dtype=model_info.dtype,
|
||||
)
|
||||
model_cls = MULTIMODAL_REGISTRY._get_model_cls(model_config)
|
||||
|
||||
original_weights = create_repo_dummy_weights(model_id)
|
||||
@@ -83,6 +84,7 @@ def test_hf_model_weights_mapper(model_arch: str):
|
||||
|
||||
weights_missing = ref_weight_names - weight_names
|
||||
weights_unmapped = weight_names - ref_weight_names
|
||||
assert (not weights_missing and not weights_unmapped), (
|
||||
assert not weights_missing and not weights_unmapped, (
|
||||
f"Following weights are not mapped correctly: {weights_unmapped}, "
|
||||
f"Missing expected weights: {weights_missing}.")
|
||||
f"Missing expected weights: {weights_missing}."
|
||||
)
|
||||
|
||||
Reference in New Issue
Block a user