Convert formatting to use ruff instead of yapf + isort (#26247)

Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
Harry Mellor
2025-10-05 15:06:22 +01:00
committed by GitHub
parent 17edd8a807
commit d6953beb91
1508 changed files with 115244 additions and 94146 deletions

View File

@@ -3,27 +3,40 @@
"""Common tests for testing .generate() functionality for single / multiple
image, embedding, and video support for different VLMs in vLLM.
"""
import math
import os
from collections import defaultdict
from pathlib import PosixPath
import pytest
from transformers import (AutoModel, AutoModelForImageTextToText,
AutoModelForTextToWaveform)
from transformers import (
AutoModel,
AutoModelForImageTextToText,
AutoModelForTextToWaveform,
)
from vllm.platforms import current_platform
from vllm.utils import identity
from ....conftest import (IMAGE_ASSETS, AudioTestAssets, HfRunner,
ImageTestAssets, VideoTestAssets, VllmRunner)
from ....utils import (create_new_process_for_each_test, large_gpu_mark,
multi_gpu_marks)
from ....conftest import (
IMAGE_ASSETS,
AudioTestAssets,
HfRunner,
ImageTestAssets,
VideoTestAssets,
VllmRunner,
)
from ....utils import create_new_process_for_each_test, large_gpu_mark, multi_gpu_marks
from ...utils import check_outputs_equal
from .vlm_utils import custom_inputs, model_utils, runners
from .vlm_utils.case_filtering import get_parametrized_options
from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
VLMTestInfo, VLMTestType)
from .vlm_utils.types import (
CustomTestOptions,
ExpandableVLMTestArgs,
VLMTestInfo,
VLMTestType,
)
# This hack is needed for phi3v & paligemma models
# ROCm Triton FA can run into shared memory issues with these models,
@@ -828,7 +841,7 @@ def _mark_splits(
new_test_settings = dict[str, VLMTestInfo]()
for i in range(num_groups):
models_in_group = models[i * split_size:(i + 1) * split_size]
models_in_group = models[i * split_size : (i + 1) * split_size]
for model in models_in_group:
for info in test_infos_by_model[model]:
@@ -859,7 +872,8 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
VLM_TEST_SETTINGS,
test_type=VLMTestType.IMAGE,
create_new_process_for_each_test=False,
))
),
)
def test_single_image_models(
tmp_path: PosixPath,
model_type: str,
@@ -885,7 +899,8 @@ def test_single_image_models(
VLM_TEST_SETTINGS,
test_type=VLMTestType.MULTI_IMAGE,
create_new_process_for_each_test=False,
))
),
)
def test_multi_image_models(
tmp_path: PosixPath,
model_type: str,
@@ -911,7 +926,8 @@ def test_multi_image_models(
VLM_TEST_SETTINGS,
test_type=VLMTestType.EMBEDDING,
create_new_process_for_each_test=False,
))
),
)
def test_image_embedding_models(
model_type: str,
test_case: ExpandableVLMTestArgs,
@@ -935,7 +951,8 @@ def test_image_embedding_models(
VLM_TEST_SETTINGS,
test_type=VLMTestType.VIDEO,
create_new_process_for_each_test=False,
))
),
)
def test_video_models(
model_type: str,
test_case: ExpandableVLMTestArgs,
@@ -959,7 +976,8 @@ def test_video_models(
VLM_TEST_SETTINGS,
test_type=VLMTestType.AUDIO,
create_new_process_for_each_test=False,
))
),
)
def test_audio_models(
model_type: str,
test_case: ExpandableVLMTestArgs,
@@ -983,7 +1001,8 @@ def test_audio_models(
VLM_TEST_SETTINGS,
test_type=VLMTestType.CUSTOM_INPUTS,
create_new_process_for_each_test=False,
))
),
)
def test_custom_inputs_models(
model_type: str,
test_case: ExpandableVLMTestArgs,
@@ -1006,7 +1025,8 @@ def test_custom_inputs_models(
VLM_TEST_SETTINGS,
test_type=VLMTestType.IMAGE,
create_new_process_for_each_test=True,
))
),
)
@create_new_process_for_each_test()
def test_single_image_models_heavy(
tmp_path: PosixPath,
@@ -1033,7 +1053,8 @@ def test_single_image_models_heavy(
VLM_TEST_SETTINGS,
test_type=VLMTestType.MULTI_IMAGE,
create_new_process_for_each_test=True,
))
),
)
@create_new_process_for_each_test()
def test_multi_image_models_heavy(
tmp_path: PosixPath,
@@ -1060,7 +1081,8 @@ def test_multi_image_models_heavy(
VLM_TEST_SETTINGS,
test_type=VLMTestType.EMBEDDING,
create_new_process_for_each_test=True,
))
),
)
@create_new_process_for_each_test()
def test_image_embedding_models_heavy(
model_type: str,
@@ -1085,7 +1107,8 @@ def test_image_embedding_models_heavy(
VLM_TEST_SETTINGS,
test_type=VLMTestType.VIDEO,
create_new_process_for_each_test=True,
))
),
)
def test_video_models_heavy(
model_type: str,
test_case: ExpandableVLMTestArgs,
@@ -1109,7 +1132,8 @@ def test_video_models_heavy(
VLM_TEST_SETTINGS,
test_type=VLMTestType.AUDIO,
create_new_process_for_each_test=True,
))
),
)
def test_audio_models_heavy(
model_type: str,
test_case: ExpandableVLMTestArgs,
@@ -1133,7 +1157,8 @@ def test_audio_models_heavy(
VLM_TEST_SETTINGS,
test_type=VLMTestType.CUSTOM_INPUTS,
create_new_process_for_each_test=True,
))
),
)
@create_new_process_for_each_test()
def test_custom_inputs_models_heavy(
model_type: str,

View File

@@ -10,8 +10,7 @@ from transformers import AutoModelForSpeechSeq2Seq
from vllm.logprobs import SampleLogprobs
from vllm.lora.request import LoRARequest
from ....conftest import (AudioTestAssets, HfRunner, PromptAudioInput,
VllmRunner)
from ....conftest import AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner
from ...registry import HF_EXAMPLE_MODELS
from ...utils import check_logprobs_close
@@ -64,50 +63,49 @@ def run_test(
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with vllm_runner(
model,
runner="generate",
max_model_len=max_model_len,
max_num_seqs=1,
dtype=dtype,
limit_mm_per_prompt={"audio": 1},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=64,
enforce_eager=True,
model,
runner="generate",
max_model_len=max_model_len,
max_num_seqs=1,
dtype=dtype,
limit_mm_per_prompt={"audio": 1},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=64,
enforce_eager=True,
) as vllm_model:
lora_request = LoRARequest("audio", 1, audio_lora_path)
vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=audios,
lora_request=lora_request)
vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=audios,
lora_request=lora_request,
)
for prompts, audios in inputs
]
with hf_runner(model, dtype=dtype,
auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
hf_processor = hf_model.processor
eos_token_id = hf_processor.tokenizer.eos_token_id
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=[audios],
eos_token_id=eos_token_id)
hf_model.generate_greedy_logprobs_limit(
prompts,
max_tokens,
num_logprobs=num_logprobs,
audios=[audios],
eos_token_id=eos_token_id,
)
for prompts, audios in inputs
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
vllm_outputs_per_case):
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=[
vllm_to_hf_output(output) for output in vllm_outputs
],
outputs_1_lst=[vllm_to_hf_output(output) for output in vllm_outputs],
name_0="hf",
name_1="vllm",
)
@@ -118,9 +116,16 @@ def run_test(
@pytest.mark.parametrize("max_model_len", [2048])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, model: str,
audio_assets: AudioTestAssets, dtype: str, max_model_len: int,
max_tokens: int, num_logprobs: int) -> None:
def test_models(
hf_runner,
vllm_runner,
model: str,
audio_assets: AudioTestAssets,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")

View File

@@ -28,8 +28,7 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
give the same result.
"""
image_cherry = convert_image_mode(
ImageAsset("cherry_blossom").pil_image, "RGB")
image_cherry = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
image_stop = convert_image_mode(ImageAsset("stop_sign").pil_image, "RGB")
images = [image_cherry, image_stop]
video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays
@@ -47,29 +46,30 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
),
]
with vllm_runner(model,
runner="generate",
dtype=dtype,
limit_mm_per_prompt={"image": 2},
max_model_len=32768,
max_num_seqs=2,
tensor_parallel_size=1,
enforce_eager=True) as vllm_model:
with vllm_runner(
model,
runner="generate",
dtype=dtype,
limit_mm_per_prompt={"image": 2},
max_model_len=32768,
max_num_seqs=2,
tensor_parallel_size=1,
enforce_eager=True,
) as vllm_model:
vllm_outputs_per_case = [
vllm_model.generate_greedy(prompts,
max_tokens,
images=images,
videos=videos)
vllm_model.generate_greedy(
prompts, max_tokens, images=images, videos=videos
)
for prompts, images, videos in inputs
]
all_results = [output[0][1] for output in vllm_outputs_per_case]
outputs = [(total_str, total_str.find("assistant\n") + len("assistant\n"))
for total_str in all_results]
prompt_lengths = [prompt_len for _, prompt_len in outputs]
generated_strs = [
total_str[prompt_len:] for total_str, prompt_len in outputs
outputs = [
(total_str, total_str.find("assistant\n") + len("assistant\n"))
for total_str in all_results
]
prompt_lengths = [prompt_len for _, prompt_len in outputs]
generated_strs = [total_str[prompt_len:] for total_str, prompt_len in outputs]
interleaved_prompt_len, noninterleaved_prompt_len = prompt_lengths
interleaved_output_str, noninterleaved_output_str = generated_strs

View File

@@ -18,13 +18,11 @@ from typing import Any
import pytest
import torch
from safetensors.torch import save_file
from transformers import (AutoConfig, AutoProcessor, AutoTokenizer,
GenerationConfig)
from transformers import AutoConfig, AutoProcessor, AutoTokenizer, GenerationConfig
from vllm import LLM, SamplingParams
from vllm.v1.executor.abstract import Executor
from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
FullAttentionSpec)
from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, FullAttentionSpec
from ....utils import multi_gpu_test
@@ -93,8 +91,7 @@ def get_rope_layers_config(model_path: str) -> list[int]:
def create_reduced_maverick_model(
original_model_name:
str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
original_model_name: str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
output_dir: str = "/tmp/reduced_maverick",
text_layers: int = 4,
num_experts: int = 4,
@@ -118,7 +115,8 @@ def create_reduced_maverick_model(
print(
f"Creating reduced Maverick model with {text_layers} text layers and "
f"{vision_layers} vision layers...")
f"{vision_layers} vision layers..."
)
# Create output directory
output_path = Path(output_dir)
@@ -126,19 +124,23 @@ def create_reduced_maverick_model(
if force_recreate:
shutil.rmtree(output_path)
else:
print(f"Output directory {output_dir} already exists. "
"Use --force-recreate to overwrite.")
print(
f"Output directory {output_dir} already exists. "
"Use --force-recreate to overwrite."
)
return str(output_path)
output_path.mkdir(parents=True, exist_ok=True)
try:
print("Loading original model configuration...")
original_config = AutoConfig.from_pretrained(original_model_name,
trust_remote_code=True)
original_config = AutoConfig.from_pretrained(
original_model_name, trust_remote_code=True
)
print("Creating reduced configuration...")
reduced_config = create_reduced_config(original_config, text_layers,
num_experts, vision_layers)
reduced_config = create_reduced_config(
original_config, text_layers, num_experts, vision_layers
)
config_path = output_path / "config.json"
with open(config_path, "w") as f:
@@ -149,8 +151,7 @@ def create_reduced_maverick_model(
copy_tokenizer_files(original_model_name, output_path)
print("Creating reduced safetensors files...")
create_reduced_safetensors(original_config, reduced_config,
output_path)
create_reduced_safetensors(original_config, reduced_config, output_path)
print("Creating preprocessor config...")
create_preprocessor_config(original_config, output_path)
@@ -173,9 +174,9 @@ def create_reduced_maverick_model(
raise
def create_reduced_config(original_config: Any, text_layers: int,
num_experts: int,
vision_layers: int) -> dict[str, Any]:
def create_reduced_config(
original_config: Any, text_layers: int, num_experts: int, vision_layers: int
) -> dict[str, Any]:
"""Create a reduced configuration based on the original."""
# Convert config to dictionary
@@ -185,23 +186,18 @@ def create_reduced_config(original_config: Any, text_layers: int,
if "text_config" in config_dict:
original_text_layers = config_dict["text_config"]["num_hidden_layers"]
config_dict["text_config"]["num_hidden_layers"] = text_layers
print(
f"Reduced text layers from {original_text_layers} to {text_layers}"
)
print(f"Reduced text layers from {original_text_layers} to {text_layers}")
original_num_experts = config_dict["text_config"]["num_local_experts"]
config_dict["text_config"]["num_local_experts"] = num_experts
print(
f"Reduced num experts from {original_num_experts} to {num_experts}"
)
print(f"Reduced num experts from {original_num_experts} to {num_experts}")
hidden_dim_divisor = 4
original_hidden_size = config_dict["text_config"]["hidden_size"]
new_hidden_size = original_hidden_size // hidden_dim_divisor
config_dict["text_config"]["hidden_size"] = new_hidden_size
print(f"Reduced hidden size from {original_hidden_size} to "
f"{new_hidden_size}")
print(f"Reduced hidden size from {original_hidden_size} to {new_hidden_size}")
original_head_dim = config_dict["text_config"]["head_dim"]
new_head_dim = original_head_dim // hidden_dim_divisor
@@ -210,15 +206,12 @@ def create_reduced_config(original_config: Any, text_layers: int,
# Reduce vision layers
if "vision_config" in config_dict:
original_vision_layers = config_dict["vision_config"][
"num_hidden_layers"]
original_vision_layers = config_dict["vision_config"]["num_hidden_layers"]
config_dict["vision_config"]["num_hidden_layers"] = vision_layers
print(f"Reduced vision layers from {original_vision_layers} "
f"to {vision_layers}")
print(f"Reduced vision layers from {original_vision_layers} to {vision_layers}")
# Update model name to indicate it's a reduced version
config_dict["_name_or_path"] = (
f"reduced_maverick_{text_layers}t_{vision_layers}v")
config_dict["_name_or_path"] = f"reduced_maverick_{text_layers}t_{vision_layers}v"
return config_dict
@@ -227,16 +220,16 @@ def copy_tokenizer_files(original_model_name: str, output_path: Path) -> None:
"""Copy tokenizer files from the original model."""
try:
tokenizer = AutoTokenizer.from_pretrained(original_model_name,
trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(
original_model_name, trust_remote_code=True
)
tokenizer.save_pretrained(output_path)
print("Tokenizer files copied successfully")
except Exception as e:
print(f"Warning: Could not copy tokenizer files: {e}")
def create_preprocessor_config(original_config: Any,
output_path: Path) -> None:
def create_preprocessor_config(original_config: Any, output_path: Path) -> None:
"""Create preprocessor_config.json for multimodal model."""
# Try to load the original preprocessor config
@@ -254,9 +247,9 @@ def create_preprocessor_config(original_config: Any,
raise
def create_reduced_safetensors(original_config: Any, reduced_config: dict[str,
Any],
output_path: Path) -> None:
def create_reduced_safetensors(
original_config: Any, reduced_config: dict[str, Any], output_path: Path
) -> None:
"""Create safetensors files with weights for the reduced model."""
print("Generating synthetic weights for reduced model...")
@@ -279,8 +272,7 @@ def create_reduced_safetensors(original_config: Any, reduced_config: dict[str,
save_weights_to_safetensors(weights, output_path)
def create_text_model_weights(
text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
def create_text_model_weights(text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
"""Create synthetic weights for the text model with MoE structure."""
weights = {}
@@ -291,19 +283,18 @@ def create_text_model_weights(
intermediate_size_mlp = text_config["intermediate_size_mlp"]
num_layers = text_config["num_hidden_layers"]
num_attention_heads = text_config["num_attention_heads"]
num_key_value_heads = text_config.get("num_key_value_heads",
num_attention_heads)
num_key_value_heads = text_config.get("num_key_value_heads", num_attention_heads)
# MoE specific parameters
num_experts = text_config.get("num_local_experts")
assert (num_experts
is not None), "num_local_experts must be specified for MoE"
assert num_experts is not None, "num_local_experts must be specified for MoE"
head_dim = hidden_size // num_attention_heads
# Embedding layers
weights["language_model.model.embed_tokens.weight"] = torch.randn(
vocab_size, hidden_size, dtype=torch.float16)
vocab_size, hidden_size, dtype=torch.float16
)
# Transformer layers
for layer_idx in range(num_layers):
@@ -312,95 +303,105 @@ def create_text_model_weights(
# Self-attention weights (separate q, k, v projections)
weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16)
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16)
hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16)
num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16)
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16
)
print("Self-attention weights created.")
# Feed-forward weights - MoE pattern based on interleave_moe_layer_step
# For interleave_moe_layer_step=2: layers 1,3,5,... are MoE, layers
# 0,2,4,... are dense
interleave_step = text_config.get("interleave_moe_layer_step", 1)
is_moe_layer = (interleave_step > 0
and (layer_idx + 1) % interleave_step == 0)
is_moe_layer = interleave_step > 0 and (layer_idx + 1) % interleave_step == 0
if is_moe_layer:
# MoE layer structure
# 1. Router weights
weights[
f"{layer_prefix}.feed_forward.router.weight"] = torch.randn(
num_experts, hidden_size, dtype=torch.float16)
weights[f"{layer_prefix}.feed_forward.router.weight"] = torch.randn(
num_experts, hidden_size, dtype=torch.float16
)
# 2. Individual expert weights (not fused)
for expert_idx in range(num_experts):
expert_prefix = (
f"{layer_prefix}.feed_forward.experts.{expert_idx}")
expert_prefix = f"{layer_prefix}.feed_forward.experts.{expert_idx}"
weights[f"{expert_prefix}.gate_proj.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16)
intermediate_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{expert_prefix}.up_proj.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16)
intermediate_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{expert_prefix}.down_proj.weight"] = torch.randn(
hidden_size, intermediate_size, dtype=torch.bfloat16)
hidden_size, intermediate_size, dtype=torch.bfloat16
)
# Expert weight scales (FP8 quantization)
weights[
f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
intermediate_size, 1, dtype=torch.bfloat16)
weights[f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
intermediate_size, 1, dtype=torch.bfloat16
)
weights[f"{expert_prefix}.up_proj.weight_scale"] = torch.ones(
intermediate_size, 1, dtype=torch.bfloat16)
weights[
f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
hidden_size, 1, dtype=torch.bfloat16)
intermediate_size, 1, dtype=torch.bfloat16
)
weights[f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
hidden_size, 1, dtype=torch.bfloat16
)
# 3. Shared expert weights
shared_expert_prefix = f"{layer_prefix}.feed_forward.shared_expert"
weights[f"{shared_expert_prefix}.gate_proj.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16)
intermediate_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{shared_expert_prefix}.up_proj.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16)
intermediate_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{shared_expert_prefix}.down_proj.weight"] = torch.randn(
hidden_size, intermediate_size, dtype=torch.bfloat16)
hidden_size, intermediate_size, dtype=torch.bfloat16
)
print(f"MoE feed-forward weights created for layer {layer_idx}.")
else:
# Dense layer structure
weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = (
torch.randn(intermediate_size_mlp,
hidden_size,
dtype=torch.bfloat16))
weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = (
torch.randn(intermediate_size_mlp,
hidden_size,
dtype=torch.bfloat16))
weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = (
torch.randn(hidden_size,
intermediate_size_mlp,
dtype=torch.bfloat16))
weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = torch.randn(
intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = torch.randn(
intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = torch.randn(
hidden_size, intermediate_size_mlp, dtype=torch.bfloat16
)
print(f"Dense feed-forward weights created for layer {layer_idx}.")
# Layer norms
weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16)
weights[
f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16
)
print("Layer norms created.")
# Final layer norm and output projection
weights["language_model.model.norm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights["language_model.lm_head.weight"] = torch.randn(
vocab_size, hidden_size, dtype=torch.bfloat16)
vocab_size, hidden_size, dtype=torch.bfloat16
)
return weights
def create_vision_model_weights(
vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
vision_config: dict[str, Any],
) -> dict[str, torch.Tensor]:
"""Create synthetic weights for the vision model."""
weights = {}
@@ -414,47 +415,62 @@ def create_vision_model_weights(
layer_prefix = f"vision_model.model.layers.{layer_idx}"
weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
hidden_size, hidden_size, dtype=torch.bfloat16)
hidden_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.q_proj.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
hidden_size, hidden_size, dtype=torch.bfloat16)
hidden_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.k_proj.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
hidden_size, hidden_size, dtype=torch.bfloat16)
hidden_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.v_proj.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
hidden_size, hidden_size, dtype=torch.bfloat16)
hidden_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.self_attn.o_proj.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.mlp.fc1.weight"] = torch.randn(
intermediate_size, hidden_size, dtype=torch.bfloat16)
intermediate_size, hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.mlp.fc1.bias"] = torch.zeros(
intermediate_size, dtype=torch.bfloat16)
intermediate_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.mlp.fc2.weight"] = torch.randn(
hidden_size, intermediate_size, dtype=torch.bfloat16)
hidden_size, intermediate_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.mlp.fc2.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.input_layernorm.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
weights[
f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
hidden_size, dtype=torch.bfloat16
)
weights[f"{layer_prefix}.post_attention_layernorm.bias"] = torch.zeros(
hidden_size, dtype=torch.bfloat16)
hidden_size, dtype=torch.bfloat16
)
return weights
def create_shared_weights(
text_config: dict[str, Any],
vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
text_config: dict[str, Any], vision_config: dict[str, Any]
) -> dict[str, torch.Tensor]:
"""Create weights for shared components (vision-language connector)"""
weights = {}
@@ -464,13 +480,15 @@ def create_shared_weights(
# Vision-language connector (projects vision features to text space)
weights["multi_modal_projector.linear_1.weight"] = torch.randn(
text_hidden_size, projector_input_dim, dtype=torch.bfloat16)
text_hidden_size, projector_input_dim, dtype=torch.bfloat16
)
return weights
def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
output_path: Path) -> None:
def save_weights_to_safetensors(
weights: dict[str, torch.Tensor], output_path: Path
) -> None:
"""Save weights to safetensors files and create index."""
# Determine how to shard the weights
@@ -507,18 +525,18 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
else:
# Multiple shards
for i, shard in enumerate(shards):
filename = f"model-{i+1:05d}-of-{len(shards):05d}.safetensors"
filename = f"model-{i + 1:05d}-of-{len(shards):05d}.safetensors"
save_file(shard, output_path / filename)
for name in shard:
weight_map[name] = filename
print(f"Saved shard {i+1}/{len(shards)}: {filename}")
print(f"Saved shard {i + 1}/{len(shards)}: {filename}")
# Create index file
index_data = {
"metadata": {
"total_size":
sum(tensor.numel() * tensor.element_size()
for tensor in weights.values())
"total_size": sum(
tensor.numel() * tensor.element_size() for tensor in weights.values()
)
},
"weight_map": weight_map,
}
@@ -528,8 +546,9 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
json.dump(index_data, f, indent=2)
print(f"Created index file: {index_path}")
print(f"Total model size: "
f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB")
print(
f"Total model size: {index_data['metadata']['total_size'] / (1024**3):.2f} GB"
)
def check_attention_spec_interleaved_rope(
@@ -540,8 +559,7 @@ def check_attention_spec_interleaved_rope(
):
"""Check that the attention spec is correct."""
assert isinstance(llm.llm_engine.model_executor, Executor)
kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs(
)
kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs()
for rank in range(num_ranks):
kv_cache_specs = kv_cache_specs_per_rank[rank]
assert len(kv_cache_specs.keys()) == num_attention_layers
@@ -551,16 +569,14 @@ def check_attention_spec_interleaved_rope(
else:
expected_spec = ChunkedLocalAttentionSpec
assert isinstance(
kv_cache_specs[
f"language_model.model.layers.{i}.self_attn.attn"],
expected_spec)
kv_cache_specs[f"language_model.model.layers.{i}.self_attn.attn"],
expected_spec,
)
def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
"""Test the created reduced model with vLLM."""
sampling_params = SamplingParams(temperature=0.8,
top_p=0.95,
max_tokens=50)
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50)
if should_profile:
llm.start_profile()
@@ -571,15 +587,15 @@ def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
print("Test generation successful!")
for output in outputs:
print(f"Prompt: {output.prompt}")
print(f"Output: "
f"{output.outputs[0].text}")
print(f"Output: {output.outputs[0].text}")
print("-" * 40)
@multi_gpu_test(num_gpus=2)
@pytest.mark.parametrize(
"original_model_name,text_layers,num_experts,vision_layers,",
[("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)])
[("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)],
)
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("tp,ep", [(2, True)])
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
@@ -640,7 +656,8 @@ def main():
import argparse
parser = argparse.ArgumentParser(
description="Create a reduced-layer Maverick model")
description="Create a reduced-layer Maverick model"
)
parser.add_argument(
"--output-dir",
default="/tmp/reduced_maverick",
@@ -652,10 +669,7 @@ def main():
default=4,
help="Number of text transformer layers",
)
parser.add_argument("--num-experts",
type=int,
default=4,
help="Number of experts")
parser.add_argument("--num-experts", type=int, default=4, help="Number of experts")
parser.add_argument(
"--vision-layers",
type=int,
@@ -667,12 +681,12 @@ def main():
action="store_true",
help="Force recreation if output directory exists",
)
parser.add_argument("--test",
action="store_true",
help="Test the created model with vLLM")
parser.add_argument("--profile",
action="store_true",
help="Profile the created model with vLLM")
parser.add_argument(
"--test", action="store_true", help="Test the created model with vLLM"
)
parser.add_argument(
"--profile", action="store_true", help="Profile the created model with vLLM"
)
parser.add_argument(
"--test-original",
action="store_true",
@@ -687,16 +701,18 @@ def main():
args = parser.parse_args()
if args.test:
test_dummy_maverick(original_model_name=args.original_model,
output_dir=args.output_dir,
text_layers=args.text_layers,
num_experts=args.num_experts,
vision_layers=args.vision_layers,
force_recreate=args.force_recreate,
tp=2,
ep=True,
enforce_eager=True,
profile=args.profile)
test_dummy_maverick(
original_model_name=args.original_model,
output_dir=args.output_dir,
text_layers=args.text_layers,
num_experts=args.num_experts,
vision_layers=args.vision_layers,
force_recreate=args.force_recreate,
tp=2,
ep=True,
enforce_eager=True,
profile=args.profile,
)
if args.test_original:
run_maverick_serving(args.original_model)

View File

@@ -14,26 +14,35 @@ from vllm.lora.request import LoRARequest
from vllm.multimodal.image import rescale_image_size
from vllm.platforms import current_platform
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
PromptImageInput, VllmRunner)
from ....conftest import (
IMAGE_ASSETS,
HfRunner,
PromptAudioInput,
PromptImageInput,
VllmRunner,
)
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom":
"<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
})
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom": "<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
}
)
HF_MULTIIMAGE_IMAGE_PROMPT = (
"<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
)
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct",
revision="refs/pr/70")
model_path = snapshot_download(
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
)
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
speech_question = os.path.join(model_path, "examples",
"what_is_shown_in_this_image.wav")
speech_question = os.path.join(
model_path, "examples", "what_is_shown_in_this_image.wav"
)
models = [model_path]
target_dtype = "half"
@@ -48,8 +57,7 @@ if current_platform.is_rocm():
def run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
inputs: Sequence[tuple[list[str], PromptImageInput,
Optional[PromptAudioInput]]],
inputs: Sequence[tuple[list[str], PromptImageInput, Optional[PromptAudioInput]]],
model: str,
*,
max_model_len: int,
@@ -75,28 +83,30 @@ def run_test(
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with vllm_runner(
model,
task="generate",
max_model_len=max_model_len,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=320,
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
enforce_eager=True,
trust_remote_code=False,
model,
task="generate",
max_model_len=max_model_len,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=320,
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
enforce_eager=True,
trust_remote_code=False,
) as vllm_model:
lora_request = LoRARequest("vision", 1, vision_lora_path)
vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
lora_request=lora_request)
vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
lora_request=lora_request,
)
for prompts, images, audios in inputs
]
@@ -108,17 +118,18 @@ def run_test(
hf_processor = hf_model.processor
eos_token_id = hf_processor.tokenizer.eos_token_id
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
eos_token_id=eos_token_id)
hf_model.generate_greedy_logprobs_limit(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
eos_token_id=eos_token_id,
)
for prompts, images, audios in inputs
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
vllm_outputs_per_case):
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
@@ -145,16 +156,27 @@ def run_test(
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
dtype: str, max_model_len: int, max_tokens: int,
num_logprobs: int) -> None:
def test_models(
hf_runner,
vllm_runner,
image_assets,
model,
size_factors,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
None,
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
inputs_per_image = [
(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
None,
)
for image, prompt in zip(images, HF_IMAGE_PROMPTS)
]
run_test(
hf_runner,
@@ -189,16 +211,26 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
@pytest.mark.parametrize("max_model_len", [25600])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
size_factors, dtype: str, max_model_len: int,
max_tokens: int, num_logprobs: int) -> None:
def test_multi_images_models(
hf_runner,
vllm_runner,
image_assets,
model,
size_factors,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case = [
(
[HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
[[rescale_image_size(image, factor) for image in images]
for factor in size_factors],
[
[rescale_image_size(image, factor) for image in images]
for factor in size_factors
],
None,
),
]
@@ -222,10 +254,15 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
max_model_len: int, max_tokens: int,
num_logprobs: int) -> None:
def test_vision_speech_models(
hf_runner,
vllm_runner,
model,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
# use the example speech question so that the model outputs are reasonable
audio = librosa.load(speech_question, sr=16000)
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")

View File

@@ -17,31 +17,39 @@ from vllm.lora.request import LoRARequest
from vllm.multimodal.image import convert_image_mode, rescale_image_size
from vllm.platforms import current_platform
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
PromptImageInput, VllmRunner)
from ....conftest import (
IMAGE_ASSETS,
HfRunner,
PromptAudioInput,
PromptImageInput,
VllmRunner,
)
from ....utils import large_gpu_test
from ...utils import check_logprobs_close
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
"<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom":
"<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
})
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
"cherry_blossom": "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
}
)
HF_MULTIIMAGE_IMAGE_PROMPT = (
"<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
)
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
# Since the vision-lora and speech-lora co-exist with the base model,
# we have to manually specify the path of the lora weights.
vision_lora_path = os.path.join(model_path, "vision-lora")
speech_question = os.path.join(model_path, "examples",
"what_is_shown_in_this_image.wav")
speech_question = os.path.join(
model_path, "examples", "what_is_shown_in_this_image.wav"
)
models = [model_path]
def vllm_to_hf_output(vllm_output: tuple[list[int], str,
Optional[SampleLogprobs]],
model: str):
def vllm_to_hf_output(
vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], model: str
):
"""Sanitize vllm output to be comparable with hf output."""
_, output_str, out_logprobs = vllm_output
@@ -71,8 +79,7 @@ if current_platform.is_rocm():
def run_test(
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
inputs: Sequence[tuple[list[str], PromptImageInput,
Optional[PromptAudioInput]]],
inputs: Sequence[tuple[list[str], PromptImageInput, Optional[PromptAudioInput]]],
model: str,
*,
max_model_len: int,
@@ -98,27 +105,29 @@ def run_test(
# will hurt multiprocessing backend with fork method (the default method).
# max_model_len should be greater than image_feature_size
with vllm_runner(
model,
runner="generate",
max_model_len=max_model_len,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=320,
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
enforce_eager=True,
model,
runner="generate",
max_model_len=max_model_len,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enable_lora=True,
max_lora_rank=320,
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
enforce_eager=True,
) as vllm_model:
lora_request = LoRARequest("vision", 1, vision_lora_path)
vllm_outputs_per_case = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
lora_request=lora_request)
vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
lora_request=lora_request,
)
for prompts, images, audios in inputs
]
@@ -127,42 +136,36 @@ def run_test(
pytest.skip("HF impl is not compatible with current transformers")
hf_model_kwargs = {"_attn_implementation": "sdpa"}
with hf_runner(model, dtype=dtype,
model_kwargs=hf_model_kwargs) as hf_model:
with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model:
hf_processor = hf_model.processor
eos_token_id = hf_processor.tokenizer.eos_token_id
def patch_hf_processor(*args,
text="",
images=None,
audio=None,
sampling_rate=None,
**kwargs):
def patch_hf_processor(
*args, text="", images=None, audio=None, sampling_rate=None, **kwargs
):
audios = None
if audio is not None and sampling_rate is not None:
audios = [(audio, sampling_rate)]
return hf_processor(*args,
text=text,
images=images,
audios=audios,
**kwargs)
return hf_processor(
*args, text=text, images=images, audios=audios, **kwargs
)
hf_model.processor = patch_hf_processor
hf_outputs_per_case = [
hf_model.generate_greedy_logprobs_limit(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
eos_token_id=eos_token_id,
num_logits_to_keep=0)
hf_model.generate_greedy_logprobs_limit(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images,
audios=audios,
eos_token_id=eos_token_id,
num_logits_to_keep=0,
)
for prompts, images, audios in inputs
]
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
vllm_outputs_per_case):
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
check_logprobs_close(
outputs_0_lst=hf_outputs,
outputs_1_lst=vllm_outputs,
@@ -189,16 +192,27 @@ def run_test(
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
dtype: str, max_model_len: int, max_tokens: int,
num_logprobs: int) -> None:
def test_models(
hf_runner,
vllm_runner,
image_assets,
model,
size_factors,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_image = [(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
None,
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
inputs_per_image = [
(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
None,
)
for image, prompt in zip(images, HF_IMAGE_PROMPTS)
]
run_test(
hf_runner,
@@ -233,16 +247,26 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
@pytest.mark.parametrize("max_model_len", [25600])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
size_factors, dtype: str, max_model_len: int,
max_tokens: int, num_logprobs: int) -> None:
def test_multi_images_models(
hf_runner,
vllm_runner,
image_assets,
model,
size_factors,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case = [
(
[HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
[[rescale_image_size(image, factor) for image in images]
for factor in size_factors],
[
[rescale_image_size(image, factor) for image in images]
for factor in size_factors
],
None,
),
]
@@ -266,10 +290,15 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
@pytest.mark.parametrize("max_model_len", [12800])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
max_model_len: int, max_tokens: int,
num_logprobs: int) -> None:
def test_vision_speech_models(
hf_runner,
vllm_runner,
model,
dtype: str,
max_model_len: int,
max_tokens: int,
num_logprobs: int,
) -> None:
# use the example speech question so that the model outputs are reasonable
audio = librosa.load(speech_question, sr=None)
image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")

View File

@@ -37,33 +37,33 @@ PROMPT = "Describe each image in one short sentence."
def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]:
return [{
"role":
"user",
"content": [{
"type": "text",
"text": PROMPT,
}] + [{
"type": "image_url",
"image_url": {
"url": url
}
} for url in urls],
}]
return [
{
"role": "user",
"content": [
{
"type": "text",
"text": PROMPT,
}
]
+ [{"type": "image_url", "image_url": {"url": url}} for url in urls],
}
]
def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]:
return [{
"role":
"user",
"content": [{
"type": "text",
"content": PROMPT,
}, *({
"type": "image",
"image": download_image(url)
} for url in urls)],
}]
return [
{
"role": "user",
"content": [
{
"type": "text",
"content": PROMPT,
},
*({"type": "image", "image": download_image(url)} for url in urls),
],
}
]
def _create_engine_inputs(urls: list[str]) -> TokensPrompt:
@@ -125,11 +125,17 @@ def _dump_outputs_w_logprobs(
outputs: OutputsLogprobs,
filename: "StrPath",
) -> None:
json_data = [(tokens, text, [{
k: asdict(v)
for k, v in token_logprobs.items()
} for token_logprobs in (logprobs or [])])
for tokens, text, logprobs in outputs]
json_data = [
(
tokens,
text,
[
{k: asdict(v) for k, v in token_logprobs.items()}
for token_logprobs in (logprobs or [])
],
)
for tokens, text, logprobs in outputs
]
with open(filename, "w") as f:
json.dump(json_data, f)
@@ -139,28 +145,35 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
with open(filename, "rb") as f:
json_data = json.load(f)
return [(tokens, text, [{
int(k): Logprob(**v)
for k, v in token_logprobs.items()
} for token_logprobs in logprobs]) for tokens, text, logprobs in json_data]
return [
(
tokens,
text,
[
{int(k): Logprob(**v) for k, v in token_logprobs.items()}
for token_logprobs in logprobs
],
)
for tokens, text, logprobs in json_data
]
@large_gpu_test(min_gb=80)
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
@pytest.mark.parametrize("dtype", ["bfloat16"])
def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
local_asset_server) -> None:
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(
FIXTURE_LOGPROBS_CHAT[model])
def test_chat(
vllm_runner, max_model_len: int, model: str, dtype: str, local_asset_server
) -> None:
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])
with vllm_runner(
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
max_model_len=max_model_len,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
model,
dtype=dtype,
tokenizer_mode="mistral",
load_format="mistral",
config_format="mistral",
max_model_len=max_model_len,
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
) as vllm_model:
outputs = []
@@ -180,7 +193,9 @@ def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
for i in range(len(logprobs)):
assert logprobs[i][-1] is None
logprobs[i] = logprobs[i][:-1]
check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
outputs_1_lst=logprobs,
name_0="h100_ref",
name_1="output")
check_logprobs_close(
outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
outputs_1_lst=logprobs,
name_0="h100_ref",
name_1="output",
)

View File

@@ -17,14 +17,15 @@ def qwen2_5_vl_chat_template(*query):
return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501
VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
"baby_reading":
qwen2_5_vl_chat_template(
VIDEO_PLACEHOLDER,
"Describe this video with a short sentence ",
"(no more than 20 words)",
),
})
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
{
"baby_reading": qwen2_5_vl_chat_template(
VIDEO_PLACEHOLDER,
"Describe this video with a short sentence ",
"(no more than 20 words)",
),
}
)
@pytest.mark.core_model
@@ -33,10 +34,15 @@ VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
video_pruning_rate: float,
num_frames: int, dtype: str,
max_tokens: int) -> None:
def test_qwen2_5_vl_evs_functionality(
vllm_runner,
video_assets,
model,
video_pruning_rate: float,
num_frames: int,
dtype: str,
max_tokens: int,
) -> None:
"""Test EVS (Efficient Video Sampling) functionality with different
pruning rates.
"""
@@ -51,19 +57,18 @@ def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
videos = [sampled_vids[0]]
# Initialize model with EVS configuration
with vllm_runner(model,
runner="generate",
max_model_len=4000,
max_num_seqs=1,
dtype=dtype,
limit_mm_per_prompt={"video": 1},
tensor_parallel_size=1,
video_pruning_rate=video_pruning_rate) as vllm_model:
with vllm_runner(
model,
runner="generate",
max_model_len=4000,
max_num_seqs=1,
dtype=dtype,
limit_mm_per_prompt={"video": 1},
tensor_parallel_size=1,
video_pruning_rate=video_pruning_rate,
) as vllm_model:
# Generate output - this should not crash
outputs = vllm_model.generate_greedy(prompts,
max_tokens,
videos=videos)
outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
# Basic validation that we got a response
assert len(outputs) == 1
@@ -83,10 +88,15 @@ def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
@pytest.mark.parametrize("num_frames", [16])
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
def test_qwen2_5_vl_evs_batched_videos(vllm_runner, video_assets, model,
video_pruning_rate: float,
num_frames: int, dtype: str,
max_tokens: int) -> None:
def test_qwen2_5_vl_evs_batched_videos(
vllm_runner,
video_assets,
model,
video_pruning_rate: float,
num_frames: int,
dtype: str,
max_tokens: int,
) -> None:
"""Test EVS functionality with batched videos.
This test validates that:
@@ -102,23 +112,21 @@ def test_qwen2_5_vl_evs_batched_videos(vllm_runner, video_assets, model,
# Test batched videos
prompts = [VIDEO_PROMPTS[0], VIDEO_PROMPTS[0]]
videos = [sampled_vids[0],
sampled_vids[0]] # Use same video twice for testing
videos = [sampled_vids[0], sampled_vids[0]] # Use same video twice for testing
# Initialize model with EVS configuration
with vllm_runner(model,
runner="generate",
max_model_len=4000,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"video": 2},
tensor_parallel_size=1,
video_pruning_rate=video_pruning_rate) as vllm_model:
with vllm_runner(
model,
runner="generate",
max_model_len=4000,
max_num_seqs=2,
dtype=dtype,
limit_mm_per_prompt={"video": 2},
tensor_parallel_size=1,
video_pruning_rate=video_pruning_rate,
) as vllm_model:
# Generate output - this should not crash
outputs = vllm_model.generate_greedy(prompts,
max_tokens,
videos=videos)
outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
# Basic validation that we got responses for both videos
assert len(outputs) == 2

View File

@@ -11,8 +11,13 @@ from PIL import Image
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
PromptVideoInput, VllmRunner)
from ....conftest import (
IMAGE_ASSETS,
VIDEO_ASSETS,
PromptImageInput,
PromptVideoInput,
VllmRunner,
)
from ...utils import check_logprobs_close
@@ -34,28 +39,29 @@ def qwen2_vl_chat_template(*query):
return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501
IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
"stop_sign":
qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
"What is the biggest text's content in this image?",
),
"cherry_blossom":
qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
"What is the season shown in this image? ",
"Reply with a short sentence (no more than 20 words)",
),
})
IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
{
"stop_sign": qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
"What is the biggest text's content in this image?",
),
"cherry_blossom": qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
"What is the season shown in this image? ",
"Reply with a short sentence (no more than 20 words)",
),
}
)
VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
"baby_reading":
qwen2_vl_chat_template(
VIDEO_PLACEHOLDER,
"Describe this video with a short sentence ",
"(no more than 20 words)",
),
})
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
{
"baby_reading": qwen2_vl_chat_template(
VIDEO_PLACEHOLDER,
"Describe this video with a short sentence ",
"(no more than 20 words)",
),
}
)
MULTIIMAGE_PROMPT = qwen2_vl_chat_template(
IMAGE_PLACEHOLDER,
@@ -77,17 +83,19 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
def batch_make_image_embeddings(
image_batches: list[Union[Image.Image, list[Image.Image]]], processor,
llm: VllmRunner) -> list[Qwen2VLPromptImageEmbeddingInput]:
image_batches: list[Union[Image.Image, list[Image.Image]]],
processor,
llm: VllmRunner,
) -> list[Qwen2VLPromptImageEmbeddingInput]:
"""batched image embeddings for Qwen2-VL
This will infer all images' embeddings in a single batch,
This will infer all images' embeddings in a single batch,
and split the result according to input batches.
image_batches:
- Single-image batches: `list[Image.Image]`
- Multiple-image batches: `list[list[Image.Image]]]`
returns: `list[Qwen2VLPromptImageEmbeddingInput]`
"""
@@ -108,9 +116,9 @@ def batch_make_image_embeddings(
# image to pixel values
image_processor = processor.image_processor
preprocess_result = image_processor \
.preprocess(images=images, return_tensors="pt") \
.data
preprocess_result = image_processor.preprocess(
images=images, return_tensors="pt"
).data
pixel_values = preprocess_result["pixel_values"]
image_grid_thw = preprocess_result["image_grid_thw"]
@@ -119,12 +127,13 @@ def batch_make_image_embeddings(
with torch.no_grad():
visual = model.visual
pixel_values_on_device = pixel_values.to(visual.device,
dtype=visual.dtype)
image_grid_thw_on_device = image_grid_thw.to(visual.device,
dtype=torch.int64)
return visual(pixel_values_on_device,
grid_thw=image_grid_thw_on_device).cpu()
pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
image_grid_thw_on_device = image_grid_thw.to(
visual.device, dtype=torch.int64
)
return visual(
pixel_values_on_device, grid_thw=image_grid_thw_on_device
).cpu()
image_embeds = torch.concat(llm.apply_model(get_image_embeds))
@@ -137,16 +146,21 @@ def batch_make_image_embeddings(
merge_size = image_processor.merge_size
cur_batch_embed_len = sum(
grid_thw.prod(-1) // merge_size // merge_size
for grid_thw in image_grid_thw[image_counter:image_counter +
cur_batch_image_count])
for grid_thw in image_grid_thw[
image_counter : image_counter + cur_batch_image_count
]
)
result.append({
"image_embeds":
image_embeds[embed_counter:embed_counter + cur_batch_embed_len],
"image_grid_thw":
image_grid_thw[image_counter:image_counter +
cur_batch_image_count],
})
result.append(
{
"image_embeds": image_embeds[
embed_counter : embed_counter + cur_batch_embed_len
],
"image_grid_thw": image_grid_thw[
image_counter : image_counter + cur_batch_image_count
],
}
)
embed_counter += cur_batch_embed_len
image_counter += cur_batch_image_count
@@ -160,13 +174,13 @@ def batch_make_image_embeddings(
def batch_make_video_embeddings(
video_batches: PromptVideoInput, processor,
llm: VllmRunner) -> list[Qwen2VLPromptVideoEmbeddingInput]:
video_batches: PromptVideoInput, processor, llm: VllmRunner
) -> list[Qwen2VLPromptVideoEmbeddingInput]:
"""batched video embeddings for Qwen2-VL
A NDArray represents a single video's all frames.
This will infer all videos' embeddings in a single batch,
This will infer all videos' embeddings in a single batch,
and split the result according to input batches.
video_batches:
@@ -191,9 +205,9 @@ def batch_make_video_embeddings(
# video to pixel values
image_processor = processor.image_processor
preprocess_result = image_processor \
.preprocess(images=None, videos=videos, return_tensors="pt") \
.data
preprocess_result = image_processor.preprocess(
images=None, videos=videos, return_tensors="pt"
).data
pixel_values = preprocess_result["pixel_values_videos"]
video_grid_thw = preprocess_result["video_grid_thw"]
@@ -202,12 +216,13 @@ def batch_make_video_embeddings(
with torch.no_grad():
visual = model.visual
pixel_values_on_device = pixel_values.to(visual.device,
dtype=visual.dtype)
video_grid_thw_on_device = video_grid_thw.to(visual.device,
dtype=torch.int64)
return visual(pixel_values_on_device,
grid_thw=video_grid_thw_on_device).cpu()
pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
video_grid_thw_on_device = video_grid_thw.to(
visual.device, dtype=torch.int64
)
return visual(
pixel_values_on_device, grid_thw=video_grid_thw_on_device
).cpu()
video_embeds = torch.concat(llm.apply_model(get_image_embeds))
@@ -220,16 +235,21 @@ def batch_make_video_embeddings(
merge_size = image_processor.merge_size
cur_batch_embed_len = sum(
grid_thw.prod(-1) // merge_size // merge_size
for grid_thw in video_grid_thw[video_counter:video_counter +
cur_batch_video_count])
for grid_thw in video_grid_thw[
video_counter : video_counter + cur_batch_video_count
]
)
result.append({
"video_embeds":
video_embeds[embed_counter:embed_counter + cur_batch_embed_len],
"video_grid_thw":
video_grid_thw[video_counter:video_counter +
cur_batch_video_count],
})
result.append(
{
"video_embeds": video_embeds[
embed_counter : embed_counter + cur_batch_embed_len
],
"video_grid_thw": video_grid_thw[
video_counter : video_counter + cur_batch_video_count
],
}
)
embed_counter += cur_batch_embed_len
video_counter += cur_batch_video_count
@@ -263,25 +283,24 @@ def run_embedding_input_test(
# max_model_len should be greater than image_feature_size
with vllm_runner(
model,
runner="generate",
max_model_len=4000,
max_num_seqs=3,
dtype=dtype,
limit_mm_per_prompt={
"image": mm_limit,
"video": mm_limit
},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
default_torch_num_threads=1,
model,
runner="generate",
max_model_len=4000,
max_num_seqs=3,
dtype=dtype,
limit_mm_per_prompt={"image": mm_limit, "video": mm_limit},
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
default_torch_num_threads=1,
) as vllm_model:
outputs_per_case_for_original_input = [
vllm_model.generate_greedy_logprobs(prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images or None,
videos=videos or None)
vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=images or None,
videos=videos or None,
)
for prompts, images, videos in inputs
]
@@ -290,17 +309,19 @@ def run_embedding_input_test(
prompts,
max_tokens,
num_logprobs=num_logprobs,
images=batch_make_image_embeddings(
images, processor, vllm_model) if images else None,
videos=batch_make_video_embeddings(
videos, processor, vllm_model) if videos else None)
images=batch_make_image_embeddings(images, processor, vllm_model)
if images
else None,
videos=batch_make_video_embeddings(videos, processor, vllm_model)
if videos
else None,
)
for prompts, images, videos in inputs
]
for outputs_for_original_input, \
outputs_for_embeddings_input \
in zip(outputs_per_case_for_original_input,
outputs_per_case_for_embeddings_input):
for outputs_for_original_input, outputs_for_embeddings_input in zip(
outputs_per_case_for_original_input, outputs_per_case_for_embeddings_input
):
check_logprobs_close(
outputs_0_lst=outputs_for_original_input,
outputs_1_lst=outputs_for_embeddings_input,
@@ -325,17 +346,26 @@ def run_embedding_input_test(
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
size_factors, dtype, max_tokens,
num_logprobs, monkeypatch) -> None:
def test_qwen2_vl_image_embeddings_input(
vllm_runner,
image_assets,
model,
size_factors,
dtype,
max_tokens,
num_logprobs,
monkeypatch,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case: list[tuple[
list[str], PromptImageInput, PromptVideoInput]] = [(
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
(
[prompt for _ in size_factors],
[rescale_image_size(image, factor) for factor in size_factors],
[],
) for image, prompt in zip(images, IMAGE_PROMPTS)]
)
for image, prompt in zip(images, IMAGE_PROMPTS)
]
run_embedding_input_test(
vllm_runner,
@@ -366,21 +396,27 @@ def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
model, size_factors,
dtype: str, max_tokens: int,
num_logprobs: int) -> None:
def test_qwen2_vl_multiple_image_embeddings_input(
vllm_runner,
image_assets,
model,
size_factors,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
images = [asset.pil_image for asset in image_assets]
inputs_per_case: list[tuple[list[str], PromptImageInput,
PromptVideoInput]] = [(
[MULTIIMAGE_PROMPT for _ in size_factors],
[[
rescale_image_size(image, factor)
for image in images
] for factor in size_factors],
[],
)]
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
(
[MULTIIMAGE_PROMPT for _ in size_factors],
[
[rescale_image_size(image, factor) for image in images]
for factor in size_factors
],
[],
)
]
run_embedding_input_test(
vllm_runner,
@@ -410,22 +446,29 @@ def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
@pytest.mark.parametrize("dtype", [target_dtype])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [10])
def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model,
size_factors, dtype: str,
max_tokens: int,
num_logprobs: int) -> None:
def test_qwen2_vl_video_embeddings_input(
vllm_runner,
video_assets,
model,
size_factors,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
num_frames = 4
sampled_vids = [
sample_frames_from_video(asset.np_ndarrays, num_frames)
for asset in video_assets
]
inputs_per_case: list[tuple[
list[str], PromptImageInput, PromptVideoInput]] = [(
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
(
[prompt for _ in size_factors],
[],
[rescale_video_size(video, factor) for factor in size_factors],
) for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)]
)
for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)
]
run_embedding_input_test(
vllm_runner,

View File

@@ -15,12 +15,12 @@ from ...registry import HF_EXAMPLE_MODELS
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
AUDIO_PROMPTS = AUDIO_ASSETS.prompts({
"mary_had_lamb":
"Transcribe this into English.",
"winning_call":
"What is happening in this audio clip?",
})
AUDIO_PROMPTS = AUDIO_ASSETS.prompts(
{
"mary_had_lamb": "Transcribe this into English.",
"winning_call": "What is happening in this audio clip?",
}
)
MULTI_AUDIO_PROMPT = "Describe each of the audios above."
@@ -33,7 +33,7 @@ CHUNKED_PREFILL_KWARGS = {
"enable_chunked_prefill": True,
"max_num_seqs": 2,
# Use a very small limit to exercise chunked prefill.
"max_num_batched_tokens": 16
"max_num_batched_tokens": 16,
}
@@ -43,27 +43,33 @@ def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
for key, value in params_kwargs.items():
if isinstance(value, bool):
if value:
args.append(f"--{key.replace('_','-')}")
args.append(f"--{key.replace('_', '-')}")
else:
args.append(f"--{key.replace('_','-')}={value}")
args.append(f"--{key.replace('_', '-')}={value}")
return args
@pytest.fixture(params=[
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
])
@pytest.fixture(
params=[
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
]
)
def server(request, audio_assets: AudioTestAssets):
args = [
"--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager",
"--dtype",
"bfloat16",
"--max-model-len",
"4096",
"--enforce-eager",
"--limit-mm-per-prompt",
json.dumps({"audio": len(audio_assets)}), "--trust-remote-code"
json.dumps({"audio": len(audio_assets)}),
"--trust-remote-code",
] + params_kwargs_to_cli_args(request.param)
with RemoteOpenAIServer(MODEL_NAME,
args,
env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
"30"}) as remote_server:
with RemoteOpenAIServer(
MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
) as remote_server:
yield remote_server
@@ -77,12 +83,11 @@ def _get_prompt(audio_count, question, placeholder):
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
placeholder = f"{placeholder}\n" * audio_count
return tokenizer.apply_chat_template([{
'role': 'user',
'content': f"{placeholder}{question}"
}],
tokenize=False,
add_generation_prompt=True)
return tokenizer.apply_chat_template(
[{"role": "user", "content": f"{placeholder}{question}"}],
tokenize=False,
add_generation_prompt=True,
)
def run_multi_audio_test(
@@ -99,19 +104,21 @@ def run_multi_audio_test(
model_info.check_available_online(on_fail="skip")
model_info.check_transformers_version(on_fail="skip")
with vllm_runner(model,
dtype=dtype,
enforce_eager=True,
limit_mm_per_prompt={
"audio":
max((len(audio) for _, audio in prompts_and_audios))
},
**kwargs) as vllm_model:
with vllm_runner(
model,
dtype=dtype,
enforce_eager=True,
limit_mm_per_prompt={
"audio": max((len(audio) for _, audio in prompts_and_audios))
},
**kwargs,
) as vllm_model:
vllm_outputs = vllm_model.generate_greedy_logprobs(
[prompt for prompt, _ in prompts_and_audios],
max_tokens,
num_logprobs=num_logprobs,
audios=[audios for _, audios in prompts_and_audios])
audios=[audios for _, audios in prompts_and_audios],
)
# The HuggingFace model doesn't support multiple audios yet, so
# just assert that some tokens were generated.
@@ -122,21 +129,25 @@ def run_multi_audio_test(
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
@pytest.mark.parametrize("vllm_kwargs", [
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
])
def test_models_with_multiple_audios(vllm_runner,
audio_assets: AudioTestAssets, dtype: str,
max_tokens: int, num_logprobs: int,
vllm_kwargs: dict) -> None:
vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT,
VLLM_PLACEHOLDER)
@pytest.mark.parametrize(
"vllm_kwargs",
[
pytest.param({}, marks=pytest.mark.cpu_model),
pytest.param(CHUNKED_PREFILL_KWARGS),
],
)
def test_models_with_multiple_audios(
vllm_runner,
audio_assets: AudioTestAssets,
dtype: str,
max_tokens: int,
num_logprobs: int,
vllm_kwargs: dict,
) -> None:
vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT, VLLM_PLACEHOLDER)
run_multi_audio_test(
vllm_runner,
[(vllm_prompt, [audio.audio_and_sample_rate
for audio in audio_assets])],
[(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
MODEL_NAME,
dtype=dtype,
max_tokens=max_tokens,
@@ -149,28 +160,25 @@ def test_models_with_multiple_audios(vllm_runner,
async def test_online_serving(client, audio_assets: AudioTestAssets):
"""Exercises online serving with/without chunked prefill enabled."""
messages = [{
"role":
"user",
"content": [
*[{
"type": "audio_url",
"audio_url": {
"url": audio.url
}
} for audio in audio_assets],
{
"type":
"text",
"text":
f"What's happening in these {len(audio_assets)} audio clips?"
},
],
}]
messages = [
{
"role": "user",
"content": [
*[
{"type": "audio_url", "audio_url": {"url": audio.url}}
for audio in audio_assets
],
{
"type": "text",
"text": f"What's happening in these {len(audio_assets)} audio clips?",
},
],
}
]
chat_completion = await client.chat.completions.create(model=MODEL_NAME,
messages=messages,
max_tokens=10)
chat_completion = await client.chat.completions.create(
model=MODEL_NAME, messages=messages, max_tokens=10
)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]

View File

@@ -6,8 +6,12 @@ import json
import pytest
import pytest_asyncio
from mistral_common.audio import Audio
from mistral_common.protocol.instruct.messages import (AudioChunk, RawAudio,
TextChunk, UserMessage)
from mistral_common.protocol.instruct.messages import (
AudioChunk,
RawAudio,
TextChunk,
UserMessage,
)
from vllm.transformers_utils.tokenizer import MistralTokenizer
@@ -17,8 +21,12 @@ from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test
MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"
MISTRAL_FORMAT_ARGS = [
"--tokenizer_mode", "mistral", "--config_format", "mistral",
"--load_format", "mistral"
"--tokenizer_mode",
"mistral",
"--config_format",
"mistral",
"--load_format",
"mistral",
]
@@ -30,10 +38,9 @@ def server(request, audio_assets: AudioTestAssets):
json.dumps({"audio": len(audio_assets)}),
] + MISTRAL_FORMAT_ARGS
with RemoteOpenAIServer(MODEL_NAME,
args,
env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
"30"}) as remote_server:
with RemoteOpenAIServer(
MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
) as remote_server:
yield remote_server
@@ -64,15 +71,17 @@ def _get_prompt(audio_assets, question):
@pytest.mark.parametrize("dtype", ["half"])
@pytest.mark.parametrize("max_tokens", [128])
@pytest.mark.parametrize("num_logprobs", [5])
def test_models_with_multiple_audios(vllm_runner,
audio_assets: AudioTestAssets, dtype: str,
max_tokens: int,
num_logprobs: int) -> None:
def test_models_with_multiple_audios(
vllm_runner,
audio_assets: AudioTestAssets,
dtype: str,
max_tokens: int,
num_logprobs: int,
) -> None:
vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
run_multi_audio_test(
vllm_runner,
[(vllm_prompt, [audio.audio_and_sample_rate
for audio in audio_assets])],
[(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
MODEL_NAME,
dtype=dtype,
max_tokens=max_tokens,
@@ -92,23 +101,22 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
return audio_dict
audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
messages = [{
"role":
"user",
"content": [
*audio_chunks,
{
"type":
"text",
"text":
f"What's happening in these {len(audio_assets)} audio clips?"
},
],
}]
messages = [
{
"role": "user",
"content": [
*audio_chunks,
{
"type": "text",
"text": f"What's happening in these {len(audio_assets)} audio clips?",
},
],
}
]
chat_completion = await client.chat.completions.create(model=MODEL_NAME,
messages=messages,
max_tokens=10)
chat_completion = await client.chat.completions.create(
model=MODEL_NAME, messages=messages, max_tokens=10
)
assert len(chat_completion.choices) == 1
choice = chat_completion.choices[0]

View File

@@ -12,8 +12,7 @@ from ....utils import create_new_process_for_each_test, multi_gpu_test
PROMPTS = [
{
"prompt":
"<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
"prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
"multi_modal_data": {
"audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
},
@@ -25,9 +24,8 @@ PROMPTS = [
"audio": AudioAsset("winning_call").audio_and_sample_rate,
},
},
"decoder_prompt":
"<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
}
"decoder_prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
},
]
EXPECTED = {
@@ -41,7 +39,7 @@ EXPECTED = {
" is June and the third base. They're going to wave him in. The throw"
" to the plate will be late. The Mariners are going to play for the"
" American League Championship. I don't believe it. It just continues"
" by all five."
" by all five.",
],
"openai/whisper-small": [
" The first words I spoke in the original pornograph. A little piece"
@@ -51,7 +49,7 @@ EXPECTED = {
" comes joy. Here is Junior to third base. They're gonna wave him"
" in. The throw to the plate will be late. The Mariners are going to"
" play for the American League Championship. I don't believe it. It"
" just continues. My, oh my."
" just continues. My, oh my.",
],
"openai/whisper-medium": [
" The first words I spoke in the original phonograph, a little piece"
@@ -62,7 +60,7 @@ EXPECTED = {
" Jorgen at third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh"
" my."
" my.",
],
"openai/whisper-large-v3": [
" The first words I spoke in the original phonograph, a little piece"
@@ -73,7 +71,7 @@ EXPECTED = {
" Junior to third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh,"
" my."
" my.",
],
"openai/whisper-large-v3-turbo": [
" The first words I spoke in the original phonograph, a little piece"
@@ -84,8 +82,8 @@ EXPECTED = {
" Junior to third base. They're going to wave him in. The throw to the"
" plate will be late. The Mariners are going to play for the American"
" League Championship. I don't believe it. It just continues. My, oh,"
" my."
]
" my.",
],
}
@@ -100,11 +98,11 @@ def run_test(
expected_list = EXPECTED[model] * 10
with vllm_runner(
model,
dtype="half",
max_model_len=448,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
model,
dtype="half",
max_model_len=448,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
) as vllm_model:
llm = vllm_model.llm

View File

@@ -1,7 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Helpers for building inputs that can be leveraged for different test types.
"""
"""Helpers for building inputs that can be leveraged for different test types."""
from collections.abc import Iterable
from pathlib import PosixPath
from typing import Callable, Optional, Union
@@ -10,20 +10,30 @@ import torch
from vllm.multimodal.audio import AudioResampler
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (rescale_video_size, resize_video,
sample_frames_from_video)
from vllm.multimodal.video import (
rescale_video_size,
resize_video,
sample_frames_from_video,
)
from .....conftest import AudioTestAssets, ImageTestAssets, VideoTestAssets
from .types import (SINGLE_AUDIO_BASE_PROMPT, SINGLE_IMAGE_BASE_PROMPTS,
TEST_AUDIO_PLACEHOLDER, TEST_IMG_PLACEHOLDER,
TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT,
ImageSizeWrapper, PromptWithMultiModalInput, SizeType,
VLMTestInfo)
from .types import (
SINGLE_AUDIO_BASE_PROMPT,
SINGLE_IMAGE_BASE_PROMPTS,
TEST_AUDIO_PLACEHOLDER,
TEST_IMG_PLACEHOLDER,
TEST_VIDEO_PLACEHOLDER,
VIDEO_BASE_PROMPT,
ImageSizeWrapper,
PromptWithMultiModalInput,
SizeType,
VLMTestInfo,
)
def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int],
str],
test_placeholder: str) -> str:
def replace_test_placeholder(
prompt: str, mm_idx_to_prompt: Callable[[int], str], test_placeholder: str
) -> str:
"""Given a prompt, replaces each test placeholder with the
model-specific tag.
"""
@@ -35,11 +45,13 @@ def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int],
return img_prompt
def get_model_prompts(base_prompts: Iterable[str],
img_idx_to_prompt: Optional[Callable[[int], str]],
video_idx_to_prompt: Optional[Callable[[int], str]],
audio_idx_to_prompt: Optional[Callable[[int], str]],
prompt_formatter: Callable[[str], str]) -> list[str]:
def get_model_prompts(
base_prompts: Iterable[str],
img_idx_to_prompt: Optional[Callable[[int], str]],
video_idx_to_prompt: Optional[Callable[[int], str]],
audio_idx_to_prompt: Optional[Callable[[int], str]],
prompt_formatter: Callable[[str], str],
) -> list[str]:
"""Given a model-agnostic base prompt and test configuration for a model(s)
to be tested, update the media placeholders and apply the prompt formatting
to get the test prompt string for this model.
@@ -56,19 +68,19 @@ def get_model_prompts(base_prompts: Iterable[str],
# Replace the multimodal placeholders in the base prompt with
# the correct ones for the model that we are testing
if img_idx_to_prompt:
base_prompt = replace_test_placeholder(base_prompt,
img_idx_to_prompt,
TEST_IMG_PLACEHOLDER)
base_prompt = replace_test_placeholder(
base_prompt, img_idx_to_prompt, TEST_IMG_PLACEHOLDER
)
if video_idx_to_prompt:
base_prompt = replace_test_placeholder(base_prompt,
video_idx_to_prompt,
TEST_VIDEO_PLACEHOLDER)
base_prompt = replace_test_placeholder(
base_prompt, video_idx_to_prompt, TEST_VIDEO_PLACEHOLDER
)
if audio_idx_to_prompt:
base_prompt = replace_test_placeholder(base_prompt,
audio_idx_to_prompt,
TEST_AUDIO_PLACEHOLDER)
base_prompt = replace_test_placeholder(
base_prompt, audio_idx_to_prompt, TEST_AUDIO_PLACEHOLDER
)
# Apply the prompt formatter to wrap the base prompt with
# the correct media placeholders to get the model test prompt
@@ -84,14 +96,15 @@ def build_single_image_inputs_from_test_info(
tmp_path: Optional[PosixPath] = None,
) -> list[PromptWithMultiModalInput]:
if test_info.prompt_formatter is None:
raise ValueError(
"Prompt formatter must be set to build single image inputs")
raise ValueError("Prompt formatter must be set to build single image inputs")
model_prompts = get_model_prompts(test_info.single_image_prompts,
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter)
model_prompts = get_model_prompts(
test_info.single_image_prompts,
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
# For models that require a local path / URL encoded in the image; export
# assets and encode into tmp_path for this test. This should be avoided
@@ -110,8 +123,8 @@ def build_single_image_inputs_from_test_info(
def build_single_image_inputs(
images, model_prompts,
size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
images, model_prompts, size_wrapper: ImageSizeWrapper
) -> list[PromptWithMultiModalInput]:
# For every image / prompt pair, get a pair containing two lists of
# length size_factors, where the first contains duplicates of the model
# prompt [str], and the second contains copies of the image after being
@@ -125,7 +138,8 @@ def build_single_image_inputs(
apply_image_size_scaling(image, size, size_wrapper.type)
for size in size_wrapper.data
],
) for image, prompt in zip(images, model_prompts)
)
for image, prompt in zip(images, model_prompts)
]
@@ -136,14 +150,15 @@ def build_multi_image_inputs_from_test_info(
tmp_path: Optional[PosixPath] = None,
) -> list[PromptWithMultiModalInput]:
if test_info.prompt_formatter is None:
raise ValueError(
"Prompt formatter must be set to build multi image inputs")
raise ValueError("Prompt formatter must be set to build multi image inputs")
model_prompts = get_model_prompts([test_info.multi_image_prompt],
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter)
model_prompts = get_model_prompts(
[test_info.multi_image_prompt],
test_info.img_idx_to_prompt,
test_info.video_idx_to_prompt,
test_info.audio_idx_to_prompt,
test_info.prompt_formatter,
)
if test_info.prompt_path_encoder is not None:
if tmp_path is None:
@@ -164,16 +179,20 @@ def build_multi_image_inputs_from_test_info(
def build_multi_image_inputs(
image_lists, model_prompts,
size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
image_lists, model_prompts, size_wrapper: ImageSizeWrapper
) -> list[PromptWithMultiModalInput]:
return [
PromptWithMultiModalInput(
prompts=[prompt for _ in size_wrapper.data],
image_data=[[
apply_image_size_scaling(image, size, size_wrapper.type)
for image in images
] for size in size_wrapper.data],
) for images, prompt in zip(image_lists, model_prompts)
image_data=[
[
apply_image_size_scaling(image, size, size_wrapper.type)
for image in images
]
for size in size_wrapper.data
],
)
for images, prompt in zip(image_lists, model_prompts)
]
@@ -185,10 +204,10 @@ def build_embedding_inputs_from_test_info(
# These conditions will always be true if invoked through filtering,
# but we still check them in case this is ever called directly
if test_info.prompt_formatter is None:
raise ValueError(
"Prompt formatter must be set to build image embedding inputs")
if size_wrapper.type != SizeType.SIZE_FACTOR or not \
all(factor == 1.0 for factor in size_wrapper.data):
raise ValueError("Prompt formatter must be set to build image embedding inputs")
if size_wrapper.type != SizeType.SIZE_FACTOR or not all(
factor == 1.0 for factor in size_wrapper.data
):
raise ValueError("Embedding tests require constant (1.0) size factors")
if test_info.convert_assets_to_embeddings is None:
raise ValueError("No conversion func for getting embeddings found")
@@ -209,8 +228,7 @@ def build_embedding_inputs_from_test_info(
assert len(images) == len(model_prompts)
inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
vllm_embeddings = build_single_image_inputs(embeds, model_prompts,
size_wrapper)
vllm_embeddings = build_single_image_inputs(embeds, model_prompts, size_wrapper)
return inputs, vllm_embeddings
@@ -235,21 +253,22 @@ def build_video_inputs_from_test_info(
for asset in video_assets
]
video_scaler = (resize_video if size_wrapper.type == SizeType.FIXED_SIZE
else rescale_video_size)
video_scaler = (
resize_video if size_wrapper.type == SizeType.FIXED_SIZE else rescale_video_size
)
return [
PromptWithMultiModalInput(
prompts=[prompt for _ in size_wrapper.data],
video_data=[
video_scaler(video, size) for size in size_wrapper.data
],
) for video, prompt in zip(sampled_vids, model_prompts)
video_data=[video_scaler(video, size) for size in size_wrapper.data],
)
for video, prompt in zip(sampled_vids, model_prompts)
]
def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
size_type: SizeType):
def apply_image_size_scaling(
image, size: Union[float, tuple[int, int]], size_type: SizeType
):
"""Applies a size scaler to one image; this can be an image size factor,
which scales the image while maintaining the aspect ratio"""
# Special case for embeddings; if it's a tensor, it's only valid if we
@@ -285,13 +304,16 @@ def build_audio_inputs_from_test_info(
method="librosa",
)
audios = [asset.audio_and_sample_rate for asset in audio_assets]
resampled_audios = [(
resampler.resample(
audio,
orig_sr=sr,
),
int(resampler.target_sr),
) for audio, sr in audios]
resampled_audios = [
(
resampler.resample(
audio,
orig_sr=sr,
),
int(resampler.target_sr),
)
for audio, sr in audios
]
return [
PromptWithMultiModalInput(

View File

@@ -4,19 +4,28 @@
modality, getting all combinations (similar to pytest's parametrization),
handling multimodal placeholder substitution, and so on.
"""
import itertools
from collections import OrderedDict
from collections.abc import Iterable
import pytest
from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
from .types import (
EMBEDDING_SIZE_FACTORS,
ExpandableVLMTestArgs,
ImageSizeWrapper,
SizeType,
VLMTestInfo,
VLMTestType,
)
def get_filtered_test_settings(
test_settings: dict[str, VLMTestInfo], test_type: VLMTestType,
new_proc_per_test: bool) -> dict[str, VLMTestInfo]:
test_settings: dict[str, VLMTestInfo],
test_type: VLMTestType,
new_proc_per_test: bool,
) -> dict[str, VLMTestInfo]:
"""Given the dict of potential test settings to run, return a subdict
of tests who have the current test type enabled with the matching val for
fork_per_test.
@@ -25,7 +34,8 @@ def get_filtered_test_settings(
def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType):
return test_info.test_type == test_type or (
isinstance(test_info.test_type, Iterable)
and test_type in test_info.test_type)
and test_type in test_info.test_type
)
matching_tests = {}
for test_name, test_info in test_settings.items():
@@ -36,62 +46,69 @@ def get_filtered_test_settings(
assert test_info.convert_assets_to_embeddings is not None
# Custom test inputs need to explicitly define the mm limit/inputs
if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS):
assert (test_info.custom_test_opts is not None
and isinstance(test_info.custom_test_opts, Iterable))
assert test_info.custom_test_opts is not None and isinstance(
test_info.custom_test_opts, Iterable
)
# For all types besides custom inputs, we need a prompt formatter
else:
assert test_info.prompt_formatter is not None
# Everything looks okay; keep if this is correct proc handling
if (test_info.distributed_executor_backend
is not None) == new_proc_per_test:
if (
test_info.distributed_executor_backend is not None
) == new_proc_per_test:
matching_tests[test_name] = test_info
return matching_tests
def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
test_type: VLMTestType,
create_new_process_for_each_test: bool):
def get_parametrized_options(
test_settings: dict[str, VLMTestInfo],
test_type: VLMTestType,
create_new_process_for_each_test: bool,
):
"""Converts all of our VLMTestInfo into an expanded list of parameters.
This is similar to nesting pytest parametrize calls, but done directly
through an itertools product so that each test can set things like
size factors etc, while still running in isolated test cases.
"""
matching_tests = get_filtered_test_settings(
test_settings, test_type, create_new_process_for_each_test)
test_settings, test_type, create_new_process_for_each_test
)
# Ensure that something is wrapped as an iterable it's not already
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,)
def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
# This is essentially the same as nesting a bunch of mark.parametrize
# decorators, but we do it programmatically to allow overrides for on
# a per-model basis, while still being able to execute each of these
# as individual test cases in pytest.
iter_kwargs = OrderedDict([
("model", ensure_wrapped(test_info.models)),
("max_tokens", ensure_wrapped(test_info.max_tokens)),
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
("dtype", ensure_wrapped(test_info.dtype)),
("distributed_executor_backend",
ensure_wrapped(test_info.distributed_executor_backend)),
])
iter_kwargs = OrderedDict(
[
("model", ensure_wrapped(test_info.models)),
("max_tokens", ensure_wrapped(test_info.max_tokens)),
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
("dtype", ensure_wrapped(test_info.dtype)),
(
"distributed_executor_backend",
ensure_wrapped(test_info.distributed_executor_backend),
),
]
)
# num_frames is video only
if test_type == VLMTestType.VIDEO:
iter_kwargs["num_video_frames"] = ensure_wrapped(
test_info.num_video_frames)
iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
# No sizes passed for custom inputs, since inputs are directly provided
if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO):
wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
if wrapped_sizes is None:
raise ValueError(
f"Sizes must be set for test type {test_type}")
raise ValueError(f"Sizes must be set for test type {test_type}")
iter_kwargs["size_wrapper"] = wrapped_sizes
#Otherwise expand the custom test options instead
# Otherwise expand the custom test options instead
elif test_type == VLMTestType.CUSTOM_INPUTS:
if test_info.custom_test_opts is None:
raise ValueError("Test has type CUSTOM_INPUTS, but none given")
@@ -121,8 +138,8 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
def get_wrapped_test_sizes(
test_info: VLMTestInfo,
test_type: VLMTestType) -> tuple[ImageSizeWrapper, ...]:
test_info: VLMTestInfo, test_type: VLMTestType
) -> tuple[ImageSizeWrapper, ...]:
"""Given a test info which may have size factors or fixed sizes, wrap them
and combine them into an iterable, each of which will be used in parameter
expansion.
@@ -133,18 +150,18 @@ def get_wrapped_test_sizes(
"""
# If it is an embedding test, we always use the EMBEDDING_SIZE_FACTORS
if test_type == VLMTestType.EMBEDDING:
return tuple([
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
for factor in EMBEDDING_SIZE_FACTORS
])
return tuple(
[
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
for factor in EMBEDDING_SIZE_FACTORS
]
)
# Audio and Custom inputs have preprocessed inputs
elif test_type in (VLMTestType.AUDIO, VLMTestType.CUSTOM_INPUTS):
return tuple()
size_factors = test_info.image_size_factors \
if test_info.image_size_factors else []
fixed_sizes = test_info.image_sizes \
if test_info.image_sizes else []
size_factors = test_info.image_size_factors if test_info.image_size_factors else []
fixed_sizes = test_info.image_sizes if test_info.image_sizes else []
wrapped_factors = [
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
@@ -152,8 +169,7 @@ def get_wrapped_test_sizes(
]
wrapped_sizes = [
ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size)
for size in fixed_sizes
ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size) for size in fixed_sizes
]
return tuple(wrapped_factors + wrapped_sizes)

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Core test implementation to be shared across modalities."""
from typing import Any, Callable, Optional
import torch
@@ -70,22 +71,23 @@ def run_test(
if model_info.hf_overrides:
vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
if model_info.skip_tokenizer_init:
vllm_runner_kwargs_[
"skip_tokenizer_init"] = model_info.skip_tokenizer_init
vllm_runner_kwargs_["skip_tokenizer_init"] = model_info.skip_tokenizer_init
if vllm_runner_kwargs:
vllm_runner_kwargs_.update(vllm_runner_kwargs)
with vllm_runner(model,
max_model_len=max_model_len,
max_num_seqs=max_num_seqs,
dtype=dtype,
limit_mm_per_prompt=limit_mm_per_prompt,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=enforce_eager,
runner=runner,
**vllm_runner_kwargs_) as vllm_model:
with vllm_runner(
model,
max_model_len=max_model_len,
max_num_seqs=max_num_seqs,
dtype=dtype,
limit_mm_per_prompt=limit_mm_per_prompt,
tensor_parallel_size=tensor_parallel_size,
distributed_executor_backend=distributed_executor_backend,
enforce_eager=enforce_eager,
runner=runner,
**vllm_runner_kwargs_,
) as vllm_model:
tokenizer = vllm_model.llm.get_tokenizer()
vllm_kwargs: dict[str, Any] = {}
@@ -95,21 +97,19 @@ def run_test(
vllm_kwargs["stop"] = stop_str
for prompts, image_data, video_data, audio_data in vllm_inputs:
mm_data = dict(images=image_data,
videos=video_data,
audios=audio_data)
mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
vllm_kwargs_with_mm_data = vllm_kwargs | mm_data
vllm_output = vllm_model.generate_greedy_logprobs(
prompts,
max_tokens,
num_logprobs=num_logprobs,
**vllm_kwargs_with_mm_data)
**vllm_kwargs_with_mm_data,
)
vllm_outputs_per_mm.append(vllm_output)
hf_model = hf_runner(model,
dtype=dtype,
auto_cls=auto_cls,
model_kwargs=hf_model_kwargs)
hf_model = hf_runner(
model, dtype=dtype, auto_cls=auto_cls, model_kwargs=hf_model_kwargs
)
# Some models need to patch things like the model processor, e.g., internvl
if patch_hf_runner is not None:
@@ -129,16 +129,15 @@ def run_test(
hf_kwargs["stop_strings"] = stop_str
for prompts, image_data, video_data, audio_data in inputs:
mm_data = dict(images=image_data,
videos=video_data,
audios=audio_data)
mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
hf_kwargs_with_mm_data = hf_kwargs | mm_data
hf_output = hf_model.generate_greedy_logprobs_limit(
prompts,
max_tokens,
num_logprobs=num_logprobs,
tokenizer=tokenizer,
**hf_kwargs_with_mm_data)
**hf_kwargs_with_mm_data,
)
hf_outputs_per_mm.append(hf_output)
# Apply output processing / sanitation to the vLLM and HF runner results
@@ -150,8 +149,7 @@ def run_test(
second_runner_processor=vllm_output_post_proc,
)
for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm,
vllm_outputs_per_mm):
for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm, vllm_outputs_per_mm):
# This is usually check_logprobs_close, but it's passed through to
# allow things like check_outputs_equal where needed
comparator(
@@ -171,15 +169,19 @@ def process_runner_outputs(
):
"""Applies the runner processor(s) to the runner outputs, if any."""
if first_runner_processor is not None:
first_runner_outputs = process_outputs(first_runner_processor, model,
first_runner_outputs)
first_runner_outputs = process_outputs(
first_runner_processor, model, first_runner_outputs
)
if second_runner_processor is not None:
second_runner_outputs = process_outputs(second_runner_processor, model,
second_runner_outputs)
second_runner_outputs = process_outputs(
second_runner_processor, model, second_runner_outputs
)
return first_runner_outputs, second_runner_outputs
def process_outputs(output_processor, model, outputs_per_image):
"""Applies a model specific post-processor function to a runner's output"""
return [[output_processor(res, model) for res in outputs]
for outputs in outputs_per_image]
return [
[output_processor(res, model) for res in outputs]
for outputs in outputs_per_image
]

View File

@@ -1,12 +1,16 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Custom input builders for edge-cases in different models."""
from typing import Callable
from vllm.assets.image import ImageAsset
from vllm.multimodal.image import rescale_image_size
from vllm.multimodal.video import (rescale_video_size, resize_video,
sample_frames_from_video)
from vllm.multimodal.video import (
rescale_video_size,
resize_video,
sample_frames_from_video,
)
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
from .builders import build_multi_image_inputs, build_single_image_inputs
@@ -15,7 +19,7 @@ from .types import ImageSizeWrapper, PromptWithMultiModalInput, SizeType
def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
"""Builds inputs for multi-image (varied sizes/aspect ratio) testing.
Args:
formatter: model-specific prompt formatter.
"""
@@ -41,7 +45,7 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
stop_sign,
rescale_image_size(stop_sign, 0.25),
cherry_blossom.resize((183, 488)),
cherry_blossom.resize((488, 183))
cherry_blossom.resize((488, 183)),
],
cherry_blossom,
]
@@ -54,10 +58,11 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
]
def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
num_frames: int = 16):
def multi_video_multi_aspect_ratio_inputs(
formatter: Callable[[str], str], num_frames: int = 16
):
"""Builds inputs for multi-video (varied sizes/aspect ratio) testing.
Args:
formatter: model-specific prompt formatter.
"""
@@ -81,7 +86,7 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
video,
rescale_video_size(video, 0.25),
resize_video(video, (183, 488)),
resize_video(video, (488, 183))
resize_video(video, (488, 183)),
],
video,
]
@@ -96,7 +101,9 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
def different_patch_input_cases_internvl():
images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
formatter = lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
formatter = (
lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n"
) # noqa: E501
single_img_prompts = [
"<image>\nWhat's the content in the center of the image?",
"<image>\nWhat is the season?",
@@ -115,14 +122,14 @@ def different_patch_input_cases_internvl():
def windows_attention_image_qwen2_5_vl():
# image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501
image = ImageAsset("hato").pil_image
question = "Describe the image."
img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
prompt = (f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n"
"<|im_start|>assistant\n")
prompt = (
f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n<|im_start|>assistant\n"
)
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
return build_single_image_inputs([image], [prompt], wrapped_sf)
@@ -136,8 +143,9 @@ def video_with_metadata_glm4_1v():
formatted_prompt = f"<|user|>\n{video_prompt}{question}<|assistant|>\n"
scales = [0.1, 0.2, 0.25]
video_input = [[(rescale_video_size(video_array, scale), metadata)]
for scale in scales]
video_input = [
[(rescale_video_size(video_array, scale), metadata)] for scale in scales
]
prompts = [formatted_prompt] * len(video_input)
return [

View File

@@ -4,6 +4,7 @@
for manipulating the input / output of HF & vLLM test runners, which are
typically specific to a small subset of models.
"""
import types
from pathlib import PosixPath
from typing import Optional, Union
@@ -15,8 +16,13 @@ import pytest
import regex as re
import torch
from PIL.Image import Image
from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
GenerationConfig, GenerationMixin)
from transformers import (
AutoConfig,
AutoTokenizer,
BatchFeature,
GenerationConfig,
GenerationMixin,
)
from transformers.video_utils import VideoMetadata
from vllm.logprobs import SampleLogprobs
@@ -27,8 +33,7 @@ from .types import RunnerOutput
####### vLLM output processors functions
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
"""Sanitize vllm output [blip2 models] to be comparable with hf output."""
_, output_str, out_logprobs = vllm_output
@@ -42,8 +47,7 @@ def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
return hf_output_ids, hf_output_str, out_logprobs
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
"""Sanitize vllm output [fuyu models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@@ -53,8 +57,8 @@ def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
def qwen_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
vllm_output: RunnerOutput, model: str
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
"""Sanitize vllm output [qwen models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@@ -64,8 +68,8 @@ def qwen_vllm_to_hf_output(
def qwen2_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
vllm_output: RunnerOutput, model: str
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
"""Sanitize vllm output [qwen2 models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@@ -75,8 +79,8 @@ def qwen2_vllm_to_hf_output(
def kimiv_vl_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
vllm_output: RunnerOutput, model: str
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
"""Sanitize vllm output [kimi_vl models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@@ -85,23 +89,25 @@ def kimiv_vl_vllm_to_hf_output(
return output_ids, hf_output_str, out_logprobs
def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def llava_image_vllm_to_hf_output(
vllm_output: RunnerOutput, model: str
) -> RunnerOutput:
config = AutoConfig.from_pretrained(model)
mm_token_id = config.image_token_index
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
def llava_video_vllm_to_hf_output(
vllm_output: RunnerOutput,
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
vllm_output: RunnerOutput, model: str
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
config = AutoConfig.from_pretrained(model)
mm_token_id = config.video_token_index
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
mm_token_id: int) -> RunnerOutput:
def _llava_vllm_to_hf_output(
vllm_output: RunnerOutput, model: str, mm_token_id: int
) -> RunnerOutput:
"""Sanitize vllm output [Llava models] to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@@ -109,7 +115,8 @@ def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
eos_token_id = tokenizer.eos_token_id
hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
token_id
for idx, token_id in enumerate(output_ids)
if token_id != mm_token_id or output_ids[idx - 1] != mm_token_id
]
@@ -128,8 +135,9 @@ def llava_onevision_hf_model_kwargs(model: str) -> dict:
return config.to_dict()
def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def llava_onevision_vllm_to_hf_output(
vllm_output: RunnerOutput, model: str
) -> RunnerOutput:
"""Sanitize vllm output [llava-onevision] to compare with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@@ -140,7 +148,8 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
eos_token_id = tokenizer.eos_token_id
hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
token_id
for idx, token_id in enumerate(output_ids)
if token_id != video_token_id or output_ids[idx - 1] != video_token_id
]
@@ -151,8 +160,7 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
return hf_output_ids, hf_output_str, out_logprobs
def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def mantis_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
"""Sanitize vllm output [mantis] to compare with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@@ -161,8 +169,7 @@ def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
return output_ids, hf_output_str, out_logprobs
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
"""Sanitize vllm output [phi3v] to be comparable with hf output."""
_, output_str, out_logprobs = vllm_output
@@ -180,8 +187,7 @@ def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
return hf_output_ids, hf_output_str, out_logprobs
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
model: str) -> RunnerOutput:
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
"""Sanitize vllm output to be comparable with hf output."""
output_ids, output_str, out_logprobs = vllm_output
@@ -192,7 +198,8 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
eos_token_id = tokenizer.eos_token_id
hf_output_ids = [
token_id for idx, token_id in enumerate(output_ids)
token_id
for idx, token_id in enumerate(output_ids)
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
]
@@ -205,46 +212,40 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
####### Post-processors for HF outputs
def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
if output_str.endswith("<end▁of▁sentence>"):
output_str = output_str.split("<end▁of▁sentence>")[0]
return output_ids, output_str, out_logprobs
def idefics3_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def idefics3_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
if output_str.endswith("<end_of_utterance>"):
output_str = output_str.split("<end_of_utterance>")[0]
return output_ids, output_str, out_logprobs
def smolvlm_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def smolvlm_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
# Based on Idefics3
return idefics3_trunc_hf_output(hf_output, model)
def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def minicpmv_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
if output_str.endswith("<|eot_id|>"):
output_str = output_str.split("<|eot_id|>")[0]
return output_ids, output_str, out_logprobs
def minimax_vl_01_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def minimax_vl_01_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
if output_str.endswith("<end_of_sentence>"):
output_str = output_str.split("<end_of_sentence>")[0]
return output_ids, output_str, out_logprobs
def ultravox_trunc_hf_output(hf_output: RunnerOutput,
model: str) -> RunnerOutput:
def ultravox_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
output_ids, output_str, out_logprobs = hf_output
tokenizer = AutoTokenizer.from_pretrained(model)
@@ -262,8 +263,8 @@ def get_llava_embeddings(image_assets: ImageTestAssets):
####### Prompt path encoders for models that need models on disk
def qwen_prompt_path_encoder(
tmp_path: PosixPath, prompt: str,
assets: Union[list[ImageAsset], ImageTestAssets]) -> str:
tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset], ImageTestAssets]
) -> str:
"""Given a temporary dir path, export one or more image assets into the
tempdir & replace its contents with the local path to the string so that
the HF version of Qwen-VL can resolve the path and load the image in its
@@ -313,8 +314,9 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
return BatchFeature(data=inputs, tensor_type="pt")
hf_model.processor = processor
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language.model.embed_tokens
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.language.model.embed_tokens
)
return hf_model
@@ -357,11 +359,10 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
assert len(contents) == len(images)
return hf_processor.apply_chat_template(
[{
"role": "user",
"image": image,
"content": content
} for image, content in zip(images, contents)],
[
{"role": "user", "image": image, "content": content}
for image, content in zip(images, contents)
],
add_generation_prompt=True,
tokenize=True,
return_dict=True,
@@ -369,8 +370,9 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
)
hf_model.processor = processor
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.transformer.output_layer
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.transformer.output_layer
)
return hf_model
@@ -387,10 +389,9 @@ def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
else:
video_metadata = None
return hf_processor(*args,
videos=videos,
video_metadata=video_metadata,
**kwargs)
return hf_processor(
*args, videos=videos, video_metadata=video_metadata, **kwargs
)
hf_model.processor = processor
return hf_model
@@ -406,8 +407,9 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.num_image_token = hf_runner.model.num_image_token
self.tokenizer = hf_runner.tokenizer
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
trust_remote_code=True)
self.config = AutoConfig.from_pretrained(
hf_runner.model_name, trust_remote_code=True
)
self.vision_config = self.config.vision_config
self.use_thumbnail = self.config.use_thumbnail
self.use_msac = self.config.use_msac
@@ -415,11 +417,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.max_num = self.config.max_dynamic_patch
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Union[Image, list[Image]],
**kwargs):
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
# yapf: disable
from vllm.model_executor.models.h2ovl import (
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl)
IMG_CONTEXT,
IMG_END,
IMG_START,
image_to_pixel_values_h2ovl,
)
# yapf: enable
images = [images] if isinstance(images, Image) else images
@@ -431,29 +436,26 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
max_num=self.max_num,
use_thumbnail=self.use_thumbnail,
use_msac=self.use_msac,
) for image in images
]
num_patches_list = [
pixel_value.shape[0] for pixel_value in pixel_values
)
for image in images
]
num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
pixel_values = torch.cat(pixel_values, dim=0)
for num_patches in num_patches_list:
context_tokens = IMG_CONTEXT * self.num_image_token \
* num_patches
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
image_tokens = IMG_START + context_tokens + IMG_END
text = text.replace('<image>', image_tokens, 1)
text = text.replace("<image>", image_tokens, 1)
prompt = self.tokenizer(text, return_tensors="pt")
prompt.update({"pixel_values": pixel_values})
return prompt
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
"<IMG_CONTEXT>")
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
hf_model.model.img_context_token_id = img_context_token_id
hf_model.processor = H2OVLProcessor(hf_model)
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language_model.get_output_embeddings()
hf_model.model.generate = types.MethodType(_internvl_generate,
hf_model.model)
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.language_model.get_output_embeddings()
)
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
return hf_model
@@ -467,19 +469,23 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.num_image_token = hf_runner.model.num_image_token
self.tokenizer = hf_runner.tokenizer
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
trust_remote_code=True)
self.config = AutoConfig.from_pretrained(
hf_runner.model_name, trust_remote_code=True
)
self.vision_config = self.config.vision_config
self.use_thumbnail = self.config.use_thumbnail
self.min_num = self.config.min_dynamic_patch
self.max_num = self.config.max_dynamic_patch
self.image_size = self.vision_config.image_size
def __call__(self, text: str, images: Union[Image, list[Image]],
**kwargs):
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
from vllm.model_executor.models.skyworkr1v import (
IMG_CONTEXT, IMG_END, IMG_START,
image_to_pixel_values_skyworkr1v)
IMG_CONTEXT,
IMG_END,
IMG_START,
image_to_pixel_values_skyworkr1v,
)
images = [images] if isinstance(images, Image) else images
pixel_values = [
image_to_pixel_values_skyworkr1v(
@@ -488,29 +494,26 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
min_num=self.min_num,
max_num=self.max_num,
use_thumbnail=self.use_thumbnail,
) for image in images
]
num_patches_list = [
pixel_value.shape[0] for pixel_value in pixel_values
)
for image in images
]
num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
pixel_values = torch.cat(pixel_values, dim=0)
for num_patches in num_patches_list:
context_tokens = IMG_CONTEXT * self.num_image_token \
* num_patches
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
image_tokens = IMG_START + context_tokens + IMG_END
text = text.replace('<image>', image_tokens, 1)
text = text.replace("<image>", image_tokens, 1)
prompt = self.tokenizer(text, return_tensors="pt")
prompt.update({"pixel_values": pixel_values})
return prompt
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
"<IMG_CONTEXT>")
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
hf_model.model.img_context_token_id = img_context_token_id
hf_model.processor = SkyworkR1VProcessor(hf_model)
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language_model.get_output_embeddings()
hf_model.model.generate = types.MethodType(_internvl_generate,
hf_model.model)
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.language_model.get_output_embeddings()
)
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
return hf_model
@@ -524,8 +527,9 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
self.num_image_token = hf_runner.model.num_image_token
self.tokenizer = hf_runner.tokenizer
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
trust_remote_code=True)
self.config = AutoConfig.from_pretrained(
hf_runner.model_name, trust_remote_code=True
)
self.vision_config = self.config.vision_config
self.use_thumbnail = self.config.use_thumbnail
self.min_num = self.config.min_dynamic_patch
@@ -540,8 +544,13 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
**kwargs,
):
from vllm.model_executor.models.internvl import (
IMG_CONTEXT, IMG_END, IMG_START,
image_to_pixel_values_internvl, video_to_pixel_values_internvl)
IMG_CONTEXT,
IMG_END,
IMG_START,
image_to_pixel_values_internvl,
video_to_pixel_values_internvl,
)
images = [images] if isinstance(images, Image) else images
videos = [videos] if isinstance(videos, np.ndarray) else videos
if images is not None:
@@ -552,7 +561,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
min_num=self.min_num,
max_num=self.max_num,
use_thumbnail=self.use_thumbnail,
) for image in images
)
for image in images
]
num_patches_images = [
pixel_value.shape[0] for pixel_value in pixel_values_images
@@ -568,7 +578,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
min_num=1,
max_num=1,
use_thumbnail=False,
) for video in videos
)
for video in videos
]
num_patches_videos = [
pixel_value.shape[0] for pixel_value in pixel_values_videos
@@ -580,38 +591,37 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
while ("<image>" in text) or ("<video>" in text):
image_index = text.find("<image>")
video_index = text.find("<video>")
if image_index == -1 or (video_index > -1
and video_index < image_index):
if image_index == -1 or (
video_index > -1 and video_index < image_index
):
num_patches = num_patches_videos.pop(0)
pixel_values.append(pixel_values_videos.pop(0))
context_tokens = IMG_START + \
IMG_CONTEXT * self.num_image_token + IMG_END
video_tokens = ''.join([
f'Frame{i+1}: {context_tokens}'
for i in range(num_patches)
])
text = text.replace('<video>', video_tokens, 1)
context_tokens = (
IMG_START + IMG_CONTEXT * self.num_image_token + IMG_END
)
video_tokens = "".join(
[f"Frame{i + 1}: {context_tokens}" for i in range(num_patches)]
)
text = text.replace("<video>", video_tokens, 1)
else:
num_patches = num_patches_images.pop(0)
pixel_values.append(pixel_values_images.pop(0))
context_tokens = IMG_CONTEXT * self.num_image_token \
* num_patches
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
image_tokens = IMG_START + context_tokens + IMG_END
text = text.replace('<image>', image_tokens, 1)
text = text.replace("<image>", image_tokens, 1)
pixel_values = torch.cat(pixel_values, dim=0)
prompt = self.tokenizer(text, return_tensors="pt")
prompt.update({"pixel_values": pixel_values})
return prompt
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
"<IMG_CONTEXT>")
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
hf_model.model.img_context_token_id = img_context_token_id
hf_model.processor = InternVLProcessor(hf_model)
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.language_model.get_output_embeddings()
hf_model.model.generate = types.MethodType(_internvl_generate,
hf_model.model)
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.language_model.get_output_embeddings()
)
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
return hf_model
@@ -631,7 +641,7 @@ def _internvl_generate(
input_embeds = input_embeds.reshape(B * N, C)
input_ids = input_ids.reshape(B * N)
selected = (input_ids == self.img_context_token_id)
selected = input_ids == self.img_context_token_id
assert selected.sum() != 0
input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
@@ -778,8 +788,9 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for Ovis2."""
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.llm.get_output_embeddings()
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.llm.get_output_embeddings()
)
def processor(*args, text="", images=None, **kwargs):
text_tokenizer = hf_model.model.get_text_tokenizer()
@@ -787,8 +798,7 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
prompt_start_and_end = {
"qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
"llama":
("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
"llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
"gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
}
for start, end in prompt_start_and_end.values():
@@ -797,7 +807,8 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
break
prompt, input_ids, pixel_values = hf_model.model.preprocess_inputs(
text_or_conversations=text, images=images)
text_or_conversations=text, images=images
)
attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)
inputs = {
@@ -813,8 +824,9 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
"""Patches and returns an instance of the HfRunner to use for Ovis2."""
hf_model.model.get_output_embeddings = lambda: \
hf_model.model.llm.get_output_embeddings()
hf_model.model.get_output_embeddings = (
lambda: hf_model.model.llm.get_output_embeddings()
)
def processor(*args, text="", images=None, videos=None, **kwargs):
if images is None:
@@ -825,13 +837,11 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
videos = []
else:
videos = [videos] if isinstance(videos, np.ndarray) else videos
videos = [[PIL.Image.fromarray(frame) for frame in vid]
for vid in videos]
videos = [[PIL.Image.fromarray(frame) for frame in vid] for vid in videos]
prompt_start_and_end = {
"qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
"llama":
("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
"llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
"gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
}
for start, end in prompt_start_and_end.values():
@@ -842,21 +852,20 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
images_message = [{"type": "image", "image": img} for img in images]
videos_message = [{"type": "video", "video": vid} for vid in videos]
messages = [{
"role":
"user",
"content": [
*images_message,
*videos_message,
{
"type": "text",
"text": text
},
],
}]
messages = [
{
"role": "user",
"content": [
*images_message,
*videos_message,
{"type": "text", "text": text},
],
}
]
input_ids, pixel_values, grid_thws = hf_model.model.preprocess_inputs(
messages=messages, enable_thinking=True)
messages=messages, enable_thinking=True
)
inputs = {
"inputs": input_ids,
"pixel_values": pixel_values,

View File

@@ -3,23 +3,34 @@
"""Entrypoints for wrapping the core run_test implementation for specific test
types / modalities.
"""
from pathlib import PosixPath
from .....conftest import (AudioTestAssets, HfRunner, ImageTestAssets,
VideoTestAssets, VllmRunner)
from .....conftest import (
AudioTestAssets,
HfRunner,
ImageTestAssets,
VideoTestAssets,
VllmRunner,
)
from . import builders, core
from .types import ExpandableVLMTestArgs, VLMTestInfo
####### Entrypoints for running different test types
def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets):
def run_single_image_test(
*,
tmp_path: PosixPath,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
assert test_case.size_wrapper is not None
inputs = builders.build_single_image_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper, tmp_path)
model_test_info, image_assets, test_case.size_wrapper, tmp_path
)
core.run_test(
hf_runner=hf_runner,
@@ -31,17 +42,23 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": 1},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets):
def run_multi_image_test(
*,
tmp_path: PosixPath,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
assert test_case.size_wrapper is not None
inputs = builders.build_multi_image_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper, tmp_path)
model_test_info, image_assets, test_case.size_wrapper, tmp_path
)
core.run_test(
hf_runner=hf_runner,
@@ -53,17 +70,22 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"image": len(image_assets)},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_embedding_test(*, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets):
def run_embedding_test(
*,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
image_assets: ImageTestAssets,
):
assert test_case.size_wrapper is not None
inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
model_test_info, image_assets, test_case.size_wrapper)
model_test_info, image_assets, test_case.size_wrapper
)
core.run_test(
hf_runner=hf_runner,
@@ -76,7 +98,8 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
limit_mm_per_prompt={"image": 1},
vllm_embeddings=vllm_embeddings,
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_video_test(
@@ -90,8 +113,11 @@ def run_video_test(
assert test_case.size_wrapper is not None
assert test_case.num_video_frames is not None
inputs = builders.build_video_inputs_from_test_info(
model_test_info, video_assets, test_case.size_wrapper,
test_case.num_video_frames)
model_test_info,
video_assets,
test_case.size_wrapper,
test_case.num_video_frames,
)
core.run_test(
hf_runner=hf_runner,
@@ -103,7 +129,8 @@ def run_video_test(
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"video": len(video_assets)},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_audio_test(
@@ -114,8 +141,7 @@ def run_audio_test(
vllm_runner: type[VllmRunner],
audio_assets: AudioTestAssets,
):
inputs = builders.build_audio_inputs_from_test_info(
model_test_info, audio_assets)
inputs = builders.build_audio_inputs_from_test_info(model_test_info, audio_assets)
core.run_test(
hf_runner=hf_runner,
@@ -127,13 +153,17 @@ def run_audio_test(
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt={"audio": 1},
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)
def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner]):
def run_custom_inputs_test(
*,
model_test_info: VLMTestInfo,
test_case: ExpandableVLMTestArgs,
hf_runner: type[HfRunner],
vllm_runner: type[VllmRunner],
):
# Custom test cases can provide inputs directly, but they need to
# explicitly provided a CustomTestConfig, which wraps the inputs and
# the limit_mm_per_prompt
@@ -155,4 +185,5 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
num_logprobs=test_case.num_logprobs,
limit_mm_per_prompt=limit_mm_per_prompt,
distributed_executor_backend=test_case.distributed_executor_backend,
**model_test_info.get_non_parametrized_runner_kwargs())
**model_test_info.get_non_parametrized_runner_kwargs(),
)

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Types for writing multimodal model tests."""
from collections.abc import Iterable
from enum import Enum
from pathlib import PosixPath
@@ -15,9 +16,16 @@ from vllm.config import RunnerOption
from vllm.logprobs import SampleLogprobs
from vllm.transformers_utils.tokenizer import AnyTokenizer
from .....conftest import (AUDIO_ASSETS, IMAGE_ASSETS, HfRunner, ImageAsset,
ImageTestAssets, PromptAudioInput, PromptImageInput,
PromptVideoInput)
from .....conftest import (
AUDIO_ASSETS,
IMAGE_ASSETS,
HfRunner,
ImageAsset,
ImageTestAssets,
PromptAudioInput,
PromptImageInput,
PromptVideoInput,
)
from ....utils import check_logprobs_close
# meta image tag; will be replaced by the appropriate tag for the model
@@ -47,6 +55,7 @@ RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
class PromptWithMultiModalInput(NamedTuple):
"""Holds the multimodal input for a single test case."""
prompts: list[str]
image_data: Optional[PromptImageInput] = None
video_data: Optional[PromptVideoInput] = None
@@ -100,8 +109,9 @@ class VLMTestInfo(NamedTuple):
# Function for converting ImageAssets to image embeddings;
# We need to define this explicitly for embedding tests
convert_assets_to_embeddings: Optional[Callable[[ImageTestAssets],
list[torch.Tensor]]] = None
convert_assets_to_embeddings: Optional[
Callable[[ImageTestAssets], list[torch.Tensor]]
] = None
# Exposed options for vLLM runner; we change these in a several tests,
# but the defaults are derived from VllmRunner & the engine defaults
@@ -156,8 +166,8 @@ class VLMTestInfo(NamedTuple):
# for Qwen-VL, which requires encoding the image path / url into the prompt
# for HF runner
prompt_path_encoder: Optional[
Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]],
str]] = None # noqa: E501
Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]], str]
] = None # noqa: E501
# Allows configuring a test to run with custom inputs
custom_test_opts: Optional[list[CustomTestOptions]] = None
@@ -190,6 +200,7 @@ class VLMTestInfo(NamedTuple):
class ExpandableVLMTestArgs(NamedTuple):
"""The expanded kwargs which correspond to a single test case."""
model: str
max_tokens: int
num_logprobs: int