Convert formatting to use ruff instead of yapf + isort (#26247)
Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
This commit is contained in:
@@ -3,27 +3,40 @@
|
||||
"""Common tests for testing .generate() functionality for single / multiple
|
||||
image, embedding, and video support for different VLMs in vLLM.
|
||||
"""
|
||||
|
||||
import math
|
||||
import os
|
||||
from collections import defaultdict
|
||||
from pathlib import PosixPath
|
||||
|
||||
import pytest
|
||||
from transformers import (AutoModel, AutoModelForImageTextToText,
|
||||
AutoModelForTextToWaveform)
|
||||
from transformers import (
|
||||
AutoModel,
|
||||
AutoModelForImageTextToText,
|
||||
AutoModelForTextToWaveform,
|
||||
)
|
||||
|
||||
from vllm.platforms import current_platform
|
||||
from vllm.utils import identity
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, AudioTestAssets, HfRunner,
|
||||
ImageTestAssets, VideoTestAssets, VllmRunner)
|
||||
from ....utils import (create_new_process_for_each_test, large_gpu_mark,
|
||||
multi_gpu_marks)
|
||||
from ....conftest import (
|
||||
IMAGE_ASSETS,
|
||||
AudioTestAssets,
|
||||
HfRunner,
|
||||
ImageTestAssets,
|
||||
VideoTestAssets,
|
||||
VllmRunner,
|
||||
)
|
||||
from ....utils import create_new_process_for_each_test, large_gpu_mark, multi_gpu_marks
|
||||
from ...utils import check_outputs_equal
|
||||
from .vlm_utils import custom_inputs, model_utils, runners
|
||||
from .vlm_utils.case_filtering import get_parametrized_options
|
||||
from .vlm_utils.types import (CustomTestOptions, ExpandableVLMTestArgs,
|
||||
VLMTestInfo, VLMTestType)
|
||||
from .vlm_utils.types import (
|
||||
CustomTestOptions,
|
||||
ExpandableVLMTestArgs,
|
||||
VLMTestInfo,
|
||||
VLMTestType,
|
||||
)
|
||||
|
||||
# This hack is needed for phi3v & paligemma models
|
||||
# ROCm Triton FA can run into shared memory issues with these models,
|
||||
@@ -828,7 +841,7 @@ def _mark_splits(
|
||||
new_test_settings = dict[str, VLMTestInfo]()
|
||||
|
||||
for i in range(num_groups):
|
||||
models_in_group = models[i * split_size:(i + 1) * split_size]
|
||||
models_in_group = models[i * split_size : (i + 1) * split_size]
|
||||
|
||||
for model in models_in_group:
|
||||
for info in test_infos_by_model[model]:
|
||||
@@ -859,7 +872,8 @@ VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.IMAGE,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
),
|
||||
)
|
||||
def test_single_image_models(
|
||||
tmp_path: PosixPath,
|
||||
model_type: str,
|
||||
@@ -885,7 +899,8 @@ def test_single_image_models(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.MULTI_IMAGE,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
),
|
||||
)
|
||||
def test_multi_image_models(
|
||||
tmp_path: PosixPath,
|
||||
model_type: str,
|
||||
@@ -911,7 +926,8 @@ def test_multi_image_models(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.EMBEDDING,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
),
|
||||
)
|
||||
def test_image_embedding_models(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
@@ -935,7 +951,8 @@ def test_image_embedding_models(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.VIDEO,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
),
|
||||
)
|
||||
def test_video_models(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
@@ -959,7 +976,8 @@ def test_video_models(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.AUDIO,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
),
|
||||
)
|
||||
def test_audio_models(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
@@ -983,7 +1001,8 @@ def test_audio_models(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
create_new_process_for_each_test=False,
|
||||
))
|
||||
),
|
||||
)
|
||||
def test_custom_inputs_models(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
@@ -1006,7 +1025,8 @@ def test_custom_inputs_models(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.IMAGE,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
),
|
||||
)
|
||||
@create_new_process_for_each_test()
|
||||
def test_single_image_models_heavy(
|
||||
tmp_path: PosixPath,
|
||||
@@ -1033,7 +1053,8 @@ def test_single_image_models_heavy(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.MULTI_IMAGE,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
),
|
||||
)
|
||||
@create_new_process_for_each_test()
|
||||
def test_multi_image_models_heavy(
|
||||
tmp_path: PosixPath,
|
||||
@@ -1060,7 +1081,8 @@ def test_multi_image_models_heavy(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.EMBEDDING,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
),
|
||||
)
|
||||
@create_new_process_for_each_test()
|
||||
def test_image_embedding_models_heavy(
|
||||
model_type: str,
|
||||
@@ -1085,7 +1107,8 @@ def test_image_embedding_models_heavy(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.VIDEO,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
),
|
||||
)
|
||||
def test_video_models_heavy(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
@@ -1109,7 +1132,8 @@ def test_video_models_heavy(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.AUDIO,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
),
|
||||
)
|
||||
def test_audio_models_heavy(
|
||||
model_type: str,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
@@ -1133,7 +1157,8 @@ def test_audio_models_heavy(
|
||||
VLM_TEST_SETTINGS,
|
||||
test_type=VLMTestType.CUSTOM_INPUTS,
|
||||
create_new_process_for_each_test=True,
|
||||
))
|
||||
),
|
||||
)
|
||||
@create_new_process_for_each_test()
|
||||
def test_custom_inputs_models_heavy(
|
||||
model_type: str,
|
||||
|
||||
@@ -10,8 +10,7 @@ from transformers import AutoModelForSpeechSeq2Seq
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.lora.request import LoRARequest
|
||||
|
||||
from ....conftest import (AudioTestAssets, HfRunner, PromptAudioInput,
|
||||
VllmRunner)
|
||||
from ....conftest import AudioTestAssets, HfRunner, PromptAudioInput, VllmRunner
|
||||
from ...registry import HF_EXAMPLE_MODELS
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
@@ -64,50 +63,49 @@ def run_test(
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=1,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"audio": 1},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enable_lora=True,
|
||||
max_lora_rank=64,
|
||||
enforce_eager=True,
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=1,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"audio": 1},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enable_lora=True,
|
||||
max_lora_rank=64,
|
||||
enforce_eager=True,
|
||||
) as vllm_model:
|
||||
lora_request = LoRARequest("audio", 1, audio_lora_path)
|
||||
vllm_outputs_per_case = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
audios=audios,
|
||||
lora_request=lora_request)
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
audios=audios,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
for prompts, audios in inputs
|
||||
]
|
||||
|
||||
with hf_runner(model, dtype=dtype,
|
||||
auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
|
||||
|
||||
with hf_runner(model, dtype=dtype, auto_cls=AutoModelForSpeechSeq2Seq) as hf_model:
|
||||
hf_processor = hf_model.processor
|
||||
eos_token_id = hf_processor.tokenizer.eos_token_id
|
||||
|
||||
hf_outputs_per_case = [
|
||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
audios=[audios],
|
||||
eos_token_id=eos_token_id)
|
||||
hf_model.generate_greedy_logprobs_limit(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
audios=[audios],
|
||||
eos_token_id=eos_token_id,
|
||||
)
|
||||
for prompts, audios in inputs
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
|
||||
vllm_outputs_per_case):
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=[
|
||||
vllm_to_hf_output(output) for output in vllm_outputs
|
||||
],
|
||||
outputs_1_lst=[vllm_to_hf_output(output) for output in vllm_outputs],
|
||||
name_0="hf",
|
||||
name_1="vllm",
|
||||
)
|
||||
@@ -118,9 +116,16 @@ def run_test(
|
||||
@pytest.mark.parametrize("max_model_len", [2048])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_models(hf_runner, vllm_runner, model: str,
|
||||
audio_assets: AudioTestAssets, dtype: str, max_model_len: int,
|
||||
max_tokens: int, num_logprobs: int) -> None:
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model: str,
|
||||
audio_assets: AudioTestAssets,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
model_info = HF_EXAMPLE_MODELS.find_hf_info(model)
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
@@ -28,8 +28,7 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
|
||||
give the same result.
|
||||
"""
|
||||
|
||||
image_cherry = convert_image_mode(
|
||||
ImageAsset("cherry_blossom").pil_image, "RGB")
|
||||
image_cherry = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
|
||||
image_stop = convert_image_mode(ImageAsset("stop_sign").pil_image, "RGB")
|
||||
images = [image_cherry, image_stop]
|
||||
video = VideoAsset(name="baby_reading", num_frames=16).np_ndarrays
|
||||
@@ -47,29 +46,30 @@ def test_models(vllm_runner, model, dtype: str, max_tokens: int) -> None:
|
||||
),
|
||||
]
|
||||
|
||||
with vllm_runner(model,
|
||||
runner="generate",
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": 2},
|
||||
max_model_len=32768,
|
||||
max_num_seqs=2,
|
||||
tensor_parallel_size=1,
|
||||
enforce_eager=True) as vllm_model:
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": 2},
|
||||
max_model_len=32768,
|
||||
max_num_seqs=2,
|
||||
tensor_parallel_size=1,
|
||||
enforce_eager=True,
|
||||
) as vllm_model:
|
||||
vllm_outputs_per_case = [
|
||||
vllm_model.generate_greedy(prompts,
|
||||
max_tokens,
|
||||
images=images,
|
||||
videos=videos)
|
||||
vllm_model.generate_greedy(
|
||||
prompts, max_tokens, images=images, videos=videos
|
||||
)
|
||||
for prompts, images, videos in inputs
|
||||
]
|
||||
|
||||
all_results = [output[0][1] for output in vllm_outputs_per_case]
|
||||
outputs = [(total_str, total_str.find("assistant\n") + len("assistant\n"))
|
||||
for total_str in all_results]
|
||||
prompt_lengths = [prompt_len for _, prompt_len in outputs]
|
||||
generated_strs = [
|
||||
total_str[prompt_len:] for total_str, prompt_len in outputs
|
||||
outputs = [
|
||||
(total_str, total_str.find("assistant\n") + len("assistant\n"))
|
||||
for total_str in all_results
|
||||
]
|
||||
prompt_lengths = [prompt_len for _, prompt_len in outputs]
|
||||
generated_strs = [total_str[prompt_len:] for total_str, prompt_len in outputs]
|
||||
interleaved_prompt_len, noninterleaved_prompt_len = prompt_lengths
|
||||
interleaved_output_str, noninterleaved_output_str = generated_strs
|
||||
|
||||
|
||||
@@ -18,13 +18,11 @@ from typing import Any
|
||||
import pytest
|
||||
import torch
|
||||
from safetensors.torch import save_file
|
||||
from transformers import (AutoConfig, AutoProcessor, AutoTokenizer,
|
||||
GenerationConfig)
|
||||
from transformers import AutoConfig, AutoProcessor, AutoTokenizer, GenerationConfig
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.v1.executor.abstract import Executor
|
||||
from vllm.v1.kv_cache_interface import (ChunkedLocalAttentionSpec,
|
||||
FullAttentionSpec)
|
||||
from vllm.v1.kv_cache_interface import ChunkedLocalAttentionSpec, FullAttentionSpec
|
||||
|
||||
from ....utils import multi_gpu_test
|
||||
|
||||
@@ -93,8 +91,7 @@ def get_rope_layers_config(model_path: str) -> list[int]:
|
||||
|
||||
|
||||
def create_reduced_maverick_model(
|
||||
original_model_name:
|
||||
str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||
original_model_name: str = "meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8",
|
||||
output_dir: str = "/tmp/reduced_maverick",
|
||||
text_layers: int = 4,
|
||||
num_experts: int = 4,
|
||||
@@ -118,7 +115,8 @@ def create_reduced_maverick_model(
|
||||
|
||||
print(
|
||||
f"Creating reduced Maverick model with {text_layers} text layers and "
|
||||
f"{vision_layers} vision layers...")
|
||||
f"{vision_layers} vision layers..."
|
||||
)
|
||||
|
||||
# Create output directory
|
||||
output_path = Path(output_dir)
|
||||
@@ -126,19 +124,23 @@ def create_reduced_maverick_model(
|
||||
if force_recreate:
|
||||
shutil.rmtree(output_path)
|
||||
else:
|
||||
print(f"Output directory {output_dir} already exists. "
|
||||
"Use --force-recreate to overwrite.")
|
||||
print(
|
||||
f"Output directory {output_dir} already exists. "
|
||||
"Use --force-recreate to overwrite."
|
||||
)
|
||||
return str(output_path)
|
||||
|
||||
output_path.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
try:
|
||||
print("Loading original model configuration...")
|
||||
original_config = AutoConfig.from_pretrained(original_model_name,
|
||||
trust_remote_code=True)
|
||||
original_config = AutoConfig.from_pretrained(
|
||||
original_model_name, trust_remote_code=True
|
||||
)
|
||||
print("Creating reduced configuration...")
|
||||
reduced_config = create_reduced_config(original_config, text_layers,
|
||||
num_experts, vision_layers)
|
||||
reduced_config = create_reduced_config(
|
||||
original_config, text_layers, num_experts, vision_layers
|
||||
)
|
||||
|
||||
config_path = output_path / "config.json"
|
||||
with open(config_path, "w") as f:
|
||||
@@ -149,8 +151,7 @@ def create_reduced_maverick_model(
|
||||
copy_tokenizer_files(original_model_name, output_path)
|
||||
|
||||
print("Creating reduced safetensors files...")
|
||||
create_reduced_safetensors(original_config, reduced_config,
|
||||
output_path)
|
||||
create_reduced_safetensors(original_config, reduced_config, output_path)
|
||||
|
||||
print("Creating preprocessor config...")
|
||||
create_preprocessor_config(original_config, output_path)
|
||||
@@ -173,9 +174,9 @@ def create_reduced_maverick_model(
|
||||
raise
|
||||
|
||||
|
||||
def create_reduced_config(original_config: Any, text_layers: int,
|
||||
num_experts: int,
|
||||
vision_layers: int) -> dict[str, Any]:
|
||||
def create_reduced_config(
|
||||
original_config: Any, text_layers: int, num_experts: int, vision_layers: int
|
||||
) -> dict[str, Any]:
|
||||
"""Create a reduced configuration based on the original."""
|
||||
|
||||
# Convert config to dictionary
|
||||
@@ -185,23 +186,18 @@ def create_reduced_config(original_config: Any, text_layers: int,
|
||||
if "text_config" in config_dict:
|
||||
original_text_layers = config_dict["text_config"]["num_hidden_layers"]
|
||||
config_dict["text_config"]["num_hidden_layers"] = text_layers
|
||||
print(
|
||||
f"Reduced text layers from {original_text_layers} to {text_layers}"
|
||||
)
|
||||
print(f"Reduced text layers from {original_text_layers} to {text_layers}")
|
||||
|
||||
original_num_experts = config_dict["text_config"]["num_local_experts"]
|
||||
config_dict["text_config"]["num_local_experts"] = num_experts
|
||||
print(
|
||||
f"Reduced num experts from {original_num_experts} to {num_experts}"
|
||||
)
|
||||
print(f"Reduced num experts from {original_num_experts} to {num_experts}")
|
||||
|
||||
hidden_dim_divisor = 4
|
||||
|
||||
original_hidden_size = config_dict["text_config"]["hidden_size"]
|
||||
new_hidden_size = original_hidden_size // hidden_dim_divisor
|
||||
config_dict["text_config"]["hidden_size"] = new_hidden_size
|
||||
print(f"Reduced hidden size from {original_hidden_size} to "
|
||||
f"{new_hidden_size}")
|
||||
print(f"Reduced hidden size from {original_hidden_size} to {new_hidden_size}")
|
||||
|
||||
original_head_dim = config_dict["text_config"]["head_dim"]
|
||||
new_head_dim = original_head_dim // hidden_dim_divisor
|
||||
@@ -210,15 +206,12 @@ def create_reduced_config(original_config: Any, text_layers: int,
|
||||
|
||||
# Reduce vision layers
|
||||
if "vision_config" in config_dict:
|
||||
original_vision_layers = config_dict["vision_config"][
|
||||
"num_hidden_layers"]
|
||||
original_vision_layers = config_dict["vision_config"]["num_hidden_layers"]
|
||||
config_dict["vision_config"]["num_hidden_layers"] = vision_layers
|
||||
print(f"Reduced vision layers from {original_vision_layers} "
|
||||
f"to {vision_layers}")
|
||||
print(f"Reduced vision layers from {original_vision_layers} to {vision_layers}")
|
||||
|
||||
# Update model name to indicate it's a reduced version
|
||||
config_dict["_name_or_path"] = (
|
||||
f"reduced_maverick_{text_layers}t_{vision_layers}v")
|
||||
config_dict["_name_or_path"] = f"reduced_maverick_{text_layers}t_{vision_layers}v"
|
||||
|
||||
return config_dict
|
||||
|
||||
@@ -227,16 +220,16 @@ def copy_tokenizer_files(original_model_name: str, output_path: Path) -> None:
|
||||
"""Copy tokenizer files from the original model."""
|
||||
|
||||
try:
|
||||
tokenizer = AutoTokenizer.from_pretrained(original_model_name,
|
||||
trust_remote_code=True)
|
||||
tokenizer = AutoTokenizer.from_pretrained(
|
||||
original_model_name, trust_remote_code=True
|
||||
)
|
||||
tokenizer.save_pretrained(output_path)
|
||||
print("Tokenizer files copied successfully")
|
||||
except Exception as e:
|
||||
print(f"Warning: Could not copy tokenizer files: {e}")
|
||||
|
||||
|
||||
def create_preprocessor_config(original_config: Any,
|
||||
output_path: Path) -> None:
|
||||
def create_preprocessor_config(original_config: Any, output_path: Path) -> None:
|
||||
"""Create preprocessor_config.json for multimodal model."""
|
||||
|
||||
# Try to load the original preprocessor config
|
||||
@@ -254,9 +247,9 @@ def create_preprocessor_config(original_config: Any,
|
||||
raise
|
||||
|
||||
|
||||
def create_reduced_safetensors(original_config: Any, reduced_config: dict[str,
|
||||
Any],
|
||||
output_path: Path) -> None:
|
||||
def create_reduced_safetensors(
|
||||
original_config: Any, reduced_config: dict[str, Any], output_path: Path
|
||||
) -> None:
|
||||
"""Create safetensors files with weights for the reduced model."""
|
||||
|
||||
print("Generating synthetic weights for reduced model...")
|
||||
@@ -279,8 +272,7 @@ def create_reduced_safetensors(original_config: Any, reduced_config: dict[str,
|
||||
save_weights_to_safetensors(weights, output_path)
|
||||
|
||||
|
||||
def create_text_model_weights(
|
||||
text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
|
||||
def create_text_model_weights(text_config: dict[str, Any]) -> dict[str, torch.Tensor]:
|
||||
"""Create synthetic weights for the text model with MoE structure."""
|
||||
|
||||
weights = {}
|
||||
@@ -291,19 +283,18 @@ def create_text_model_weights(
|
||||
intermediate_size_mlp = text_config["intermediate_size_mlp"]
|
||||
num_layers = text_config["num_hidden_layers"]
|
||||
num_attention_heads = text_config["num_attention_heads"]
|
||||
num_key_value_heads = text_config.get("num_key_value_heads",
|
||||
num_attention_heads)
|
||||
num_key_value_heads = text_config.get("num_key_value_heads", num_attention_heads)
|
||||
|
||||
# MoE specific parameters
|
||||
num_experts = text_config.get("num_local_experts")
|
||||
assert (num_experts
|
||||
is not None), "num_local_experts must be specified for MoE"
|
||||
assert num_experts is not None, "num_local_experts must be specified for MoE"
|
||||
|
||||
head_dim = hidden_size // num_attention_heads
|
||||
|
||||
# Embedding layers
|
||||
weights["language_model.model.embed_tokens.weight"] = torch.randn(
|
||||
vocab_size, hidden_size, dtype=torch.float16)
|
||||
vocab_size, hidden_size, dtype=torch.float16
|
||||
)
|
||||
|
||||
# Transformer layers
|
||||
for layer_idx in range(num_layers):
|
||||
@@ -312,95 +303,105 @@ def create_text_model_weights(
|
||||
|
||||
# Self-attention weights (separate q, k, v projections)
|
||||
weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
|
||||
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16)
|
||||
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
|
||||
hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16)
|
||||
hidden_size, num_key_value_heads * head_dim, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
|
||||
num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16)
|
||||
num_key_value_heads * head_dim, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
|
||||
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16)
|
||||
hidden_size, num_attention_heads * head_dim, dtype=torch.bfloat16
|
||||
)
|
||||
print("Self-attention weights created.")
|
||||
|
||||
# Feed-forward weights - MoE pattern based on interleave_moe_layer_step
|
||||
# For interleave_moe_layer_step=2: layers 1,3,5,... are MoE, layers
|
||||
# 0,2,4,... are dense
|
||||
interleave_step = text_config.get("interleave_moe_layer_step", 1)
|
||||
is_moe_layer = (interleave_step > 0
|
||||
and (layer_idx + 1) % interleave_step == 0)
|
||||
is_moe_layer = interleave_step > 0 and (layer_idx + 1) % interleave_step == 0
|
||||
|
||||
if is_moe_layer:
|
||||
# MoE layer structure
|
||||
# 1. Router weights
|
||||
weights[
|
||||
f"{layer_prefix}.feed_forward.router.weight"] = torch.randn(
|
||||
num_experts, hidden_size, dtype=torch.float16)
|
||||
weights[f"{layer_prefix}.feed_forward.router.weight"] = torch.randn(
|
||||
num_experts, hidden_size, dtype=torch.float16
|
||||
)
|
||||
|
||||
# 2. Individual expert weights (not fused)
|
||||
for expert_idx in range(num_experts):
|
||||
expert_prefix = (
|
||||
f"{layer_prefix}.feed_forward.experts.{expert_idx}")
|
||||
expert_prefix = f"{layer_prefix}.feed_forward.experts.{expert_idx}"
|
||||
|
||||
weights[f"{expert_prefix}.gate_proj.weight"] = torch.randn(
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16)
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{expert_prefix}.up_proj.weight"] = torch.randn(
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16)
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{expert_prefix}.down_proj.weight"] = torch.randn(
|
||||
hidden_size, intermediate_size, dtype=torch.bfloat16)
|
||||
hidden_size, intermediate_size, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
# Expert weight scales (FP8 quantization)
|
||||
weights[
|
||||
f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
|
||||
intermediate_size, 1, dtype=torch.bfloat16)
|
||||
weights[f"{expert_prefix}.gate_proj.weight_scale"] = torch.ones(
|
||||
intermediate_size, 1, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{expert_prefix}.up_proj.weight_scale"] = torch.ones(
|
||||
intermediate_size, 1, dtype=torch.bfloat16)
|
||||
weights[
|
||||
f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
|
||||
hidden_size, 1, dtype=torch.bfloat16)
|
||||
intermediate_size, 1, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{expert_prefix}.down_proj.weight_scale"] = torch.ones(
|
||||
hidden_size, 1, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
# 3. Shared expert weights
|
||||
shared_expert_prefix = f"{layer_prefix}.feed_forward.shared_expert"
|
||||
weights[f"{shared_expert_prefix}.gate_proj.weight"] = torch.randn(
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16)
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{shared_expert_prefix}.up_proj.weight"] = torch.randn(
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16)
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{shared_expert_prefix}.down_proj.weight"] = torch.randn(
|
||||
hidden_size, intermediate_size, dtype=torch.bfloat16)
|
||||
hidden_size, intermediate_size, dtype=torch.bfloat16
|
||||
)
|
||||
print(f"MoE feed-forward weights created for layer {layer_idx}.")
|
||||
else:
|
||||
# Dense layer structure
|
||||
weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = (
|
||||
torch.randn(intermediate_size_mlp,
|
||||
hidden_size,
|
||||
dtype=torch.bfloat16))
|
||||
weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = (
|
||||
torch.randn(intermediate_size_mlp,
|
||||
hidden_size,
|
||||
dtype=torch.bfloat16))
|
||||
weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = (
|
||||
torch.randn(hidden_size,
|
||||
intermediate_size_mlp,
|
||||
dtype=torch.bfloat16))
|
||||
weights[f"{layer_prefix}.feed_forward.gate_proj.weight"] = torch.randn(
|
||||
intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.feed_forward.up_proj.weight"] = torch.randn(
|
||||
intermediate_size_mlp, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.feed_forward.down_proj.weight"] = torch.randn(
|
||||
hidden_size, intermediate_size_mlp, dtype=torch.bfloat16
|
||||
)
|
||||
print(f"Dense feed-forward weights created for layer {layer_idx}.")
|
||||
|
||||
# Layer norms
|
||||
weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
weights[
|
||||
f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
print("Layer norms created.")
|
||||
|
||||
# Final layer norm and output projection
|
||||
weights["language_model.model.norm.weight"] = torch.ones(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights["language_model.lm_head.weight"] = torch.randn(
|
||||
vocab_size, hidden_size, dtype=torch.bfloat16)
|
||||
vocab_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
return weights
|
||||
|
||||
|
||||
def create_vision_model_weights(
|
||||
vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
|
||||
vision_config: dict[str, Any],
|
||||
) -> dict[str, torch.Tensor]:
|
||||
"""Create synthetic weights for the vision model."""
|
||||
|
||||
weights = {}
|
||||
@@ -414,47 +415,62 @@ def create_vision_model_weights(
|
||||
layer_prefix = f"vision_model.model.layers.{layer_idx}"
|
||||
|
||||
weights[f"{layer_prefix}.self_attn.q_proj.weight"] = torch.randn(
|
||||
hidden_size, hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.q_proj.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.k_proj.weight"] = torch.randn(
|
||||
hidden_size, hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.k_proj.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.v_proj.weight"] = torch.randn(
|
||||
hidden_size, hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.v_proj.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.o_proj.weight"] = torch.randn(
|
||||
hidden_size, hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.self_attn.o_proj.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
weights[f"{layer_prefix}.mlp.fc1.weight"] = torch.randn(
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16)
|
||||
intermediate_size, hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.mlp.fc1.bias"] = torch.zeros(
|
||||
intermediate_size, dtype=torch.bfloat16)
|
||||
intermediate_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.mlp.fc2.weight"] = torch.randn(
|
||||
hidden_size, intermediate_size, dtype=torch.bfloat16)
|
||||
hidden_size, intermediate_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.mlp.fc2.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
weights[f"{layer_prefix}.input_layernorm.weight"] = torch.ones(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.input_layernorm.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
weights[
|
||||
f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.post_attention_layernorm.weight"] = torch.ones(
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
weights[f"{layer_prefix}.post_attention_layernorm.bias"] = torch.zeros(
|
||||
hidden_size, dtype=torch.bfloat16)
|
||||
hidden_size, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
return weights
|
||||
|
||||
|
||||
def create_shared_weights(
|
||||
text_config: dict[str, Any],
|
||||
vision_config: dict[str, Any]) -> dict[str, torch.Tensor]:
|
||||
text_config: dict[str, Any], vision_config: dict[str, Any]
|
||||
) -> dict[str, torch.Tensor]:
|
||||
"""Create weights for shared components (vision-language connector)"""
|
||||
|
||||
weights = {}
|
||||
@@ -464,13 +480,15 @@ def create_shared_weights(
|
||||
|
||||
# Vision-language connector (projects vision features to text space)
|
||||
weights["multi_modal_projector.linear_1.weight"] = torch.randn(
|
||||
text_hidden_size, projector_input_dim, dtype=torch.bfloat16)
|
||||
text_hidden_size, projector_input_dim, dtype=torch.bfloat16
|
||||
)
|
||||
|
||||
return weights
|
||||
|
||||
|
||||
def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
|
||||
output_path: Path) -> None:
|
||||
def save_weights_to_safetensors(
|
||||
weights: dict[str, torch.Tensor], output_path: Path
|
||||
) -> None:
|
||||
"""Save weights to safetensors files and create index."""
|
||||
|
||||
# Determine how to shard the weights
|
||||
@@ -507,18 +525,18 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
|
||||
else:
|
||||
# Multiple shards
|
||||
for i, shard in enumerate(shards):
|
||||
filename = f"model-{i+1:05d}-of-{len(shards):05d}.safetensors"
|
||||
filename = f"model-{i + 1:05d}-of-{len(shards):05d}.safetensors"
|
||||
save_file(shard, output_path / filename)
|
||||
for name in shard:
|
||||
weight_map[name] = filename
|
||||
print(f"Saved shard {i+1}/{len(shards)}: {filename}")
|
||||
print(f"Saved shard {i + 1}/{len(shards)}: {filename}")
|
||||
|
||||
# Create index file
|
||||
index_data = {
|
||||
"metadata": {
|
||||
"total_size":
|
||||
sum(tensor.numel() * tensor.element_size()
|
||||
for tensor in weights.values())
|
||||
"total_size": sum(
|
||||
tensor.numel() * tensor.element_size() for tensor in weights.values()
|
||||
)
|
||||
},
|
||||
"weight_map": weight_map,
|
||||
}
|
||||
@@ -528,8 +546,9 @@ def save_weights_to_safetensors(weights: dict[str, torch.Tensor],
|
||||
json.dump(index_data, f, indent=2)
|
||||
|
||||
print(f"Created index file: {index_path}")
|
||||
print(f"Total model size: "
|
||||
f"{index_data['metadata']['total_size'] / (1024**3):.2f} GB")
|
||||
print(
|
||||
f"Total model size: {index_data['metadata']['total_size'] / (1024**3):.2f} GB"
|
||||
)
|
||||
|
||||
|
||||
def check_attention_spec_interleaved_rope(
|
||||
@@ -540,8 +559,7 @@ def check_attention_spec_interleaved_rope(
|
||||
):
|
||||
"""Check that the attention spec is correct."""
|
||||
assert isinstance(llm.llm_engine.model_executor, Executor)
|
||||
kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs(
|
||||
)
|
||||
kv_cache_specs_per_rank = llm.llm_engine.model_executor.get_kv_cache_specs()
|
||||
for rank in range(num_ranks):
|
||||
kv_cache_specs = kv_cache_specs_per_rank[rank]
|
||||
assert len(kv_cache_specs.keys()) == num_attention_layers
|
||||
@@ -551,16 +569,14 @@ def check_attention_spec_interleaved_rope(
|
||||
else:
|
||||
expected_spec = ChunkedLocalAttentionSpec
|
||||
assert isinstance(
|
||||
kv_cache_specs[
|
||||
f"language_model.model.layers.{i}.self_attn.attn"],
|
||||
expected_spec)
|
||||
kv_cache_specs[f"language_model.model.layers.{i}.self_attn.attn"],
|
||||
expected_spec,
|
||||
)
|
||||
|
||||
|
||||
def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
|
||||
"""Test the created reduced model with vLLM."""
|
||||
sampling_params = SamplingParams(temperature=0.8,
|
||||
top_p=0.95,
|
||||
max_tokens=50)
|
||||
sampling_params = SamplingParams(temperature=0.8, top_p=0.95, max_tokens=50)
|
||||
|
||||
if should_profile:
|
||||
llm.start_profile()
|
||||
@@ -571,15 +587,15 @@ def run_reduced_model(llm: LLM, should_profile: bool = False) -> None:
|
||||
print("Test generation successful!")
|
||||
for output in outputs:
|
||||
print(f"Prompt: {output.prompt}")
|
||||
print(f"Output: "
|
||||
f"{output.outputs[0].text}")
|
||||
print(f"Output: {output.outputs[0].text}")
|
||||
print("-" * 40)
|
||||
|
||||
|
||||
@multi_gpu_test(num_gpus=2)
|
||||
@pytest.mark.parametrize(
|
||||
"original_model_name,text_layers,num_experts,vision_layers,",
|
||||
[("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)])
|
||||
[("meta-llama/Llama-4-Maverick-17B-128E-Instruct-FP8", 4, 4, 2)],
|
||||
)
|
||||
@pytest.mark.parametrize("enforce_eager", [True, False])
|
||||
@pytest.mark.parametrize("tp,ep", [(2, True)])
|
||||
@pytest.mark.skipif(not torch.cuda.is_available(), reason="CUDA not available")
|
||||
@@ -640,7 +656,8 @@ def main():
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(
|
||||
description="Create a reduced-layer Maverick model")
|
||||
description="Create a reduced-layer Maverick model"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--output-dir",
|
||||
default="/tmp/reduced_maverick",
|
||||
@@ -652,10 +669,7 @@ def main():
|
||||
default=4,
|
||||
help="Number of text transformer layers",
|
||||
)
|
||||
parser.add_argument("--num-experts",
|
||||
type=int,
|
||||
default=4,
|
||||
help="Number of experts")
|
||||
parser.add_argument("--num-experts", type=int, default=4, help="Number of experts")
|
||||
parser.add_argument(
|
||||
"--vision-layers",
|
||||
type=int,
|
||||
@@ -667,12 +681,12 @@ def main():
|
||||
action="store_true",
|
||||
help="Force recreation if output directory exists",
|
||||
)
|
||||
parser.add_argument("--test",
|
||||
action="store_true",
|
||||
help="Test the created model with vLLM")
|
||||
parser.add_argument("--profile",
|
||||
action="store_true",
|
||||
help="Profile the created model with vLLM")
|
||||
parser.add_argument(
|
||||
"--test", action="store_true", help="Test the created model with vLLM"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--profile", action="store_true", help="Profile the created model with vLLM"
|
||||
)
|
||||
parser.add_argument(
|
||||
"--test-original",
|
||||
action="store_true",
|
||||
@@ -687,16 +701,18 @@ def main():
|
||||
args = parser.parse_args()
|
||||
|
||||
if args.test:
|
||||
test_dummy_maverick(original_model_name=args.original_model,
|
||||
output_dir=args.output_dir,
|
||||
text_layers=args.text_layers,
|
||||
num_experts=args.num_experts,
|
||||
vision_layers=args.vision_layers,
|
||||
force_recreate=args.force_recreate,
|
||||
tp=2,
|
||||
ep=True,
|
||||
enforce_eager=True,
|
||||
profile=args.profile)
|
||||
test_dummy_maverick(
|
||||
original_model_name=args.original_model,
|
||||
output_dir=args.output_dir,
|
||||
text_layers=args.text_layers,
|
||||
num_experts=args.num_experts,
|
||||
vision_layers=args.vision_layers,
|
||||
force_recreate=args.force_recreate,
|
||||
tp=2,
|
||||
ep=True,
|
||||
enforce_eager=True,
|
||||
profile=args.profile,
|
||||
)
|
||||
|
||||
if args.test_original:
|
||||
run_maverick_serving(args.original_model)
|
||||
|
||||
@@ -14,26 +14,35 @@ from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
|
||||
PromptImageInput, VllmRunner)
|
||||
from ....conftest import (
|
||||
IMAGE_ASSETS,
|
||||
HfRunner,
|
||||
PromptAudioInput,
|
||||
PromptImageInput,
|
||||
VllmRunner,
|
||||
)
|
||||
from ....utils import large_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
"<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
"cherry_blossom":
|
||||
"<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
})
|
||||
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<|user|>\n<|image|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
"cherry_blossom": "<|user|>\n<|image|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
}
|
||||
)
|
||||
HF_MULTIIMAGE_IMAGE_PROMPT = (
|
||||
"<|user|>\n<|image|>\n<|image|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
|
||||
)
|
||||
|
||||
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct",
|
||||
revision="refs/pr/70")
|
||||
model_path = snapshot_download(
|
||||
"microsoft/Phi-4-multimodal-instruct", revision="refs/pr/70"
|
||||
)
|
||||
# Since the vision-lora and speech-lora co-exist with the base model,
|
||||
# we have to manually specify the path of the lora weights.
|
||||
vision_lora_path = os.path.join(model_path, "vision-lora")
|
||||
speech_question = os.path.join(model_path, "examples",
|
||||
"what_is_shown_in_this_image.wav")
|
||||
speech_question = os.path.join(
|
||||
model_path, "examples", "what_is_shown_in_this_image.wav"
|
||||
)
|
||||
models = [model_path]
|
||||
|
||||
target_dtype = "half"
|
||||
@@ -48,8 +57,7 @@ if current_platform.is_rocm():
|
||||
def run_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
inputs: Sequence[tuple[list[str], PromptImageInput,
|
||||
Optional[PromptAudioInput]]],
|
||||
inputs: Sequence[tuple[list[str], PromptImageInput, Optional[PromptAudioInput]]],
|
||||
model: str,
|
||||
*,
|
||||
max_model_len: int,
|
||||
@@ -75,28 +83,30 @@ def run_test(
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(
|
||||
model,
|
||||
task="generate",
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=2,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": mm_limit},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enable_lora=True,
|
||||
max_lora_rank=320,
|
||||
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
|
||||
enforce_eager=True,
|
||||
trust_remote_code=False,
|
||||
model,
|
||||
task="generate",
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=2,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": mm_limit},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enable_lora=True,
|
||||
max_lora_rank=320,
|
||||
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
|
||||
enforce_eager=True,
|
||||
trust_remote_code=False,
|
||||
) as vllm_model:
|
||||
lora_request = LoRARequest("vision", 1, vision_lora_path)
|
||||
vllm_outputs_per_case = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
lora_request=lora_request)
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
for prompts, images, audios in inputs
|
||||
]
|
||||
|
||||
@@ -108,17 +118,18 @@ def run_test(
|
||||
hf_processor = hf_model.processor
|
||||
eos_token_id = hf_processor.tokenizer.eos_token_id
|
||||
hf_outputs_per_case = [
|
||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
eos_token_id=eos_token_id)
|
||||
hf_model.generate_greedy_logprobs_limit(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
eos_token_id=eos_token_id,
|
||||
)
|
||||
for prompts, images, audios in inputs
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
|
||||
vllm_outputs_per_case):
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
@@ -145,16 +156,27 @@ def run_test(
|
||||
@pytest.mark.parametrize("max_model_len", [12800])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
dtype: str, max_model_len: int, max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_image = [(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
None,
|
||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||
inputs_per_image = [
|
||||
(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
None,
|
||||
)
|
||||
for image, prompt in zip(images, HF_IMAGE_PROMPTS)
|
||||
]
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
@@ -189,16 +211,26 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
@pytest.mark.parametrize("max_model_len", [25600])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
|
||||
size_factors, dtype: str, max_model_len: int,
|
||||
max_tokens: int, num_logprobs: int) -> None:
|
||||
def test_multi_images_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_case = [
|
||||
(
|
||||
[HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
|
||||
[[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors],
|
||||
[
|
||||
[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors
|
||||
],
|
||||
None,
|
||||
),
|
||||
]
|
||||
@@ -222,10 +254,15 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
|
||||
@pytest.mark.parametrize("max_model_len", [12800])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
|
||||
max_model_len: int, max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
|
||||
def test_vision_speech_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
# use the example speech question so that the model outputs are reasonable
|
||||
audio = librosa.load(speech_question, sr=16000)
|
||||
image = ImageAsset("cherry_blossom").pil_image.convert("RGB")
|
||||
|
||||
@@ -17,31 +17,39 @@ from vllm.lora.request import LoRARequest
|
||||
from vllm.multimodal.image import convert_image_mode, rescale_image_size
|
||||
from vllm.platforms import current_platform
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, HfRunner, PromptAudioInput,
|
||||
PromptImageInput, VllmRunner)
|
||||
from ....conftest import (
|
||||
IMAGE_ASSETS,
|
||||
HfRunner,
|
||||
PromptAudioInput,
|
||||
PromptImageInput,
|
||||
VllmRunner,
|
||||
)
|
||||
from ....utils import large_gpu_test
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
"<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
"cherry_blossom":
|
||||
"<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
})
|
||||
HF_MULTIIMAGE_IMAGE_PROMPT = "<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
|
||||
HF_IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": "<|user|>\n<|image_1|>\nWhat's the content of the image?<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
"cherry_blossom": "<|user|>\n<|image_1|>\nPlease infer the season with reason in details.<|end|>\n<|assistant|>\n", # noqa: E501
|
||||
}
|
||||
)
|
||||
HF_MULTIIMAGE_IMAGE_PROMPT = (
|
||||
"<|user|>\n<|image_1|>\n<|image_2|>\nDescribe these images.<|end|>\n<|assistant|>\n" # noqa: E501
|
||||
)
|
||||
|
||||
model_path = snapshot_download("microsoft/Phi-4-multimodal-instruct")
|
||||
# Since the vision-lora and speech-lora co-exist with the base model,
|
||||
# we have to manually specify the path of the lora weights.
|
||||
vision_lora_path = os.path.join(model_path, "vision-lora")
|
||||
speech_question = os.path.join(model_path, "examples",
|
||||
"what_is_shown_in_this_image.wav")
|
||||
speech_question = os.path.join(
|
||||
model_path, "examples", "what_is_shown_in_this_image.wav"
|
||||
)
|
||||
models = [model_path]
|
||||
|
||||
|
||||
def vllm_to_hf_output(vllm_output: tuple[list[int], str,
|
||||
Optional[SampleLogprobs]],
|
||||
model: str):
|
||||
def vllm_to_hf_output(
|
||||
vllm_output: tuple[list[int], str, Optional[SampleLogprobs]], model: str
|
||||
):
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
_, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -71,8 +79,7 @@ if current_platform.is_rocm():
|
||||
def run_test(
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
inputs: Sequence[tuple[list[str], PromptImageInput,
|
||||
Optional[PromptAudioInput]]],
|
||||
inputs: Sequence[tuple[list[str], PromptImageInput, Optional[PromptAudioInput]]],
|
||||
model: str,
|
||||
*,
|
||||
max_model_len: int,
|
||||
@@ -98,27 +105,29 @@ def run_test(
|
||||
# will hurt multiprocessing backend with fork method (the default method).
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=2,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": mm_limit},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enable_lora=True,
|
||||
max_lora_rank=320,
|
||||
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
|
||||
enforce_eager=True,
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=2,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": mm_limit},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enable_lora=True,
|
||||
max_lora_rank=320,
|
||||
gpu_memory_utilization=0.8, # set to 0.8 to avoid OOM in CI
|
||||
enforce_eager=True,
|
||||
) as vllm_model:
|
||||
lora_request = LoRARequest("vision", 1, vision_lora_path)
|
||||
vllm_outputs_per_case = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
lora_request=lora_request)
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
lora_request=lora_request,
|
||||
)
|
||||
for prompts, images, audios in inputs
|
||||
]
|
||||
|
||||
@@ -127,42 +136,36 @@ def run_test(
|
||||
pytest.skip("HF impl is not compatible with current transformers")
|
||||
|
||||
hf_model_kwargs = {"_attn_implementation": "sdpa"}
|
||||
with hf_runner(model, dtype=dtype,
|
||||
model_kwargs=hf_model_kwargs) as hf_model:
|
||||
|
||||
with hf_runner(model, dtype=dtype, model_kwargs=hf_model_kwargs) as hf_model:
|
||||
hf_processor = hf_model.processor
|
||||
eos_token_id = hf_processor.tokenizer.eos_token_id
|
||||
|
||||
def patch_hf_processor(*args,
|
||||
text="",
|
||||
images=None,
|
||||
audio=None,
|
||||
sampling_rate=None,
|
||||
**kwargs):
|
||||
def patch_hf_processor(
|
||||
*args, text="", images=None, audio=None, sampling_rate=None, **kwargs
|
||||
):
|
||||
audios = None
|
||||
if audio is not None and sampling_rate is not None:
|
||||
audios = [(audio, sampling_rate)]
|
||||
return hf_processor(*args,
|
||||
text=text,
|
||||
images=images,
|
||||
audios=audios,
|
||||
**kwargs)
|
||||
return hf_processor(
|
||||
*args, text=text, images=images, audios=audios, **kwargs
|
||||
)
|
||||
|
||||
hf_model.processor = patch_hf_processor
|
||||
|
||||
hf_outputs_per_case = [
|
||||
hf_model.generate_greedy_logprobs_limit(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
eos_token_id=eos_token_id,
|
||||
num_logits_to_keep=0)
|
||||
hf_model.generate_greedy_logprobs_limit(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images,
|
||||
audios=audios,
|
||||
eos_token_id=eos_token_id,
|
||||
num_logits_to_keep=0,
|
||||
)
|
||||
for prompts, images, audios in inputs
|
||||
]
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case,
|
||||
vllm_outputs_per_case):
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_case, vllm_outputs_per_case):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=hf_outputs,
|
||||
outputs_1_lst=vllm_outputs,
|
||||
@@ -189,16 +192,27 @@ def run_test(
|
||||
@pytest.mark.parametrize("max_model_len", [12800])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
dtype: str, max_model_len: int, max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
def test_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_image = [(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
None,
|
||||
) for image, prompt in zip(images, HF_IMAGE_PROMPTS)]
|
||||
inputs_per_image = [
|
||||
(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
None,
|
||||
)
|
||||
for image, prompt in zip(images, HF_IMAGE_PROMPTS)
|
||||
]
|
||||
|
||||
run_test(
|
||||
hf_runner,
|
||||
@@ -233,16 +247,26 @@ def test_models(hf_runner, vllm_runner, image_assets, model, size_factors,
|
||||
@pytest.mark.parametrize("max_model_len", [25600])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
|
||||
size_factors, dtype: str, max_model_len: int,
|
||||
max_tokens: int, num_logprobs: int) -> None:
|
||||
def test_multi_images_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_case = [
|
||||
(
|
||||
[HF_MULTIIMAGE_IMAGE_PROMPT for _ in size_factors],
|
||||
[[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors],
|
||||
[
|
||||
[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors
|
||||
],
|
||||
None,
|
||||
),
|
||||
]
|
||||
@@ -266,10 +290,15 @@ def test_multi_images_models(hf_runner, vllm_runner, image_assets, model,
|
||||
@pytest.mark.parametrize("max_model_len", [12800])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_vision_speech_models(hf_runner, vllm_runner, model, dtype: str,
|
||||
max_model_len: int, max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
|
||||
def test_vision_speech_models(
|
||||
hf_runner,
|
||||
vllm_runner,
|
||||
model,
|
||||
dtype: str,
|
||||
max_model_len: int,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
# use the example speech question so that the model outputs are reasonable
|
||||
audio = librosa.load(speech_question, sr=None)
|
||||
image = convert_image_mode(ImageAsset("cherry_blossom").pil_image, "RGB")
|
||||
|
||||
@@ -37,33 +37,33 @@ PROMPT = "Describe each image in one short sentence."
|
||||
|
||||
|
||||
def _create_msg_format(urls: list[str]) -> list[dict[str, Any]]:
|
||||
return [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"text": PROMPT,
|
||||
}] + [{
|
||||
"type": "image_url",
|
||||
"image_url": {
|
||||
"url": url
|
||||
}
|
||||
} for url in urls],
|
||||
}]
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"text": PROMPT,
|
||||
}
|
||||
]
|
||||
+ [{"type": "image_url", "image_url": {"url": url}} for url in urls],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def _create_msg_format_hf(urls: list[str]) -> list[dict[str, Any]]:
|
||||
return [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [{
|
||||
"type": "text",
|
||||
"content": PROMPT,
|
||||
}, *({
|
||||
"type": "image",
|
||||
"image": download_image(url)
|
||||
} for url in urls)],
|
||||
}]
|
||||
return [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
{
|
||||
"type": "text",
|
||||
"content": PROMPT,
|
||||
},
|
||||
*({"type": "image", "image": download_image(url)} for url in urls),
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
|
||||
def _create_engine_inputs(urls: list[str]) -> TokensPrompt:
|
||||
@@ -125,11 +125,17 @@ def _dump_outputs_w_logprobs(
|
||||
outputs: OutputsLogprobs,
|
||||
filename: "StrPath",
|
||||
) -> None:
|
||||
json_data = [(tokens, text, [{
|
||||
k: asdict(v)
|
||||
for k, v in token_logprobs.items()
|
||||
} for token_logprobs in (logprobs or [])])
|
||||
for tokens, text, logprobs in outputs]
|
||||
json_data = [
|
||||
(
|
||||
tokens,
|
||||
text,
|
||||
[
|
||||
{k: asdict(v) for k, v in token_logprobs.items()}
|
||||
for token_logprobs in (logprobs or [])
|
||||
],
|
||||
)
|
||||
for tokens, text, logprobs in outputs
|
||||
]
|
||||
|
||||
with open(filename, "w") as f:
|
||||
json.dump(json_data, f)
|
||||
@@ -139,28 +145,35 @@ def load_outputs_w_logprobs(filename: "StrPath") -> OutputsLogprobs:
|
||||
with open(filename, "rb") as f:
|
||||
json_data = json.load(f)
|
||||
|
||||
return [(tokens, text, [{
|
||||
int(k): Logprob(**v)
|
||||
for k, v in token_logprobs.items()
|
||||
} for token_logprobs in logprobs]) for tokens, text, logprobs in json_data]
|
||||
return [
|
||||
(
|
||||
tokens,
|
||||
text,
|
||||
[
|
||||
{int(k): Logprob(**v) for k, v in token_logprobs.items()}
|
||||
for token_logprobs in logprobs
|
||||
],
|
||||
)
|
||||
for tokens, text, logprobs in json_data
|
||||
]
|
||||
|
||||
|
||||
@large_gpu_test(min_gb=80)
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_model_len", MAX_MODEL_LEN)
|
||||
@pytest.mark.parametrize("dtype", ["bfloat16"])
|
||||
def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
|
||||
local_asset_server) -> None:
|
||||
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(
|
||||
FIXTURE_LOGPROBS_CHAT[model])
|
||||
def test_chat(
|
||||
vllm_runner, max_model_len: int, model: str, dtype: str, local_asset_server
|
||||
) -> None:
|
||||
EXPECTED_CHAT_LOGPROBS = load_outputs_w_logprobs(FIXTURE_LOGPROBS_CHAT[model])
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="mistral",
|
||||
load_format="mistral",
|
||||
config_format="mistral",
|
||||
max_model_len=max_model_len,
|
||||
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
||||
model,
|
||||
dtype=dtype,
|
||||
tokenizer_mode="mistral",
|
||||
load_format="mistral",
|
||||
config_format="mistral",
|
||||
max_model_len=max_model_len,
|
||||
limit_mm_per_prompt=LIMIT_MM_PER_PROMPT,
|
||||
) as vllm_model:
|
||||
outputs = []
|
||||
|
||||
@@ -180,7 +193,9 @@ def test_chat(vllm_runner, max_model_len: int, model: str, dtype: str,
|
||||
for i in range(len(logprobs)):
|
||||
assert logprobs[i][-1] is None
|
||||
logprobs[i] = logprobs[i][:-1]
|
||||
check_logprobs_close(outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
|
||||
outputs_1_lst=logprobs,
|
||||
name_0="h100_ref",
|
||||
name_1="output")
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=EXPECTED_CHAT_LOGPROBS,
|
||||
outputs_1_lst=logprobs,
|
||||
name_0="h100_ref",
|
||||
name_1="output",
|
||||
)
|
||||
|
||||
@@ -17,14 +17,15 @@ def qwen2_5_vl_chat_template(*query):
|
||||
return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501
|
||||
|
||||
|
||||
VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
|
||||
"baby_reading":
|
||||
qwen2_5_vl_chat_template(
|
||||
VIDEO_PLACEHOLDER,
|
||||
"Describe this video with a short sentence ",
|
||||
"(no more than 20 words)",
|
||||
),
|
||||
})
|
||||
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
|
||||
{
|
||||
"baby_reading": qwen2_5_vl_chat_template(
|
||||
VIDEO_PLACEHOLDER,
|
||||
"Describe this video with a short sentence ",
|
||||
"(no more than 20 words)",
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
|
||||
@pytest.mark.core_model
|
||||
@@ -33,10 +34,15 @@ VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
|
||||
@pytest.mark.parametrize("num_frames", [16])
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
|
||||
video_pruning_rate: float,
|
||||
num_frames: int, dtype: str,
|
||||
max_tokens: int) -> None:
|
||||
def test_qwen2_5_vl_evs_functionality(
|
||||
vllm_runner,
|
||||
video_assets,
|
||||
model,
|
||||
video_pruning_rate: float,
|
||||
num_frames: int,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
"""Test EVS (Efficient Video Sampling) functionality with different
|
||||
pruning rates.
|
||||
"""
|
||||
@@ -51,19 +57,18 @@ def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
|
||||
videos = [sampled_vids[0]]
|
||||
|
||||
# Initialize model with EVS configuration
|
||||
with vllm_runner(model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=1,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"video": 1},
|
||||
tensor_parallel_size=1,
|
||||
video_pruning_rate=video_pruning_rate) as vllm_model:
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=1,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"video": 1},
|
||||
tensor_parallel_size=1,
|
||||
video_pruning_rate=video_pruning_rate,
|
||||
) as vllm_model:
|
||||
# Generate output - this should not crash
|
||||
outputs = vllm_model.generate_greedy(prompts,
|
||||
max_tokens,
|
||||
videos=videos)
|
||||
outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
|
||||
|
||||
# Basic validation that we got a response
|
||||
assert len(outputs) == 1
|
||||
@@ -83,10 +88,15 @@ def test_qwen2_5_vl_evs_functionality(vllm_runner, video_assets, model,
|
||||
@pytest.mark.parametrize("num_frames", [16])
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
def test_qwen2_5_vl_evs_batched_videos(vllm_runner, video_assets, model,
|
||||
video_pruning_rate: float,
|
||||
num_frames: int, dtype: str,
|
||||
max_tokens: int) -> None:
|
||||
def test_qwen2_5_vl_evs_batched_videos(
|
||||
vllm_runner,
|
||||
video_assets,
|
||||
model,
|
||||
video_pruning_rate: float,
|
||||
num_frames: int,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
"""Test EVS functionality with batched videos.
|
||||
|
||||
This test validates that:
|
||||
@@ -102,23 +112,21 @@ def test_qwen2_5_vl_evs_batched_videos(vllm_runner, video_assets, model,
|
||||
|
||||
# Test batched videos
|
||||
prompts = [VIDEO_PROMPTS[0], VIDEO_PROMPTS[0]]
|
||||
videos = [sampled_vids[0],
|
||||
sampled_vids[0]] # Use same video twice for testing
|
||||
videos = [sampled_vids[0], sampled_vids[0]] # Use same video twice for testing
|
||||
|
||||
# Initialize model with EVS configuration
|
||||
with vllm_runner(model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=2,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"video": 2},
|
||||
tensor_parallel_size=1,
|
||||
video_pruning_rate=video_pruning_rate) as vllm_model:
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=2,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"video": 2},
|
||||
tensor_parallel_size=1,
|
||||
video_pruning_rate=video_pruning_rate,
|
||||
) as vllm_model:
|
||||
# Generate output - this should not crash
|
||||
outputs = vllm_model.generate_greedy(prompts,
|
||||
max_tokens,
|
||||
videos=videos)
|
||||
outputs = vllm_model.generate_greedy(prompts, max_tokens, videos=videos)
|
||||
|
||||
# Basic validation that we got responses for both videos
|
||||
assert len(outputs) == 2
|
||||
|
||||
@@ -11,8 +11,13 @@ from PIL import Image
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.video import rescale_video_size, sample_frames_from_video
|
||||
|
||||
from ....conftest import (IMAGE_ASSETS, VIDEO_ASSETS, PromptImageInput,
|
||||
PromptVideoInput, VllmRunner)
|
||||
from ....conftest import (
|
||||
IMAGE_ASSETS,
|
||||
VIDEO_ASSETS,
|
||||
PromptImageInput,
|
||||
PromptVideoInput,
|
||||
VllmRunner,
|
||||
)
|
||||
from ...utils import check_logprobs_close
|
||||
|
||||
|
||||
@@ -34,28 +39,29 @@ def qwen2_vl_chat_template(*query):
|
||||
return f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{''.join(query)}<|im_end|><|im_start|>assistant\n" # noqa: E501
|
||||
|
||||
|
||||
IMAGE_PROMPTS = IMAGE_ASSETS.prompts({
|
||||
"stop_sign":
|
||||
qwen2_vl_chat_template(
|
||||
IMAGE_PLACEHOLDER,
|
||||
"What is the biggest text's content in this image?",
|
||||
),
|
||||
"cherry_blossom":
|
||||
qwen2_vl_chat_template(
|
||||
IMAGE_PLACEHOLDER,
|
||||
"What is the season shown in this image? ",
|
||||
"Reply with a short sentence (no more than 20 words)",
|
||||
),
|
||||
})
|
||||
IMAGE_PROMPTS = IMAGE_ASSETS.prompts(
|
||||
{
|
||||
"stop_sign": qwen2_vl_chat_template(
|
||||
IMAGE_PLACEHOLDER,
|
||||
"What is the biggest text's content in this image?",
|
||||
),
|
||||
"cherry_blossom": qwen2_vl_chat_template(
|
||||
IMAGE_PLACEHOLDER,
|
||||
"What is the season shown in this image? ",
|
||||
"Reply with a short sentence (no more than 20 words)",
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
VIDEO_PROMPTS = VIDEO_ASSETS.prompts({
|
||||
"baby_reading":
|
||||
qwen2_vl_chat_template(
|
||||
VIDEO_PLACEHOLDER,
|
||||
"Describe this video with a short sentence ",
|
||||
"(no more than 20 words)",
|
||||
),
|
||||
})
|
||||
VIDEO_PROMPTS = VIDEO_ASSETS.prompts(
|
||||
{
|
||||
"baby_reading": qwen2_vl_chat_template(
|
||||
VIDEO_PLACEHOLDER,
|
||||
"Describe this video with a short sentence ",
|
||||
"(no more than 20 words)",
|
||||
),
|
||||
}
|
||||
)
|
||||
|
||||
MULTIIMAGE_PROMPT = qwen2_vl_chat_template(
|
||||
IMAGE_PLACEHOLDER,
|
||||
@@ -77,17 +83,19 @@ class Qwen2VLPromptVideoEmbeddingInput(TypedDict):
|
||||
|
||||
|
||||
def batch_make_image_embeddings(
|
||||
image_batches: list[Union[Image.Image, list[Image.Image]]], processor,
|
||||
llm: VllmRunner) -> list[Qwen2VLPromptImageEmbeddingInput]:
|
||||
image_batches: list[Union[Image.Image, list[Image.Image]]],
|
||||
processor,
|
||||
llm: VllmRunner,
|
||||
) -> list[Qwen2VLPromptImageEmbeddingInput]:
|
||||
"""batched image embeddings for Qwen2-VL
|
||||
|
||||
This will infer all images' embeddings in a single batch,
|
||||
This will infer all images' embeddings in a single batch,
|
||||
and split the result according to input batches.
|
||||
|
||||
image_batches:
|
||||
- Single-image batches: `list[Image.Image]`
|
||||
- Multiple-image batches: `list[list[Image.Image]]]`
|
||||
|
||||
|
||||
returns: `list[Qwen2VLPromptImageEmbeddingInput]`
|
||||
"""
|
||||
|
||||
@@ -108,9 +116,9 @@ def batch_make_image_embeddings(
|
||||
# image to pixel values
|
||||
image_processor = processor.image_processor
|
||||
|
||||
preprocess_result = image_processor \
|
||||
.preprocess(images=images, return_tensors="pt") \
|
||||
.data
|
||||
preprocess_result = image_processor.preprocess(
|
||||
images=images, return_tensors="pt"
|
||||
).data
|
||||
pixel_values = preprocess_result["pixel_values"]
|
||||
image_grid_thw = preprocess_result["image_grid_thw"]
|
||||
|
||||
@@ -119,12 +127,13 @@ def batch_make_image_embeddings(
|
||||
with torch.no_grad():
|
||||
visual = model.visual
|
||||
|
||||
pixel_values_on_device = pixel_values.to(visual.device,
|
||||
dtype=visual.dtype)
|
||||
image_grid_thw_on_device = image_grid_thw.to(visual.device,
|
||||
dtype=torch.int64)
|
||||
return visual(pixel_values_on_device,
|
||||
grid_thw=image_grid_thw_on_device).cpu()
|
||||
pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
|
||||
image_grid_thw_on_device = image_grid_thw.to(
|
||||
visual.device, dtype=torch.int64
|
||||
)
|
||||
return visual(
|
||||
pixel_values_on_device, grid_thw=image_grid_thw_on_device
|
||||
).cpu()
|
||||
|
||||
image_embeds = torch.concat(llm.apply_model(get_image_embeds))
|
||||
|
||||
@@ -137,16 +146,21 @@ def batch_make_image_embeddings(
|
||||
merge_size = image_processor.merge_size
|
||||
cur_batch_embed_len = sum(
|
||||
grid_thw.prod(-1) // merge_size // merge_size
|
||||
for grid_thw in image_grid_thw[image_counter:image_counter +
|
||||
cur_batch_image_count])
|
||||
for grid_thw in image_grid_thw[
|
||||
image_counter : image_counter + cur_batch_image_count
|
||||
]
|
||||
)
|
||||
|
||||
result.append({
|
||||
"image_embeds":
|
||||
image_embeds[embed_counter:embed_counter + cur_batch_embed_len],
|
||||
"image_grid_thw":
|
||||
image_grid_thw[image_counter:image_counter +
|
||||
cur_batch_image_count],
|
||||
})
|
||||
result.append(
|
||||
{
|
||||
"image_embeds": image_embeds[
|
||||
embed_counter : embed_counter + cur_batch_embed_len
|
||||
],
|
||||
"image_grid_thw": image_grid_thw[
|
||||
image_counter : image_counter + cur_batch_image_count
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
embed_counter += cur_batch_embed_len
|
||||
image_counter += cur_batch_image_count
|
||||
@@ -160,13 +174,13 @@ def batch_make_image_embeddings(
|
||||
|
||||
|
||||
def batch_make_video_embeddings(
|
||||
video_batches: PromptVideoInput, processor,
|
||||
llm: VllmRunner) -> list[Qwen2VLPromptVideoEmbeddingInput]:
|
||||
video_batches: PromptVideoInput, processor, llm: VllmRunner
|
||||
) -> list[Qwen2VLPromptVideoEmbeddingInput]:
|
||||
"""batched video embeddings for Qwen2-VL
|
||||
|
||||
A NDArray represents a single video's all frames.
|
||||
|
||||
This will infer all videos' embeddings in a single batch,
|
||||
This will infer all videos' embeddings in a single batch,
|
||||
and split the result according to input batches.
|
||||
|
||||
video_batches:
|
||||
@@ -191,9 +205,9 @@ def batch_make_video_embeddings(
|
||||
# video to pixel values
|
||||
image_processor = processor.image_processor
|
||||
|
||||
preprocess_result = image_processor \
|
||||
.preprocess(images=None, videos=videos, return_tensors="pt") \
|
||||
.data
|
||||
preprocess_result = image_processor.preprocess(
|
||||
images=None, videos=videos, return_tensors="pt"
|
||||
).data
|
||||
pixel_values = preprocess_result["pixel_values_videos"]
|
||||
video_grid_thw = preprocess_result["video_grid_thw"]
|
||||
|
||||
@@ -202,12 +216,13 @@ def batch_make_video_embeddings(
|
||||
with torch.no_grad():
|
||||
visual = model.visual
|
||||
|
||||
pixel_values_on_device = pixel_values.to(visual.device,
|
||||
dtype=visual.dtype)
|
||||
video_grid_thw_on_device = video_grid_thw.to(visual.device,
|
||||
dtype=torch.int64)
|
||||
return visual(pixel_values_on_device,
|
||||
grid_thw=video_grid_thw_on_device).cpu()
|
||||
pixel_values_on_device = pixel_values.to(visual.device, dtype=visual.dtype)
|
||||
video_grid_thw_on_device = video_grid_thw.to(
|
||||
visual.device, dtype=torch.int64
|
||||
)
|
||||
return visual(
|
||||
pixel_values_on_device, grid_thw=video_grid_thw_on_device
|
||||
).cpu()
|
||||
|
||||
video_embeds = torch.concat(llm.apply_model(get_image_embeds))
|
||||
|
||||
@@ -220,16 +235,21 @@ def batch_make_video_embeddings(
|
||||
merge_size = image_processor.merge_size
|
||||
cur_batch_embed_len = sum(
|
||||
grid_thw.prod(-1) // merge_size // merge_size
|
||||
for grid_thw in video_grid_thw[video_counter:video_counter +
|
||||
cur_batch_video_count])
|
||||
for grid_thw in video_grid_thw[
|
||||
video_counter : video_counter + cur_batch_video_count
|
||||
]
|
||||
)
|
||||
|
||||
result.append({
|
||||
"video_embeds":
|
||||
video_embeds[embed_counter:embed_counter + cur_batch_embed_len],
|
||||
"video_grid_thw":
|
||||
video_grid_thw[video_counter:video_counter +
|
||||
cur_batch_video_count],
|
||||
})
|
||||
result.append(
|
||||
{
|
||||
"video_embeds": video_embeds[
|
||||
embed_counter : embed_counter + cur_batch_embed_len
|
||||
],
|
||||
"video_grid_thw": video_grid_thw[
|
||||
video_counter : video_counter + cur_batch_video_count
|
||||
],
|
||||
}
|
||||
)
|
||||
|
||||
embed_counter += cur_batch_embed_len
|
||||
video_counter += cur_batch_video_count
|
||||
@@ -263,25 +283,24 @@ def run_embedding_input_test(
|
||||
|
||||
# max_model_len should be greater than image_feature_size
|
||||
with vllm_runner(
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=3,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={
|
||||
"image": mm_limit,
|
||||
"video": mm_limit
|
||||
},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
default_torch_num_threads=1,
|
||||
model,
|
||||
runner="generate",
|
||||
max_model_len=4000,
|
||||
max_num_seqs=3,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt={"image": mm_limit, "video": mm_limit},
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
default_torch_num_threads=1,
|
||||
) as vllm_model:
|
||||
outputs_per_case_for_original_input = [
|
||||
vllm_model.generate_greedy_logprobs(prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images or None,
|
||||
videos=videos or None)
|
||||
vllm_model.generate_greedy_logprobs(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=images or None,
|
||||
videos=videos or None,
|
||||
)
|
||||
for prompts, images, videos in inputs
|
||||
]
|
||||
|
||||
@@ -290,17 +309,19 @@ def run_embedding_input_test(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
images=batch_make_image_embeddings(
|
||||
images, processor, vllm_model) if images else None,
|
||||
videos=batch_make_video_embeddings(
|
||||
videos, processor, vllm_model) if videos else None)
|
||||
images=batch_make_image_embeddings(images, processor, vllm_model)
|
||||
if images
|
||||
else None,
|
||||
videos=batch_make_video_embeddings(videos, processor, vllm_model)
|
||||
if videos
|
||||
else None,
|
||||
)
|
||||
for prompts, images, videos in inputs
|
||||
]
|
||||
|
||||
for outputs_for_original_input, \
|
||||
outputs_for_embeddings_input \
|
||||
in zip(outputs_per_case_for_original_input,
|
||||
outputs_per_case_for_embeddings_input):
|
||||
for outputs_for_original_input, outputs_for_embeddings_input in zip(
|
||||
outputs_per_case_for_original_input, outputs_per_case_for_embeddings_input
|
||||
):
|
||||
check_logprobs_close(
|
||||
outputs_0_lst=outputs_for_original_input,
|
||||
outputs_1_lst=outputs_for_embeddings_input,
|
||||
@@ -325,17 +346,26 @@ def run_embedding_input_test(
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
|
||||
size_factors, dtype, max_tokens,
|
||||
num_logprobs, monkeypatch) -> None:
|
||||
def test_qwen2_vl_image_embeddings_input(
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype,
|
||||
max_tokens,
|
||||
num_logprobs,
|
||||
monkeypatch,
|
||||
) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_case: list[tuple[
|
||||
list[str], PromptImageInput, PromptVideoInput]] = [(
|
||||
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
|
||||
(
|
||||
[prompt for _ in size_factors],
|
||||
[rescale_image_size(image, factor) for factor in size_factors],
|
||||
[],
|
||||
) for image, prompt in zip(images, IMAGE_PROMPTS)]
|
||||
)
|
||||
for image, prompt in zip(images, IMAGE_PROMPTS)
|
||||
]
|
||||
|
||||
run_embedding_input_test(
|
||||
vllm_runner,
|
||||
@@ -366,21 +396,27 @@ def test_qwen2_vl_image_embeddings_input(vllm_runner, image_assets, model,
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
|
||||
model, size_factors,
|
||||
dtype: str, max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
def test_qwen2_vl_multiple_image_embeddings_input(
|
||||
vllm_runner,
|
||||
image_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
images = [asset.pil_image for asset in image_assets]
|
||||
|
||||
inputs_per_case: list[tuple[list[str], PromptImageInput,
|
||||
PromptVideoInput]] = [(
|
||||
[MULTIIMAGE_PROMPT for _ in size_factors],
|
||||
[[
|
||||
rescale_image_size(image, factor)
|
||||
for image in images
|
||||
] for factor in size_factors],
|
||||
[],
|
||||
)]
|
||||
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
|
||||
(
|
||||
[MULTIIMAGE_PROMPT for _ in size_factors],
|
||||
[
|
||||
[rescale_image_size(image, factor) for image in images]
|
||||
for factor in size_factors
|
||||
],
|
||||
[],
|
||||
)
|
||||
]
|
||||
|
||||
run_embedding_input_test(
|
||||
vllm_runner,
|
||||
@@ -410,22 +446,29 @@ def test_qwen2_vl_multiple_image_embeddings_input(vllm_runner, image_assets,
|
||||
@pytest.mark.parametrize("dtype", [target_dtype])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [10])
|
||||
def test_qwen2_vl_video_embeddings_input(vllm_runner, video_assets, model,
|
||||
size_factors, dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
def test_qwen2_vl_video_embeddings_input(
|
||||
vllm_runner,
|
||||
video_assets,
|
||||
model,
|
||||
size_factors,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
num_frames = 4
|
||||
sampled_vids = [
|
||||
sample_frames_from_video(asset.np_ndarrays, num_frames)
|
||||
for asset in video_assets
|
||||
]
|
||||
|
||||
inputs_per_case: list[tuple[
|
||||
list[str], PromptImageInput, PromptVideoInput]] = [(
|
||||
inputs_per_case: list[tuple[list[str], PromptImageInput, PromptVideoInput]] = [
|
||||
(
|
||||
[prompt for _ in size_factors],
|
||||
[],
|
||||
[rescale_video_size(video, factor) for factor in size_factors],
|
||||
) for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)]
|
||||
)
|
||||
for video, prompt in zip(sampled_vids, VIDEO_PROMPTS)
|
||||
]
|
||||
|
||||
run_embedding_input_test(
|
||||
vllm_runner,
|
||||
|
||||
@@ -15,12 +15,12 @@ from ...registry import HF_EXAMPLE_MODELS
|
||||
|
||||
MODEL_NAME = "fixie-ai/ultravox-v0_5-llama-3_2-1b"
|
||||
|
||||
AUDIO_PROMPTS = AUDIO_ASSETS.prompts({
|
||||
"mary_had_lamb":
|
||||
"Transcribe this into English.",
|
||||
"winning_call":
|
||||
"What is happening in this audio clip?",
|
||||
})
|
||||
AUDIO_PROMPTS = AUDIO_ASSETS.prompts(
|
||||
{
|
||||
"mary_had_lamb": "Transcribe this into English.",
|
||||
"winning_call": "What is happening in this audio clip?",
|
||||
}
|
||||
)
|
||||
|
||||
MULTI_AUDIO_PROMPT = "Describe each of the audios above."
|
||||
|
||||
@@ -33,7 +33,7 @@ CHUNKED_PREFILL_KWARGS = {
|
||||
"enable_chunked_prefill": True,
|
||||
"max_num_seqs": 2,
|
||||
# Use a very small limit to exercise chunked prefill.
|
||||
"max_num_batched_tokens": 16
|
||||
"max_num_batched_tokens": 16,
|
||||
}
|
||||
|
||||
|
||||
@@ -43,27 +43,33 @@ def params_kwargs_to_cli_args(params_kwargs: dict[str, Any]) -> list[str]:
|
||||
for key, value in params_kwargs.items():
|
||||
if isinstance(value, bool):
|
||||
if value:
|
||||
args.append(f"--{key.replace('_','-')}")
|
||||
args.append(f"--{key.replace('_', '-')}")
|
||||
else:
|
||||
args.append(f"--{key.replace('_','-')}={value}")
|
||||
args.append(f"--{key.replace('_', '-')}={value}")
|
||||
return args
|
||||
|
||||
|
||||
@pytest.fixture(params=[
|
||||
pytest.param({}, marks=pytest.mark.cpu_model),
|
||||
pytest.param(CHUNKED_PREFILL_KWARGS),
|
||||
])
|
||||
@pytest.fixture(
|
||||
params=[
|
||||
pytest.param({}, marks=pytest.mark.cpu_model),
|
||||
pytest.param(CHUNKED_PREFILL_KWARGS),
|
||||
]
|
||||
)
|
||||
def server(request, audio_assets: AudioTestAssets):
|
||||
args = [
|
||||
"--dtype", "bfloat16", "--max-model-len", "4096", "--enforce-eager",
|
||||
"--dtype",
|
||||
"bfloat16",
|
||||
"--max-model-len",
|
||||
"4096",
|
||||
"--enforce-eager",
|
||||
"--limit-mm-per-prompt",
|
||||
json.dumps({"audio": len(audio_assets)}), "--trust-remote-code"
|
||||
json.dumps({"audio": len(audio_assets)}),
|
||||
"--trust-remote-code",
|
||||
] + params_kwargs_to_cli_args(request.param)
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME,
|
||||
args,
|
||||
env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
|
||||
"30"}) as remote_server:
|
||||
with RemoteOpenAIServer(
|
||||
MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
|
||||
) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@@ -77,12 +83,11 @@ def _get_prompt(audio_count, question, placeholder):
|
||||
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
|
||||
placeholder = f"{placeholder}\n" * audio_count
|
||||
|
||||
return tokenizer.apply_chat_template([{
|
||||
'role': 'user',
|
||||
'content': f"{placeholder}{question}"
|
||||
}],
|
||||
tokenize=False,
|
||||
add_generation_prompt=True)
|
||||
return tokenizer.apply_chat_template(
|
||||
[{"role": "user", "content": f"{placeholder}{question}"}],
|
||||
tokenize=False,
|
||||
add_generation_prompt=True,
|
||||
)
|
||||
|
||||
|
||||
def run_multi_audio_test(
|
||||
@@ -99,19 +104,21 @@ def run_multi_audio_test(
|
||||
model_info.check_available_online(on_fail="skip")
|
||||
model_info.check_transformers_version(on_fail="skip")
|
||||
|
||||
with vllm_runner(model,
|
||||
dtype=dtype,
|
||||
enforce_eager=True,
|
||||
limit_mm_per_prompt={
|
||||
"audio":
|
||||
max((len(audio) for _, audio in prompts_and_audios))
|
||||
},
|
||||
**kwargs) as vllm_model:
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype=dtype,
|
||||
enforce_eager=True,
|
||||
limit_mm_per_prompt={
|
||||
"audio": max((len(audio) for _, audio in prompts_and_audios))
|
||||
},
|
||||
**kwargs,
|
||||
) as vllm_model:
|
||||
vllm_outputs = vllm_model.generate_greedy_logprobs(
|
||||
[prompt for prompt, _ in prompts_and_audios],
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
audios=[audios for _, audios in prompts_and_audios])
|
||||
audios=[audios for _, audios in prompts_and_audios],
|
||||
)
|
||||
|
||||
# The HuggingFace model doesn't support multiple audios yet, so
|
||||
# just assert that some tokens were generated.
|
||||
@@ -122,21 +129,25 @@ def run_multi_audio_test(
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
@pytest.mark.parametrize("vllm_kwargs", [
|
||||
pytest.param({}, marks=pytest.mark.cpu_model),
|
||||
pytest.param(CHUNKED_PREFILL_KWARGS),
|
||||
])
|
||||
def test_models_with_multiple_audios(vllm_runner,
|
||||
audio_assets: AudioTestAssets, dtype: str,
|
||||
max_tokens: int, num_logprobs: int,
|
||||
vllm_kwargs: dict) -> None:
|
||||
|
||||
vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT,
|
||||
VLLM_PLACEHOLDER)
|
||||
@pytest.mark.parametrize(
|
||||
"vllm_kwargs",
|
||||
[
|
||||
pytest.param({}, marks=pytest.mark.cpu_model),
|
||||
pytest.param(CHUNKED_PREFILL_KWARGS),
|
||||
],
|
||||
)
|
||||
def test_models_with_multiple_audios(
|
||||
vllm_runner,
|
||||
audio_assets: AudioTestAssets,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
vllm_kwargs: dict,
|
||||
) -> None:
|
||||
vllm_prompt = _get_prompt(len(audio_assets), MULTI_AUDIO_PROMPT, VLLM_PLACEHOLDER)
|
||||
run_multi_audio_test(
|
||||
vllm_runner,
|
||||
[(vllm_prompt, [audio.audio_and_sample_rate
|
||||
for audio in audio_assets])],
|
||||
[(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
|
||||
MODEL_NAME,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
@@ -149,28 +160,25 @@ def test_models_with_multiple_audios(vllm_runner,
|
||||
async def test_online_serving(client, audio_assets: AudioTestAssets):
|
||||
"""Exercises online serving with/without chunked prefill enabled."""
|
||||
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
*[{
|
||||
"type": "audio_url",
|
||||
"audio_url": {
|
||||
"url": audio.url
|
||||
}
|
||||
} for audio in audio_assets],
|
||||
{
|
||||
"type":
|
||||
"text",
|
||||
"text":
|
||||
f"What's happening in these {len(audio_assets)} audio clips?"
|
||||
},
|
||||
],
|
||||
}]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*[
|
||||
{"type": "audio_url", "audio_url": {"url": audio.url}}
|
||||
for audio in audio_assets
|
||||
],
|
||||
{
|
||||
"type": "text",
|
||||
"text": f"What's happening in these {len(audio_assets)} audio clips?",
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
chat_completion = await client.chat.completions.create(model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_tokens=10)
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME, messages=messages, max_tokens=10
|
||||
)
|
||||
|
||||
assert len(chat_completion.choices) == 1
|
||||
choice = chat_completion.choices[0]
|
||||
|
||||
@@ -6,8 +6,12 @@ import json
|
||||
import pytest
|
||||
import pytest_asyncio
|
||||
from mistral_common.audio import Audio
|
||||
from mistral_common.protocol.instruct.messages import (AudioChunk, RawAudio,
|
||||
TextChunk, UserMessage)
|
||||
from mistral_common.protocol.instruct.messages import (
|
||||
AudioChunk,
|
||||
RawAudio,
|
||||
TextChunk,
|
||||
UserMessage,
|
||||
)
|
||||
|
||||
from vllm.transformers_utils.tokenizer import MistralTokenizer
|
||||
|
||||
@@ -17,8 +21,12 @@ from .test_ultravox import MULTI_AUDIO_PROMPT, run_multi_audio_test
|
||||
|
||||
MODEL_NAME = "mistralai/Voxtral-Mini-3B-2507"
|
||||
MISTRAL_FORMAT_ARGS = [
|
||||
"--tokenizer_mode", "mistral", "--config_format", "mistral",
|
||||
"--load_format", "mistral"
|
||||
"--tokenizer_mode",
|
||||
"mistral",
|
||||
"--config_format",
|
||||
"mistral",
|
||||
"--load_format",
|
||||
"mistral",
|
||||
]
|
||||
|
||||
|
||||
@@ -30,10 +38,9 @@ def server(request, audio_assets: AudioTestAssets):
|
||||
json.dumps({"audio": len(audio_assets)}),
|
||||
] + MISTRAL_FORMAT_ARGS
|
||||
|
||||
with RemoteOpenAIServer(MODEL_NAME,
|
||||
args,
|
||||
env_dict={"VLLM_AUDIO_FETCH_TIMEOUT":
|
||||
"30"}) as remote_server:
|
||||
with RemoteOpenAIServer(
|
||||
MODEL_NAME, args, env_dict={"VLLM_AUDIO_FETCH_TIMEOUT": "30"}
|
||||
) as remote_server:
|
||||
yield remote_server
|
||||
|
||||
|
||||
@@ -64,15 +71,17 @@ def _get_prompt(audio_assets, question):
|
||||
@pytest.mark.parametrize("dtype", ["half"])
|
||||
@pytest.mark.parametrize("max_tokens", [128])
|
||||
@pytest.mark.parametrize("num_logprobs", [5])
|
||||
def test_models_with_multiple_audios(vllm_runner,
|
||||
audio_assets: AudioTestAssets, dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int) -> None:
|
||||
def test_models_with_multiple_audios(
|
||||
vllm_runner,
|
||||
audio_assets: AudioTestAssets,
|
||||
dtype: str,
|
||||
max_tokens: int,
|
||||
num_logprobs: int,
|
||||
) -> None:
|
||||
vllm_prompt = _get_prompt(audio_assets, MULTI_AUDIO_PROMPT)
|
||||
run_multi_audio_test(
|
||||
vllm_runner,
|
||||
[(vllm_prompt, [audio.audio_and_sample_rate
|
||||
for audio in audio_assets])],
|
||||
[(vllm_prompt, [audio.audio_and_sample_rate for audio in audio_assets])],
|
||||
MODEL_NAME,
|
||||
dtype=dtype,
|
||||
max_tokens=max_tokens,
|
||||
@@ -92,23 +101,22 @@ async def test_online_serving(client, audio_assets: AudioTestAssets):
|
||||
return audio_dict
|
||||
|
||||
audio_chunks = [asset_to_chunk(asset) for asset in audio_assets]
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
*audio_chunks,
|
||||
{
|
||||
"type":
|
||||
"text",
|
||||
"text":
|
||||
f"What's happening in these {len(audio_assets)} audio clips?"
|
||||
},
|
||||
],
|
||||
}]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*audio_chunks,
|
||||
{
|
||||
"type": "text",
|
||||
"text": f"What's happening in these {len(audio_assets)} audio clips?",
|
||||
},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
chat_completion = await client.chat.completions.create(model=MODEL_NAME,
|
||||
messages=messages,
|
||||
max_tokens=10)
|
||||
chat_completion = await client.chat.completions.create(
|
||||
model=MODEL_NAME, messages=messages, max_tokens=10
|
||||
)
|
||||
|
||||
assert len(chat_completion.choices) == 1
|
||||
choice = chat_completion.choices[0]
|
||||
|
||||
@@ -12,8 +12,7 @@ from ....utils import create_new_process_for_each_test, multi_gpu_test
|
||||
|
||||
PROMPTS = [
|
||||
{
|
||||
"prompt":
|
||||
"<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
|
||||
"prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
|
||||
"multi_modal_data": {
|
||||
"audio": AudioAsset("mary_had_lamb").audio_and_sample_rate,
|
||||
},
|
||||
@@ -25,9 +24,8 @@ PROMPTS = [
|
||||
"audio": AudioAsset("winning_call").audio_and_sample_rate,
|
||||
},
|
||||
},
|
||||
"decoder_prompt":
|
||||
"<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
|
||||
}
|
||||
"decoder_prompt": "<|startoftranscript|><|en|><|transcribe|><|notimestamps|>",
|
||||
},
|
||||
]
|
||||
|
||||
EXPECTED = {
|
||||
@@ -41,7 +39,7 @@ EXPECTED = {
|
||||
" is June and the third base. They're going to wave him in. The throw"
|
||||
" to the plate will be late. The Mariners are going to play for the"
|
||||
" American League Championship. I don't believe it. It just continues"
|
||||
" by all five."
|
||||
" by all five.",
|
||||
],
|
||||
"openai/whisper-small": [
|
||||
" The first words I spoke in the original pornograph. A little piece"
|
||||
@@ -51,7 +49,7 @@ EXPECTED = {
|
||||
" comes joy. Here is Junior to third base. They're gonna wave him"
|
||||
" in. The throw to the plate will be late. The Mariners are going to"
|
||||
" play for the American League Championship. I don't believe it. It"
|
||||
" just continues. My, oh my."
|
||||
" just continues. My, oh my.",
|
||||
],
|
||||
"openai/whisper-medium": [
|
||||
" The first words I spoke in the original phonograph, a little piece"
|
||||
@@ -62,7 +60,7 @@ EXPECTED = {
|
||||
" Jorgen at third base. They're going to wave him in. The throw to the"
|
||||
" plate will be late. The Mariners are going to play for the American"
|
||||
" League Championship. I don't believe it. It just continues. My, oh"
|
||||
" my."
|
||||
" my.",
|
||||
],
|
||||
"openai/whisper-large-v3": [
|
||||
" The first words I spoke in the original phonograph, a little piece"
|
||||
@@ -73,7 +71,7 @@ EXPECTED = {
|
||||
" Junior to third base. They're going to wave him in. The throw to the"
|
||||
" plate will be late. The Mariners are going to play for the American"
|
||||
" League Championship. I don't believe it. It just continues. My, oh,"
|
||||
" my."
|
||||
" my.",
|
||||
],
|
||||
"openai/whisper-large-v3-turbo": [
|
||||
" The first words I spoke in the original phonograph, a little piece"
|
||||
@@ -84,8 +82,8 @@ EXPECTED = {
|
||||
" Junior to third base. They're going to wave him in. The throw to the"
|
||||
" plate will be late. The Mariners are going to play for the American"
|
||||
" League Championship. I don't believe it. It just continues. My, oh,"
|
||||
" my."
|
||||
]
|
||||
" my.",
|
||||
],
|
||||
}
|
||||
|
||||
|
||||
@@ -100,11 +98,11 @@ def run_test(
|
||||
expected_list = EXPECTED[model] * 10
|
||||
|
||||
with vllm_runner(
|
||||
model,
|
||||
dtype="half",
|
||||
max_model_len=448,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
model,
|
||||
dtype="half",
|
||||
max_model_len=448,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
) as vllm_model:
|
||||
llm = vllm_model.llm
|
||||
|
||||
|
||||
@@ -1,7 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Helpers for building inputs that can be leveraged for different test types.
|
||||
"""
|
||||
"""Helpers for building inputs that can be leveraged for different test types."""
|
||||
|
||||
from collections.abc import Iterable
|
||||
from pathlib import PosixPath
|
||||
from typing import Callable, Optional, Union
|
||||
@@ -10,20 +10,30 @@ import torch
|
||||
|
||||
from vllm.multimodal.audio import AudioResampler
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.video import (rescale_video_size, resize_video,
|
||||
sample_frames_from_video)
|
||||
from vllm.multimodal.video import (
|
||||
rescale_video_size,
|
||||
resize_video,
|
||||
sample_frames_from_video,
|
||||
)
|
||||
|
||||
from .....conftest import AudioTestAssets, ImageTestAssets, VideoTestAssets
|
||||
from .types import (SINGLE_AUDIO_BASE_PROMPT, SINGLE_IMAGE_BASE_PROMPTS,
|
||||
TEST_AUDIO_PLACEHOLDER, TEST_IMG_PLACEHOLDER,
|
||||
TEST_VIDEO_PLACEHOLDER, VIDEO_BASE_PROMPT,
|
||||
ImageSizeWrapper, PromptWithMultiModalInput, SizeType,
|
||||
VLMTestInfo)
|
||||
from .types import (
|
||||
SINGLE_AUDIO_BASE_PROMPT,
|
||||
SINGLE_IMAGE_BASE_PROMPTS,
|
||||
TEST_AUDIO_PLACEHOLDER,
|
||||
TEST_IMG_PLACEHOLDER,
|
||||
TEST_VIDEO_PLACEHOLDER,
|
||||
VIDEO_BASE_PROMPT,
|
||||
ImageSizeWrapper,
|
||||
PromptWithMultiModalInput,
|
||||
SizeType,
|
||||
VLMTestInfo,
|
||||
)
|
||||
|
||||
|
||||
def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int],
|
||||
str],
|
||||
test_placeholder: str) -> str:
|
||||
def replace_test_placeholder(
|
||||
prompt: str, mm_idx_to_prompt: Callable[[int], str], test_placeholder: str
|
||||
) -> str:
|
||||
"""Given a prompt, replaces each test placeholder with the
|
||||
model-specific tag.
|
||||
"""
|
||||
@@ -35,11 +45,13 @@ def replace_test_placeholder(prompt: str, mm_idx_to_prompt: Callable[[int],
|
||||
return img_prompt
|
||||
|
||||
|
||||
def get_model_prompts(base_prompts: Iterable[str],
|
||||
img_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
video_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
audio_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
prompt_formatter: Callable[[str], str]) -> list[str]:
|
||||
def get_model_prompts(
|
||||
base_prompts: Iterable[str],
|
||||
img_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
video_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
audio_idx_to_prompt: Optional[Callable[[int], str]],
|
||||
prompt_formatter: Callable[[str], str],
|
||||
) -> list[str]:
|
||||
"""Given a model-agnostic base prompt and test configuration for a model(s)
|
||||
to be tested, update the media placeholders and apply the prompt formatting
|
||||
to get the test prompt string for this model.
|
||||
@@ -56,19 +68,19 @@ def get_model_prompts(base_prompts: Iterable[str],
|
||||
# Replace the multimodal placeholders in the base prompt with
|
||||
# the correct ones for the model that we are testing
|
||||
if img_idx_to_prompt:
|
||||
base_prompt = replace_test_placeholder(base_prompt,
|
||||
img_idx_to_prompt,
|
||||
TEST_IMG_PLACEHOLDER)
|
||||
base_prompt = replace_test_placeholder(
|
||||
base_prompt, img_idx_to_prompt, TEST_IMG_PLACEHOLDER
|
||||
)
|
||||
|
||||
if video_idx_to_prompt:
|
||||
base_prompt = replace_test_placeholder(base_prompt,
|
||||
video_idx_to_prompt,
|
||||
TEST_VIDEO_PLACEHOLDER)
|
||||
base_prompt = replace_test_placeholder(
|
||||
base_prompt, video_idx_to_prompt, TEST_VIDEO_PLACEHOLDER
|
||||
)
|
||||
|
||||
if audio_idx_to_prompt:
|
||||
base_prompt = replace_test_placeholder(base_prompt,
|
||||
audio_idx_to_prompt,
|
||||
TEST_AUDIO_PLACEHOLDER)
|
||||
base_prompt = replace_test_placeholder(
|
||||
base_prompt, audio_idx_to_prompt, TEST_AUDIO_PLACEHOLDER
|
||||
)
|
||||
|
||||
# Apply the prompt formatter to wrap the base prompt with
|
||||
# the correct media placeholders to get the model test prompt
|
||||
@@ -84,14 +96,15 @@ def build_single_image_inputs_from_test_info(
|
||||
tmp_path: Optional[PosixPath] = None,
|
||||
) -> list[PromptWithMultiModalInput]:
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError(
|
||||
"Prompt formatter must be set to build single image inputs")
|
||||
raise ValueError("Prompt formatter must be set to build single image inputs")
|
||||
|
||||
model_prompts = get_model_prompts(test_info.single_image_prompts,
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.audio_idx_to_prompt,
|
||||
test_info.prompt_formatter)
|
||||
model_prompts = get_model_prompts(
|
||||
test_info.single_image_prompts,
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.audio_idx_to_prompt,
|
||||
test_info.prompt_formatter,
|
||||
)
|
||||
|
||||
# For models that require a local path / URL encoded in the image; export
|
||||
# assets and encode into tmp_path for this test. This should be avoided
|
||||
@@ -110,8 +123,8 @@ def build_single_image_inputs_from_test_info(
|
||||
|
||||
|
||||
def build_single_image_inputs(
|
||||
images, model_prompts,
|
||||
size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
|
||||
images, model_prompts, size_wrapper: ImageSizeWrapper
|
||||
) -> list[PromptWithMultiModalInput]:
|
||||
# For every image / prompt pair, get a pair containing two lists of
|
||||
# length size_factors, where the first contains duplicates of the model
|
||||
# prompt [str], and the second contains copies of the image after being
|
||||
@@ -125,7 +138,8 @@ def build_single_image_inputs(
|
||||
apply_image_size_scaling(image, size, size_wrapper.type)
|
||||
for size in size_wrapper.data
|
||||
],
|
||||
) for image, prompt in zip(images, model_prompts)
|
||||
)
|
||||
for image, prompt in zip(images, model_prompts)
|
||||
]
|
||||
|
||||
|
||||
@@ -136,14 +150,15 @@ def build_multi_image_inputs_from_test_info(
|
||||
tmp_path: Optional[PosixPath] = None,
|
||||
) -> list[PromptWithMultiModalInput]:
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError(
|
||||
"Prompt formatter must be set to build multi image inputs")
|
||||
raise ValueError("Prompt formatter must be set to build multi image inputs")
|
||||
|
||||
model_prompts = get_model_prompts([test_info.multi_image_prompt],
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.audio_idx_to_prompt,
|
||||
test_info.prompt_formatter)
|
||||
model_prompts = get_model_prompts(
|
||||
[test_info.multi_image_prompt],
|
||||
test_info.img_idx_to_prompt,
|
||||
test_info.video_idx_to_prompt,
|
||||
test_info.audio_idx_to_prompt,
|
||||
test_info.prompt_formatter,
|
||||
)
|
||||
|
||||
if test_info.prompt_path_encoder is not None:
|
||||
if tmp_path is None:
|
||||
@@ -164,16 +179,20 @@ def build_multi_image_inputs_from_test_info(
|
||||
|
||||
|
||||
def build_multi_image_inputs(
|
||||
image_lists, model_prompts,
|
||||
size_wrapper: ImageSizeWrapper) -> list[PromptWithMultiModalInput]:
|
||||
image_lists, model_prompts, size_wrapper: ImageSizeWrapper
|
||||
) -> list[PromptWithMultiModalInput]:
|
||||
return [
|
||||
PromptWithMultiModalInput(
|
||||
prompts=[prompt for _ in size_wrapper.data],
|
||||
image_data=[[
|
||||
apply_image_size_scaling(image, size, size_wrapper.type)
|
||||
for image in images
|
||||
] for size in size_wrapper.data],
|
||||
) for images, prompt in zip(image_lists, model_prompts)
|
||||
image_data=[
|
||||
[
|
||||
apply_image_size_scaling(image, size, size_wrapper.type)
|
||||
for image in images
|
||||
]
|
||||
for size in size_wrapper.data
|
||||
],
|
||||
)
|
||||
for images, prompt in zip(image_lists, model_prompts)
|
||||
]
|
||||
|
||||
|
||||
@@ -185,10 +204,10 @@ def build_embedding_inputs_from_test_info(
|
||||
# These conditions will always be true if invoked through filtering,
|
||||
# but we still check them in case this is ever called directly
|
||||
if test_info.prompt_formatter is None:
|
||||
raise ValueError(
|
||||
"Prompt formatter must be set to build image embedding inputs")
|
||||
if size_wrapper.type != SizeType.SIZE_FACTOR or not \
|
||||
all(factor == 1.0 for factor in size_wrapper.data):
|
||||
raise ValueError("Prompt formatter must be set to build image embedding inputs")
|
||||
if size_wrapper.type != SizeType.SIZE_FACTOR or not all(
|
||||
factor == 1.0 for factor in size_wrapper.data
|
||||
):
|
||||
raise ValueError("Embedding tests require constant (1.0) size factors")
|
||||
if test_info.convert_assets_to_embeddings is None:
|
||||
raise ValueError("No conversion func for getting embeddings found")
|
||||
@@ -209,8 +228,7 @@ def build_embedding_inputs_from_test_info(
|
||||
assert len(images) == len(model_prompts)
|
||||
|
||||
inputs = build_single_image_inputs(images, model_prompts, size_wrapper)
|
||||
vllm_embeddings = build_single_image_inputs(embeds, model_prompts,
|
||||
size_wrapper)
|
||||
vllm_embeddings = build_single_image_inputs(embeds, model_prompts, size_wrapper)
|
||||
return inputs, vllm_embeddings
|
||||
|
||||
|
||||
@@ -235,21 +253,22 @@ def build_video_inputs_from_test_info(
|
||||
for asset in video_assets
|
||||
]
|
||||
|
||||
video_scaler = (resize_video if size_wrapper.type == SizeType.FIXED_SIZE
|
||||
else rescale_video_size)
|
||||
video_scaler = (
|
||||
resize_video if size_wrapper.type == SizeType.FIXED_SIZE else rescale_video_size
|
||||
)
|
||||
|
||||
return [
|
||||
PromptWithMultiModalInput(
|
||||
prompts=[prompt for _ in size_wrapper.data],
|
||||
video_data=[
|
||||
video_scaler(video, size) for size in size_wrapper.data
|
||||
],
|
||||
) for video, prompt in zip(sampled_vids, model_prompts)
|
||||
video_data=[video_scaler(video, size) for size in size_wrapper.data],
|
||||
)
|
||||
for video, prompt in zip(sampled_vids, model_prompts)
|
||||
]
|
||||
|
||||
|
||||
def apply_image_size_scaling(image, size: Union[float, tuple[int, int]],
|
||||
size_type: SizeType):
|
||||
def apply_image_size_scaling(
|
||||
image, size: Union[float, tuple[int, int]], size_type: SizeType
|
||||
):
|
||||
"""Applies a size scaler to one image; this can be an image size factor,
|
||||
which scales the image while maintaining the aspect ratio"""
|
||||
# Special case for embeddings; if it's a tensor, it's only valid if we
|
||||
@@ -285,13 +304,16 @@ def build_audio_inputs_from_test_info(
|
||||
method="librosa",
|
||||
)
|
||||
audios = [asset.audio_and_sample_rate for asset in audio_assets]
|
||||
resampled_audios = [(
|
||||
resampler.resample(
|
||||
audio,
|
||||
orig_sr=sr,
|
||||
),
|
||||
int(resampler.target_sr),
|
||||
) for audio, sr in audios]
|
||||
resampled_audios = [
|
||||
(
|
||||
resampler.resample(
|
||||
audio,
|
||||
orig_sr=sr,
|
||||
),
|
||||
int(resampler.target_sr),
|
||||
)
|
||||
for audio, sr in audios
|
||||
]
|
||||
|
||||
return [
|
||||
PromptWithMultiModalInput(
|
||||
|
||||
@@ -4,19 +4,28 @@
|
||||
modality, getting all combinations (similar to pytest's parametrization),
|
||||
handling multimodal placeholder substitution, and so on.
|
||||
"""
|
||||
|
||||
import itertools
|
||||
from collections import OrderedDict
|
||||
from collections.abc import Iterable
|
||||
|
||||
import pytest
|
||||
|
||||
from .types import (EMBEDDING_SIZE_FACTORS, ExpandableVLMTestArgs,
|
||||
ImageSizeWrapper, SizeType, VLMTestInfo, VLMTestType)
|
||||
from .types import (
|
||||
EMBEDDING_SIZE_FACTORS,
|
||||
ExpandableVLMTestArgs,
|
||||
ImageSizeWrapper,
|
||||
SizeType,
|
||||
VLMTestInfo,
|
||||
VLMTestType,
|
||||
)
|
||||
|
||||
|
||||
def get_filtered_test_settings(
|
||||
test_settings: dict[str, VLMTestInfo], test_type: VLMTestType,
|
||||
new_proc_per_test: bool) -> dict[str, VLMTestInfo]:
|
||||
test_settings: dict[str, VLMTestInfo],
|
||||
test_type: VLMTestType,
|
||||
new_proc_per_test: bool,
|
||||
) -> dict[str, VLMTestInfo]:
|
||||
"""Given the dict of potential test settings to run, return a subdict
|
||||
of tests who have the current test type enabled with the matching val for
|
||||
fork_per_test.
|
||||
@@ -25,7 +34,8 @@ def get_filtered_test_settings(
|
||||
def matches_test_type(test_info: VLMTestInfo, test_type: VLMTestType):
|
||||
return test_info.test_type == test_type or (
|
||||
isinstance(test_info.test_type, Iterable)
|
||||
and test_type in test_info.test_type)
|
||||
and test_type in test_info.test_type
|
||||
)
|
||||
|
||||
matching_tests = {}
|
||||
for test_name, test_info in test_settings.items():
|
||||
@@ -36,62 +46,69 @@ def get_filtered_test_settings(
|
||||
assert test_info.convert_assets_to_embeddings is not None
|
||||
# Custom test inputs need to explicitly define the mm limit/inputs
|
||||
if matches_test_type(test_info, VLMTestType.CUSTOM_INPUTS):
|
||||
assert (test_info.custom_test_opts is not None
|
||||
and isinstance(test_info.custom_test_opts, Iterable))
|
||||
assert test_info.custom_test_opts is not None and isinstance(
|
||||
test_info.custom_test_opts, Iterable
|
||||
)
|
||||
# For all types besides custom inputs, we need a prompt formatter
|
||||
else:
|
||||
assert test_info.prompt_formatter is not None
|
||||
|
||||
# Everything looks okay; keep if this is correct proc handling
|
||||
if (test_info.distributed_executor_backend
|
||||
is not None) == new_proc_per_test:
|
||||
if (
|
||||
test_info.distributed_executor_backend is not None
|
||||
) == new_proc_per_test:
|
||||
matching_tests[test_name] = test_info
|
||||
|
||||
return matching_tests
|
||||
|
||||
|
||||
def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
|
||||
test_type: VLMTestType,
|
||||
create_new_process_for_each_test: bool):
|
||||
def get_parametrized_options(
|
||||
test_settings: dict[str, VLMTestInfo],
|
||||
test_type: VLMTestType,
|
||||
create_new_process_for_each_test: bool,
|
||||
):
|
||||
"""Converts all of our VLMTestInfo into an expanded list of parameters.
|
||||
This is similar to nesting pytest parametrize calls, but done directly
|
||||
through an itertools product so that each test can set things like
|
||||
size factors etc, while still running in isolated test cases.
|
||||
"""
|
||||
matching_tests = get_filtered_test_settings(
|
||||
test_settings, test_type, create_new_process_for_each_test)
|
||||
test_settings, test_type, create_new_process_for_each_test
|
||||
)
|
||||
|
||||
# Ensure that something is wrapped as an iterable it's not already
|
||||
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e, )
|
||||
ensure_wrapped = lambda e: e if isinstance(e, (list, tuple)) else (e,)
|
||||
|
||||
def get_model_type_cases(model_type: str, test_info: VLMTestInfo):
|
||||
# This is essentially the same as nesting a bunch of mark.parametrize
|
||||
# decorators, but we do it programmatically to allow overrides for on
|
||||
# a per-model basis, while still being able to execute each of these
|
||||
# as individual test cases in pytest.
|
||||
iter_kwargs = OrderedDict([
|
||||
("model", ensure_wrapped(test_info.models)),
|
||||
("max_tokens", ensure_wrapped(test_info.max_tokens)),
|
||||
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
|
||||
("dtype", ensure_wrapped(test_info.dtype)),
|
||||
("distributed_executor_backend",
|
||||
ensure_wrapped(test_info.distributed_executor_backend)),
|
||||
])
|
||||
iter_kwargs = OrderedDict(
|
||||
[
|
||||
("model", ensure_wrapped(test_info.models)),
|
||||
("max_tokens", ensure_wrapped(test_info.max_tokens)),
|
||||
("num_logprobs", ensure_wrapped(test_info.num_logprobs)),
|
||||
("dtype", ensure_wrapped(test_info.dtype)),
|
||||
(
|
||||
"distributed_executor_backend",
|
||||
ensure_wrapped(test_info.distributed_executor_backend),
|
||||
),
|
||||
]
|
||||
)
|
||||
|
||||
# num_frames is video only
|
||||
if test_type == VLMTestType.VIDEO:
|
||||
iter_kwargs["num_video_frames"] = ensure_wrapped(
|
||||
test_info.num_video_frames)
|
||||
iter_kwargs["num_video_frames"] = ensure_wrapped(test_info.num_video_frames)
|
||||
|
||||
# No sizes passed for custom inputs, since inputs are directly provided
|
||||
if test_type not in (VLMTestType.CUSTOM_INPUTS, VLMTestType.AUDIO):
|
||||
wrapped_sizes = get_wrapped_test_sizes(test_info, test_type)
|
||||
if wrapped_sizes is None:
|
||||
raise ValueError(
|
||||
f"Sizes must be set for test type {test_type}")
|
||||
raise ValueError(f"Sizes must be set for test type {test_type}")
|
||||
iter_kwargs["size_wrapper"] = wrapped_sizes
|
||||
|
||||
#Otherwise expand the custom test options instead
|
||||
# Otherwise expand the custom test options instead
|
||||
elif test_type == VLMTestType.CUSTOM_INPUTS:
|
||||
if test_info.custom_test_opts is None:
|
||||
raise ValueError("Test has type CUSTOM_INPUTS, but none given")
|
||||
@@ -121,8 +138,8 @@ def get_parametrized_options(test_settings: dict[str, VLMTestInfo],
|
||||
|
||||
|
||||
def get_wrapped_test_sizes(
|
||||
test_info: VLMTestInfo,
|
||||
test_type: VLMTestType) -> tuple[ImageSizeWrapper, ...]:
|
||||
test_info: VLMTestInfo, test_type: VLMTestType
|
||||
) -> tuple[ImageSizeWrapper, ...]:
|
||||
"""Given a test info which may have size factors or fixed sizes, wrap them
|
||||
and combine them into an iterable, each of which will be used in parameter
|
||||
expansion.
|
||||
@@ -133,18 +150,18 @@ def get_wrapped_test_sizes(
|
||||
"""
|
||||
# If it is an embedding test, we always use the EMBEDDING_SIZE_FACTORS
|
||||
if test_type == VLMTestType.EMBEDDING:
|
||||
return tuple([
|
||||
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
|
||||
for factor in EMBEDDING_SIZE_FACTORS
|
||||
])
|
||||
return tuple(
|
||||
[
|
||||
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
|
||||
for factor in EMBEDDING_SIZE_FACTORS
|
||||
]
|
||||
)
|
||||
# Audio and Custom inputs have preprocessed inputs
|
||||
elif test_type in (VLMTestType.AUDIO, VLMTestType.CUSTOM_INPUTS):
|
||||
return tuple()
|
||||
|
||||
size_factors = test_info.image_size_factors \
|
||||
if test_info.image_size_factors else []
|
||||
fixed_sizes = test_info.image_sizes \
|
||||
if test_info.image_sizes else []
|
||||
size_factors = test_info.image_size_factors if test_info.image_size_factors else []
|
||||
fixed_sizes = test_info.image_sizes if test_info.image_sizes else []
|
||||
|
||||
wrapped_factors = [
|
||||
ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=factor)
|
||||
@@ -152,8 +169,7 @@ def get_wrapped_test_sizes(
|
||||
]
|
||||
|
||||
wrapped_sizes = [
|
||||
ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size)
|
||||
for size in fixed_sizes
|
||||
ImageSizeWrapper(type=SizeType.FIXED_SIZE, data=size) for size in fixed_sizes
|
||||
]
|
||||
|
||||
return tuple(wrapped_factors + wrapped_sizes)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Core test implementation to be shared across modalities."""
|
||||
|
||||
from typing import Any, Callable, Optional
|
||||
|
||||
import torch
|
||||
@@ -70,22 +71,23 @@ def run_test(
|
||||
if model_info.hf_overrides:
|
||||
vllm_runner_kwargs_["hf_overrides"] = model_info.hf_overrides
|
||||
if model_info.skip_tokenizer_init:
|
||||
vllm_runner_kwargs_[
|
||||
"skip_tokenizer_init"] = model_info.skip_tokenizer_init
|
||||
vllm_runner_kwargs_["skip_tokenizer_init"] = model_info.skip_tokenizer_init
|
||||
|
||||
if vllm_runner_kwargs:
|
||||
vllm_runner_kwargs_.update(vllm_runner_kwargs)
|
||||
|
||||
with vllm_runner(model,
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=max_num_seqs,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=enforce_eager,
|
||||
runner=runner,
|
||||
**vllm_runner_kwargs_) as vllm_model:
|
||||
with vllm_runner(
|
||||
model,
|
||||
max_model_len=max_model_len,
|
||||
max_num_seqs=max_num_seqs,
|
||||
dtype=dtype,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
tensor_parallel_size=tensor_parallel_size,
|
||||
distributed_executor_backend=distributed_executor_backend,
|
||||
enforce_eager=enforce_eager,
|
||||
runner=runner,
|
||||
**vllm_runner_kwargs_,
|
||||
) as vllm_model:
|
||||
tokenizer = vllm_model.llm.get_tokenizer()
|
||||
|
||||
vllm_kwargs: dict[str, Any] = {}
|
||||
@@ -95,21 +97,19 @@ def run_test(
|
||||
vllm_kwargs["stop"] = stop_str
|
||||
|
||||
for prompts, image_data, video_data, audio_data in vllm_inputs:
|
||||
mm_data = dict(images=image_data,
|
||||
videos=video_data,
|
||||
audios=audio_data)
|
||||
mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
|
||||
vllm_kwargs_with_mm_data = vllm_kwargs | mm_data
|
||||
vllm_output = vllm_model.generate_greedy_logprobs(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
**vllm_kwargs_with_mm_data)
|
||||
**vllm_kwargs_with_mm_data,
|
||||
)
|
||||
vllm_outputs_per_mm.append(vllm_output)
|
||||
|
||||
hf_model = hf_runner(model,
|
||||
dtype=dtype,
|
||||
auto_cls=auto_cls,
|
||||
model_kwargs=hf_model_kwargs)
|
||||
hf_model = hf_runner(
|
||||
model, dtype=dtype, auto_cls=auto_cls, model_kwargs=hf_model_kwargs
|
||||
)
|
||||
|
||||
# Some models need to patch things like the model processor, e.g., internvl
|
||||
if patch_hf_runner is not None:
|
||||
@@ -129,16 +129,15 @@ def run_test(
|
||||
hf_kwargs["stop_strings"] = stop_str
|
||||
|
||||
for prompts, image_data, video_data, audio_data in inputs:
|
||||
mm_data = dict(images=image_data,
|
||||
videos=video_data,
|
||||
audios=audio_data)
|
||||
mm_data = dict(images=image_data, videos=video_data, audios=audio_data)
|
||||
hf_kwargs_with_mm_data = hf_kwargs | mm_data
|
||||
hf_output = hf_model.generate_greedy_logprobs_limit(
|
||||
prompts,
|
||||
max_tokens,
|
||||
num_logprobs=num_logprobs,
|
||||
tokenizer=tokenizer,
|
||||
**hf_kwargs_with_mm_data)
|
||||
**hf_kwargs_with_mm_data,
|
||||
)
|
||||
hf_outputs_per_mm.append(hf_output)
|
||||
|
||||
# Apply output processing / sanitation to the vLLM and HF runner results
|
||||
@@ -150,8 +149,7 @@ def run_test(
|
||||
second_runner_processor=vllm_output_post_proc,
|
||||
)
|
||||
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm,
|
||||
vllm_outputs_per_mm):
|
||||
for hf_outputs, vllm_outputs in zip(hf_outputs_per_mm, vllm_outputs_per_mm):
|
||||
# This is usually check_logprobs_close, but it's passed through to
|
||||
# allow things like check_outputs_equal where needed
|
||||
comparator(
|
||||
@@ -171,15 +169,19 @@ def process_runner_outputs(
|
||||
):
|
||||
"""Applies the runner processor(s) to the runner outputs, if any."""
|
||||
if first_runner_processor is not None:
|
||||
first_runner_outputs = process_outputs(first_runner_processor, model,
|
||||
first_runner_outputs)
|
||||
first_runner_outputs = process_outputs(
|
||||
first_runner_processor, model, first_runner_outputs
|
||||
)
|
||||
if second_runner_processor is not None:
|
||||
second_runner_outputs = process_outputs(second_runner_processor, model,
|
||||
second_runner_outputs)
|
||||
second_runner_outputs = process_outputs(
|
||||
second_runner_processor, model, second_runner_outputs
|
||||
)
|
||||
return first_runner_outputs, second_runner_outputs
|
||||
|
||||
|
||||
def process_outputs(output_processor, model, outputs_per_image):
|
||||
"""Applies a model specific post-processor function to a runner's output"""
|
||||
return [[output_processor(res, model) for res in outputs]
|
||||
for outputs in outputs_per_image]
|
||||
return [
|
||||
[output_processor(res, model) for res in outputs]
|
||||
for outputs in outputs_per_image
|
||||
]
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Custom input builders for edge-cases in different models."""
|
||||
|
||||
from typing import Callable
|
||||
|
||||
from vllm.assets.image import ImageAsset
|
||||
from vllm.multimodal.image import rescale_image_size
|
||||
from vllm.multimodal.video import (rescale_video_size, resize_video,
|
||||
sample_frames_from_video)
|
||||
from vllm.multimodal.video import (
|
||||
rescale_video_size,
|
||||
resize_video,
|
||||
sample_frames_from_video,
|
||||
)
|
||||
|
||||
from .....conftest import IMAGE_ASSETS, VIDEO_ASSETS
|
||||
from .builders import build_multi_image_inputs, build_single_image_inputs
|
||||
@@ -15,7 +19,7 @@ from .types import ImageSizeWrapper, PromptWithMultiModalInput, SizeType
|
||||
|
||||
def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
|
||||
"""Builds inputs for multi-image (varied sizes/aspect ratio) testing.
|
||||
|
||||
|
||||
Args:
|
||||
formatter: model-specific prompt formatter.
|
||||
"""
|
||||
@@ -41,7 +45,7 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
|
||||
stop_sign,
|
||||
rescale_image_size(stop_sign, 0.25),
|
||||
cherry_blossom.resize((183, 488)),
|
||||
cherry_blossom.resize((488, 183))
|
||||
cherry_blossom.resize((488, 183)),
|
||||
],
|
||||
cherry_blossom,
|
||||
]
|
||||
@@ -54,10 +58,11 @@ def multi_image_multi_aspect_ratio_inputs(formatter: Callable[[str], str]):
|
||||
]
|
||||
|
||||
|
||||
def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
|
||||
num_frames: int = 16):
|
||||
def multi_video_multi_aspect_ratio_inputs(
|
||||
formatter: Callable[[str], str], num_frames: int = 16
|
||||
):
|
||||
"""Builds inputs for multi-video (varied sizes/aspect ratio) testing.
|
||||
|
||||
|
||||
Args:
|
||||
formatter: model-specific prompt formatter.
|
||||
"""
|
||||
@@ -81,7 +86,7 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
|
||||
video,
|
||||
rescale_video_size(video, 0.25),
|
||||
resize_video(video, (183, 488)),
|
||||
resize_video(video, (488, 183))
|
||||
resize_video(video, (488, 183)),
|
||||
],
|
||||
video,
|
||||
]
|
||||
@@ -96,7 +101,9 @@ def multi_video_multi_aspect_ratio_inputs(formatter: Callable[[str], str],
|
||||
|
||||
def different_patch_input_cases_internvl():
|
||||
images = [asset.pil_image.resize((896, 896)) for asset in IMAGE_ASSETS]
|
||||
formatter = lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n" # noqa: E501
|
||||
formatter = (
|
||||
lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n"
|
||||
) # noqa: E501
|
||||
single_img_prompts = [
|
||||
"<image>\nWhat's the content in the center of the image?",
|
||||
"<image>\nWhat is the season?",
|
||||
@@ -115,14 +122,14 @@ def different_patch_input_cases_internvl():
|
||||
|
||||
|
||||
def windows_attention_image_qwen2_5_vl():
|
||||
|
||||
# image from regression issue: https://github.com/vllm-project/vllm/issues/15122 # noqa: E501
|
||||
image = ImageAsset("hato").pil_image
|
||||
|
||||
question = "Describe the image."
|
||||
img_prompt = "<|vision_start|><|image_pad|><|vision_end|>"
|
||||
prompt = (f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n"
|
||||
"<|im_start|>assistant\n")
|
||||
prompt = (
|
||||
f"<|im_start|>User\n{img_prompt}{question}<|im_end|>\n<|im_start|>assistant\n"
|
||||
)
|
||||
|
||||
wrapped_sf = ImageSizeWrapper(type=SizeType.SIZE_FACTOR, data=[0.5])
|
||||
return build_single_image_inputs([image], [prompt], wrapped_sf)
|
||||
@@ -136,8 +143,9 @@ def video_with_metadata_glm4_1v():
|
||||
formatted_prompt = f"<|user|>\n{video_prompt}{question}<|assistant|>\n"
|
||||
|
||||
scales = [0.1, 0.2, 0.25]
|
||||
video_input = [[(rescale_video_size(video_array, scale), metadata)]
|
||||
for scale in scales]
|
||||
video_input = [
|
||||
[(rescale_video_size(video_array, scale), metadata)] for scale in scales
|
||||
]
|
||||
prompts = [formatted_prompt] * len(video_input)
|
||||
|
||||
return [
|
||||
|
||||
@@ -4,6 +4,7 @@
|
||||
for manipulating the input / output of HF & vLLM test runners, which are
|
||||
typically specific to a small subset of models.
|
||||
"""
|
||||
|
||||
import types
|
||||
from pathlib import PosixPath
|
||||
from typing import Optional, Union
|
||||
@@ -15,8 +16,13 @@ import pytest
|
||||
import regex as re
|
||||
import torch
|
||||
from PIL.Image import Image
|
||||
from transformers import (AutoConfig, AutoTokenizer, BatchFeature,
|
||||
GenerationConfig, GenerationMixin)
|
||||
from transformers import (
|
||||
AutoConfig,
|
||||
AutoTokenizer,
|
||||
BatchFeature,
|
||||
GenerationConfig,
|
||||
GenerationMixin,
|
||||
)
|
||||
from transformers.video_utils import VideoMetadata
|
||||
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
@@ -27,8 +33,7 @@ from .types import RunnerOutput
|
||||
|
||||
|
||||
####### vLLM output processors functions
|
||||
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def blip2_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [blip2 models] to be comparable with hf output."""
|
||||
_, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -42,8 +47,7 @@ def blip2_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [fuyu models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -53,8 +57,8 @@ def fuyu_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
|
||||
|
||||
def qwen_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
"""Sanitize vllm output [qwen models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -64,8 +68,8 @@ def qwen_vllm_to_hf_output(
|
||||
|
||||
|
||||
def qwen2_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
"""Sanitize vllm output [qwen2 models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -75,8 +79,8 @@ def qwen2_vllm_to_hf_output(
|
||||
|
||||
|
||||
def kimiv_vl_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
"""Sanitize vllm output [kimi_vl models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -85,23 +89,25 @@ def kimiv_vl_vllm_to_hf_output(
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def llava_image_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def llava_image_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> RunnerOutput:
|
||||
config = AutoConfig.from_pretrained(model)
|
||||
mm_token_id = config.image_token_index
|
||||
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
|
||||
|
||||
|
||||
def llava_video_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput,
|
||||
model: str) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> tuple[list[int], str, Optional[SampleLogprobs]]:
|
||||
config = AutoConfig.from_pretrained(model)
|
||||
mm_token_id = config.video_token_index
|
||||
return _llava_vllm_to_hf_output(vllm_output, model, mm_token_id)
|
||||
|
||||
|
||||
def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
|
||||
mm_token_id: int) -> RunnerOutput:
|
||||
def _llava_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput, model: str, mm_token_id: int
|
||||
) -> RunnerOutput:
|
||||
"""Sanitize vllm output [Llava models] to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -109,7 +115,8 @@ def _llava_vllm_to_hf_output(vllm_output: RunnerOutput, model: str,
|
||||
eos_token_id = tokenizer.eos_token_id
|
||||
|
||||
hf_output_ids = [
|
||||
token_id for idx, token_id in enumerate(output_ids)
|
||||
token_id
|
||||
for idx, token_id in enumerate(output_ids)
|
||||
if token_id != mm_token_id or output_ids[idx - 1] != mm_token_id
|
||||
]
|
||||
|
||||
@@ -128,8 +135,9 @@ def llava_onevision_hf_model_kwargs(model: str) -> dict:
|
||||
return config.to_dict()
|
||||
|
||||
|
||||
def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def llava_onevision_vllm_to_hf_output(
|
||||
vllm_output: RunnerOutput, model: str
|
||||
) -> RunnerOutput:
|
||||
"""Sanitize vllm output [llava-onevision] to compare with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -140,7 +148,8 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
eos_token_id = tokenizer.eos_token_id
|
||||
|
||||
hf_output_ids = [
|
||||
token_id for idx, token_id in enumerate(output_ids)
|
||||
token_id
|
||||
for idx, token_id in enumerate(output_ids)
|
||||
if token_id != video_token_id or output_ids[idx - 1] != video_token_id
|
||||
]
|
||||
|
||||
@@ -151,8 +160,7 @@ def llava_onevision_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def mantis_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [mantis] to compare with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -161,8 +169,7 @@ def mantis_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
return output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output [phi3v] to be comparable with hf output."""
|
||||
_, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -180,8 +187,7 @@ def phi3v_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
return hf_output_ids, hf_output_str, out_logprobs
|
||||
|
||||
|
||||
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
"""Sanitize vllm output to be comparable with hf output."""
|
||||
output_ids, output_str, out_logprobs = vllm_output
|
||||
|
||||
@@ -192,7 +198,8 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
eos_token_id = tokenizer.eos_token_id
|
||||
|
||||
hf_output_ids = [
|
||||
token_id for idx, token_id in enumerate(output_ids)
|
||||
token_id
|
||||
for idx, token_id in enumerate(output_ids)
|
||||
if token_id != image_token_id or output_ids[idx - 1] != image_token_id
|
||||
]
|
||||
|
||||
@@ -205,46 +212,40 @@ def paligemma_vllm_to_hf_output(vllm_output: RunnerOutput,
|
||||
|
||||
|
||||
####### Post-processors for HF outputs
|
||||
def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def deepseekvl2_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<|end▁of▁sentence|>"):
|
||||
output_str = output_str.split("<|end▁of▁sentence|>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def idefics3_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def idefics3_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<end_of_utterance>"):
|
||||
output_str = output_str.split("<end_of_utterance>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def smolvlm_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def smolvlm_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
# Based on Idefics3
|
||||
return idefics3_trunc_hf_output(hf_output, model)
|
||||
|
||||
|
||||
def minicpmv_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def minicpmv_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<|eot_id|>"):
|
||||
output_str = output_str.split("<|eot_id|>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def minimax_vl_01_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def minimax_vl_01_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
if output_str.endswith("<end_of_sentence>"):
|
||||
output_str = output_str.split("<end_of_sentence>")[0]
|
||||
return output_ids, output_str, out_logprobs
|
||||
|
||||
|
||||
def ultravox_trunc_hf_output(hf_output: RunnerOutput,
|
||||
model: str) -> RunnerOutput:
|
||||
def ultravox_trunc_hf_output(hf_output: RunnerOutput, model: str) -> RunnerOutput:
|
||||
output_ids, output_str, out_logprobs = hf_output
|
||||
|
||||
tokenizer = AutoTokenizer.from_pretrained(model)
|
||||
@@ -262,8 +263,8 @@ def get_llava_embeddings(image_assets: ImageTestAssets):
|
||||
|
||||
####### Prompt path encoders for models that need models on disk
|
||||
def qwen_prompt_path_encoder(
|
||||
tmp_path: PosixPath, prompt: str,
|
||||
assets: Union[list[ImageAsset], ImageTestAssets]) -> str:
|
||||
tmp_path: PosixPath, prompt: str, assets: Union[list[ImageAsset], ImageTestAssets]
|
||||
) -> str:
|
||||
"""Given a temporary dir path, export one or more image assets into the
|
||||
tempdir & replace its contents with the local path to the string so that
|
||||
the HF version of Qwen-VL can resolve the path and load the image in its
|
||||
@@ -313,8 +314,9 @@ def deepseekvl2_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
return BatchFeature(data=inputs, tensor_type="pt")
|
||||
|
||||
hf_model.processor = processor
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language.model.embed_tokens
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.language.model.embed_tokens
|
||||
)
|
||||
return hf_model
|
||||
|
||||
|
||||
@@ -357,11 +359,10 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
assert len(contents) == len(images)
|
||||
|
||||
return hf_processor.apply_chat_template(
|
||||
[{
|
||||
"role": "user",
|
||||
"image": image,
|
||||
"content": content
|
||||
} for image, content in zip(images, contents)],
|
||||
[
|
||||
{"role": "user", "image": image, "content": content}
|
||||
for image, content in zip(images, contents)
|
||||
],
|
||||
add_generation_prompt=True,
|
||||
tokenize=True,
|
||||
return_dict=True,
|
||||
@@ -369,8 +370,9 @@ def glm4v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
)
|
||||
|
||||
hf_model.processor = processor
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.transformer.output_layer
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.transformer.output_layer
|
||||
)
|
||||
return hf_model
|
||||
|
||||
|
||||
@@ -387,10 +389,9 @@ def glm4_1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
else:
|
||||
video_metadata = None
|
||||
|
||||
return hf_processor(*args,
|
||||
videos=videos,
|
||||
video_metadata=video_metadata,
|
||||
**kwargs)
|
||||
return hf_processor(
|
||||
*args, videos=videos, video_metadata=video_metadata, **kwargs
|
||||
)
|
||||
|
||||
hf_model.processor = processor
|
||||
return hf_model
|
||||
@@ -406,8 +407,9 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
self.num_image_token = hf_runner.model.num_image_token
|
||||
self.tokenizer = hf_runner.tokenizer
|
||||
|
||||
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
|
||||
trust_remote_code=True)
|
||||
self.config = AutoConfig.from_pretrained(
|
||||
hf_runner.model_name, trust_remote_code=True
|
||||
)
|
||||
self.vision_config = self.config.vision_config
|
||||
self.use_thumbnail = self.config.use_thumbnail
|
||||
self.use_msac = self.config.use_msac
|
||||
@@ -415,11 +417,14 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
self.max_num = self.config.max_dynamic_patch
|
||||
self.image_size = self.vision_config.image_size
|
||||
|
||||
def __call__(self, text: str, images: Union[Image, list[Image]],
|
||||
**kwargs):
|
||||
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
|
||||
# yapf: disable
|
||||
from vllm.model_executor.models.h2ovl import (
|
||||
IMG_CONTEXT, IMG_END, IMG_START, image_to_pixel_values_h2ovl)
|
||||
IMG_CONTEXT,
|
||||
IMG_END,
|
||||
IMG_START,
|
||||
image_to_pixel_values_h2ovl,
|
||||
)
|
||||
|
||||
# yapf: enable
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
@@ -431,29 +436,26 @@ def h2ovl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
max_num=self.max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
use_msac=self.use_msac,
|
||||
) for image in images
|
||||
]
|
||||
num_patches_list = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
|
||||
pixel_values = torch.cat(pixel_values, dim=0)
|
||||
for num_patches in num_patches_list:
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token \
|
||||
* num_patches
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
|
||||
image_tokens = IMG_START + context_tokens + IMG_END
|
||||
text = text.replace('<image>', image_tokens, 1)
|
||||
text = text.replace("<image>", image_tokens, 1)
|
||||
prompt = self.tokenizer(text, return_tensors="pt")
|
||||
prompt.update({"pixel_values": pixel_values})
|
||||
return prompt
|
||||
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
|
||||
"<IMG_CONTEXT>")
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
|
||||
hf_model.model.img_context_token_id = img_context_token_id
|
||||
hf_model.processor = H2OVLProcessor(hf_model)
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language_model.get_output_embeddings()
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate,
|
||||
hf_model.model)
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.language_model.get_output_embeddings()
|
||||
)
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
|
||||
return hf_model
|
||||
|
||||
|
||||
@@ -467,19 +469,23 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
self.num_image_token = hf_runner.model.num_image_token
|
||||
self.tokenizer = hf_runner.tokenizer
|
||||
|
||||
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
|
||||
trust_remote_code=True)
|
||||
self.config = AutoConfig.from_pretrained(
|
||||
hf_runner.model_name, trust_remote_code=True
|
||||
)
|
||||
self.vision_config = self.config.vision_config
|
||||
self.use_thumbnail = self.config.use_thumbnail
|
||||
self.min_num = self.config.min_dynamic_patch
|
||||
self.max_num = self.config.max_dynamic_patch
|
||||
self.image_size = self.vision_config.image_size
|
||||
|
||||
def __call__(self, text: str, images: Union[Image, list[Image]],
|
||||
**kwargs):
|
||||
def __call__(self, text: str, images: Union[Image, list[Image]], **kwargs):
|
||||
from vllm.model_executor.models.skyworkr1v import (
|
||||
IMG_CONTEXT, IMG_END, IMG_START,
|
||||
image_to_pixel_values_skyworkr1v)
|
||||
IMG_CONTEXT,
|
||||
IMG_END,
|
||||
IMG_START,
|
||||
image_to_pixel_values_skyworkr1v,
|
||||
)
|
||||
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
pixel_values = [
|
||||
image_to_pixel_values_skyworkr1v(
|
||||
@@ -488,29 +494,26 @@ def skyworkr1v_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
min_num=self.min_num,
|
||||
max_num=self.max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
) for image in images
|
||||
]
|
||||
num_patches_list = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
num_patches_list = [pixel_value.shape[0] for pixel_value in pixel_values]
|
||||
pixel_values = torch.cat(pixel_values, dim=0)
|
||||
for num_patches in num_patches_list:
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token \
|
||||
* num_patches
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
|
||||
image_tokens = IMG_START + context_tokens + IMG_END
|
||||
text = text.replace('<image>', image_tokens, 1)
|
||||
text = text.replace("<image>", image_tokens, 1)
|
||||
prompt = self.tokenizer(text, return_tensors="pt")
|
||||
prompt.update({"pixel_values": pixel_values})
|
||||
return prompt
|
||||
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
|
||||
"<IMG_CONTEXT>")
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
|
||||
hf_model.model.img_context_token_id = img_context_token_id
|
||||
hf_model.processor = SkyworkR1VProcessor(hf_model)
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language_model.get_output_embeddings()
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate,
|
||||
hf_model.model)
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.language_model.get_output_embeddings()
|
||||
)
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
|
||||
return hf_model
|
||||
|
||||
|
||||
@@ -524,8 +527,9 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
self.num_image_token = hf_runner.model.num_image_token
|
||||
self.tokenizer = hf_runner.tokenizer
|
||||
|
||||
self.config = AutoConfig.from_pretrained(hf_runner.model_name,
|
||||
trust_remote_code=True)
|
||||
self.config = AutoConfig.from_pretrained(
|
||||
hf_runner.model_name, trust_remote_code=True
|
||||
)
|
||||
self.vision_config = self.config.vision_config
|
||||
self.use_thumbnail = self.config.use_thumbnail
|
||||
self.min_num = self.config.min_dynamic_patch
|
||||
@@ -540,8 +544,13 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
**kwargs,
|
||||
):
|
||||
from vllm.model_executor.models.internvl import (
|
||||
IMG_CONTEXT, IMG_END, IMG_START,
|
||||
image_to_pixel_values_internvl, video_to_pixel_values_internvl)
|
||||
IMG_CONTEXT,
|
||||
IMG_END,
|
||||
IMG_START,
|
||||
image_to_pixel_values_internvl,
|
||||
video_to_pixel_values_internvl,
|
||||
)
|
||||
|
||||
images = [images] if isinstance(images, Image) else images
|
||||
videos = [videos] if isinstance(videos, np.ndarray) else videos
|
||||
if images is not None:
|
||||
@@ -552,7 +561,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
min_num=self.min_num,
|
||||
max_num=self.max_num,
|
||||
use_thumbnail=self.use_thumbnail,
|
||||
) for image in images
|
||||
)
|
||||
for image in images
|
||||
]
|
||||
num_patches_images = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values_images
|
||||
@@ -568,7 +578,8 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
min_num=1,
|
||||
max_num=1,
|
||||
use_thumbnail=False,
|
||||
) for video in videos
|
||||
)
|
||||
for video in videos
|
||||
]
|
||||
num_patches_videos = [
|
||||
pixel_value.shape[0] for pixel_value in pixel_values_videos
|
||||
@@ -580,38 +591,37 @@ def internvl_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
while ("<image>" in text) or ("<video>" in text):
|
||||
image_index = text.find("<image>")
|
||||
video_index = text.find("<video>")
|
||||
if image_index == -1 or (video_index > -1
|
||||
and video_index < image_index):
|
||||
if image_index == -1 or (
|
||||
video_index > -1 and video_index < image_index
|
||||
):
|
||||
num_patches = num_patches_videos.pop(0)
|
||||
pixel_values.append(pixel_values_videos.pop(0))
|
||||
context_tokens = IMG_START + \
|
||||
IMG_CONTEXT * self.num_image_token + IMG_END
|
||||
video_tokens = ''.join([
|
||||
f'Frame{i+1}: {context_tokens}'
|
||||
for i in range(num_patches)
|
||||
])
|
||||
text = text.replace('<video>', video_tokens, 1)
|
||||
context_tokens = (
|
||||
IMG_START + IMG_CONTEXT * self.num_image_token + IMG_END
|
||||
)
|
||||
video_tokens = "".join(
|
||||
[f"Frame{i + 1}: {context_tokens}" for i in range(num_patches)]
|
||||
)
|
||||
text = text.replace("<video>", video_tokens, 1)
|
||||
else:
|
||||
num_patches = num_patches_images.pop(0)
|
||||
pixel_values.append(pixel_values_images.pop(0))
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token \
|
||||
* num_patches
|
||||
context_tokens = IMG_CONTEXT * self.num_image_token * num_patches
|
||||
image_tokens = IMG_START + context_tokens + IMG_END
|
||||
text = text.replace('<image>', image_tokens, 1)
|
||||
text = text.replace("<image>", image_tokens, 1)
|
||||
pixel_values = torch.cat(pixel_values, dim=0)
|
||||
|
||||
prompt = self.tokenizer(text, return_tensors="pt")
|
||||
prompt.update({"pixel_values": pixel_values})
|
||||
return prompt
|
||||
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids(
|
||||
"<IMG_CONTEXT>")
|
||||
img_context_token_id = hf_model.tokenizer.convert_tokens_to_ids("<IMG_CONTEXT>")
|
||||
hf_model.model.img_context_token_id = img_context_token_id
|
||||
hf_model.processor = InternVLProcessor(hf_model)
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.language_model.get_output_embeddings()
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate,
|
||||
hf_model.model)
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.language_model.get_output_embeddings()
|
||||
)
|
||||
hf_model.model.generate = types.MethodType(_internvl_generate, hf_model.model)
|
||||
return hf_model
|
||||
|
||||
|
||||
@@ -631,7 +641,7 @@ def _internvl_generate(
|
||||
input_embeds = input_embeds.reshape(B * N, C)
|
||||
|
||||
input_ids = input_ids.reshape(B * N)
|
||||
selected = (input_ids == self.img_context_token_id)
|
||||
selected = input_ids == self.img_context_token_id
|
||||
assert selected.sum() != 0
|
||||
input_embeds[selected] = vit_embeds.reshape(-1, C).to(input_embeds.device)
|
||||
|
||||
@@ -778,8 +788,9 @@ def molmo_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
|
||||
def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for Ovis2."""
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.llm.get_output_embeddings()
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.llm.get_output_embeddings()
|
||||
)
|
||||
|
||||
def processor(*args, text="", images=None, **kwargs):
|
||||
text_tokenizer = hf_model.model.get_text_tokenizer()
|
||||
@@ -787,8 +798,7 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
|
||||
prompt_start_and_end = {
|
||||
"qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
|
||||
"llama":
|
||||
("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
|
||||
"llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
|
||||
"gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
|
||||
}
|
||||
for start, end in prompt_start_and_end.values():
|
||||
@@ -797,7 +807,8 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
break
|
||||
|
||||
prompt, input_ids, pixel_values = hf_model.model.preprocess_inputs(
|
||||
text_or_conversations=text, images=images)
|
||||
text_or_conversations=text, images=images
|
||||
)
|
||||
attention_mask = torch.ne(input_ids, text_tokenizer.pad_token_id)
|
||||
|
||||
inputs = {
|
||||
@@ -813,8 +824,9 @@ def ovis_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
|
||||
def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
"""Patches and returns an instance of the HfRunner to use for Ovis2."""
|
||||
hf_model.model.get_output_embeddings = lambda: \
|
||||
hf_model.model.llm.get_output_embeddings()
|
||||
hf_model.model.get_output_embeddings = (
|
||||
lambda: hf_model.model.llm.get_output_embeddings()
|
||||
)
|
||||
|
||||
def processor(*args, text="", images=None, videos=None, **kwargs):
|
||||
if images is None:
|
||||
@@ -825,13 +837,11 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
videos = []
|
||||
else:
|
||||
videos = [videos] if isinstance(videos, np.ndarray) else videos
|
||||
videos = [[PIL.Image.fromarray(frame) for frame in vid]
|
||||
for vid in videos]
|
||||
videos = [[PIL.Image.fromarray(frame) for frame in vid] for vid in videos]
|
||||
|
||||
prompt_start_and_end = {
|
||||
"qwen2": ("<|im_start|>user\n", "<|im_end|>\n"),
|
||||
"llama":
|
||||
("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
|
||||
"llama": ("<|start_header_id|>user<|end_header_id|>\n\n", "<|eot_id|>"),
|
||||
"gemma2": ("<start_of_turn>user\n", "<end_of_turn>\n"),
|
||||
}
|
||||
for start, end in prompt_start_and_end.values():
|
||||
@@ -842,21 +852,20 @@ def ovis2_5_patch_hf_runner(hf_model: HfRunner) -> HfRunner:
|
||||
images_message = [{"type": "image", "image": img} for img in images]
|
||||
videos_message = [{"type": "video", "video": vid} for vid in videos]
|
||||
|
||||
messages = [{
|
||||
"role":
|
||||
"user",
|
||||
"content": [
|
||||
*images_message,
|
||||
*videos_message,
|
||||
{
|
||||
"type": "text",
|
||||
"text": text
|
||||
},
|
||||
],
|
||||
}]
|
||||
messages = [
|
||||
{
|
||||
"role": "user",
|
||||
"content": [
|
||||
*images_message,
|
||||
*videos_message,
|
||||
{"type": "text", "text": text},
|
||||
],
|
||||
}
|
||||
]
|
||||
|
||||
input_ids, pixel_values, grid_thws = hf_model.model.preprocess_inputs(
|
||||
messages=messages, enable_thinking=True)
|
||||
messages=messages, enable_thinking=True
|
||||
)
|
||||
inputs = {
|
||||
"inputs": input_ids,
|
||||
"pixel_values": pixel_values,
|
||||
|
||||
@@ -3,23 +3,34 @@
|
||||
"""Entrypoints for wrapping the core run_test implementation for specific test
|
||||
types / modalities.
|
||||
"""
|
||||
|
||||
from pathlib import PosixPath
|
||||
|
||||
from .....conftest import (AudioTestAssets, HfRunner, ImageTestAssets,
|
||||
VideoTestAssets, VllmRunner)
|
||||
from .....conftest import (
|
||||
AudioTestAssets,
|
||||
HfRunner,
|
||||
ImageTestAssets,
|
||||
VideoTestAssets,
|
||||
VllmRunner,
|
||||
)
|
||||
from . import builders, core
|
||||
from .types import ExpandableVLMTestArgs, VLMTestInfo
|
||||
|
||||
|
||||
####### Entrypoints for running different test types
|
||||
def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets):
|
||||
def run_single_image_test(
|
||||
*,
|
||||
tmp_path: PosixPath,
|
||||
model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
assert test_case.size_wrapper is not None
|
||||
inputs = builders.build_single_image_inputs_from_test_info(
|
||||
model_test_info, image_assets, test_case.size_wrapper, tmp_path)
|
||||
model_test_info, image_assets, test_case.size_wrapper, tmp_path
|
||||
)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
@@ -31,17 +42,23 @@ def run_single_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets):
|
||||
def run_multi_image_test(
|
||||
*,
|
||||
tmp_path: PosixPath,
|
||||
model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
assert test_case.size_wrapper is not None
|
||||
inputs = builders.build_multi_image_inputs_from_test_info(
|
||||
model_test_info, image_assets, test_case.size_wrapper, tmp_path)
|
||||
model_test_info, image_assets, test_case.size_wrapper, tmp_path
|
||||
)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
@@ -53,17 +70,22 @@ def run_multi_image_test(*, tmp_path: PosixPath, model_test_info: VLMTestInfo,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"image": len(image_assets)},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
def run_embedding_test(*, model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets):
|
||||
def run_embedding_test(
|
||||
*,
|
||||
model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
image_assets: ImageTestAssets,
|
||||
):
|
||||
assert test_case.size_wrapper is not None
|
||||
inputs, vllm_embeddings = builders.build_embedding_inputs_from_test_info(
|
||||
model_test_info, image_assets, test_case.size_wrapper)
|
||||
model_test_info, image_assets, test_case.size_wrapper
|
||||
)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
@@ -76,7 +98,8 @@ def run_embedding_test(*, model_test_info: VLMTestInfo,
|
||||
limit_mm_per_prompt={"image": 1},
|
||||
vllm_embeddings=vllm_embeddings,
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
def run_video_test(
|
||||
@@ -90,8 +113,11 @@ def run_video_test(
|
||||
assert test_case.size_wrapper is not None
|
||||
assert test_case.num_video_frames is not None
|
||||
inputs = builders.build_video_inputs_from_test_info(
|
||||
model_test_info, video_assets, test_case.size_wrapper,
|
||||
test_case.num_video_frames)
|
||||
model_test_info,
|
||||
video_assets,
|
||||
test_case.size_wrapper,
|
||||
test_case.num_video_frames,
|
||||
)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
@@ -103,7 +129,8 @@ def run_video_test(
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"video": len(video_assets)},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
def run_audio_test(
|
||||
@@ -114,8 +141,7 @@ def run_audio_test(
|
||||
vllm_runner: type[VllmRunner],
|
||||
audio_assets: AudioTestAssets,
|
||||
):
|
||||
inputs = builders.build_audio_inputs_from_test_info(
|
||||
model_test_info, audio_assets)
|
||||
inputs = builders.build_audio_inputs_from_test_info(model_test_info, audio_assets)
|
||||
|
||||
core.run_test(
|
||||
hf_runner=hf_runner,
|
||||
@@ -127,13 +153,17 @@ def run_audio_test(
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt={"audio": 1},
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
|
||||
def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner]):
|
||||
def run_custom_inputs_test(
|
||||
*,
|
||||
model_test_info: VLMTestInfo,
|
||||
test_case: ExpandableVLMTestArgs,
|
||||
hf_runner: type[HfRunner],
|
||||
vllm_runner: type[VllmRunner],
|
||||
):
|
||||
# Custom test cases can provide inputs directly, but they need to
|
||||
# explicitly provided a CustomTestConfig, which wraps the inputs and
|
||||
# the limit_mm_per_prompt
|
||||
@@ -155,4 +185,5 @@ def run_custom_inputs_test(*, model_test_info: VLMTestInfo,
|
||||
num_logprobs=test_case.num_logprobs,
|
||||
limit_mm_per_prompt=limit_mm_per_prompt,
|
||||
distributed_executor_backend=test_case.distributed_executor_backend,
|
||||
**model_test_info.get_non_parametrized_runner_kwargs())
|
||||
**model_test_info.get_non_parametrized_runner_kwargs(),
|
||||
)
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Types for writing multimodal model tests."""
|
||||
|
||||
from collections.abc import Iterable
|
||||
from enum import Enum
|
||||
from pathlib import PosixPath
|
||||
@@ -15,9 +16,16 @@ from vllm.config import RunnerOption
|
||||
from vllm.logprobs import SampleLogprobs
|
||||
from vllm.transformers_utils.tokenizer import AnyTokenizer
|
||||
|
||||
from .....conftest import (AUDIO_ASSETS, IMAGE_ASSETS, HfRunner, ImageAsset,
|
||||
ImageTestAssets, PromptAudioInput, PromptImageInput,
|
||||
PromptVideoInput)
|
||||
from .....conftest import (
|
||||
AUDIO_ASSETS,
|
||||
IMAGE_ASSETS,
|
||||
HfRunner,
|
||||
ImageAsset,
|
||||
ImageTestAssets,
|
||||
PromptAudioInput,
|
||||
PromptImageInput,
|
||||
PromptVideoInput,
|
||||
)
|
||||
from ....utils import check_logprobs_close
|
||||
|
||||
# meta image tag; will be replaced by the appropriate tag for the model
|
||||
@@ -47,6 +55,7 @@ RunnerOutput = tuple[list[int], str, Optional[SampleLogprobs]]
|
||||
|
||||
class PromptWithMultiModalInput(NamedTuple):
|
||||
"""Holds the multimodal input for a single test case."""
|
||||
|
||||
prompts: list[str]
|
||||
image_data: Optional[PromptImageInput] = None
|
||||
video_data: Optional[PromptVideoInput] = None
|
||||
@@ -100,8 +109,9 @@ class VLMTestInfo(NamedTuple):
|
||||
|
||||
# Function for converting ImageAssets to image embeddings;
|
||||
# We need to define this explicitly for embedding tests
|
||||
convert_assets_to_embeddings: Optional[Callable[[ImageTestAssets],
|
||||
list[torch.Tensor]]] = None
|
||||
convert_assets_to_embeddings: Optional[
|
||||
Callable[[ImageTestAssets], list[torch.Tensor]]
|
||||
] = None
|
||||
|
||||
# Exposed options for vLLM runner; we change these in a several tests,
|
||||
# but the defaults are derived from VllmRunner & the engine defaults
|
||||
@@ -156,8 +166,8 @@ class VLMTestInfo(NamedTuple):
|
||||
# for Qwen-VL, which requires encoding the image path / url into the prompt
|
||||
# for HF runner
|
||||
prompt_path_encoder: Optional[
|
||||
Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]],
|
||||
str]] = None # noqa: E501
|
||||
Callable[[PosixPath, str, Union[list[ImageAsset], ImageTestAssets]], str]
|
||||
] = None # noqa: E501
|
||||
|
||||
# Allows configuring a test to run with custom inputs
|
||||
custom_test_opts: Optional[list[CustomTestOptions]] = None
|
||||
@@ -190,6 +200,7 @@ class VLMTestInfo(NamedTuple):
|
||||
|
||||
class ExpandableVLMTestArgs(NamedTuple):
|
||||
"""The expanded kwargs which correspond to a single test case."""
|
||||
|
||||
model: str
|
||||
max_tokens: int
|
||||
num_logprobs: int
|
||||
|
||||
Reference in New Issue
Block a user