# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Common tests for testing .generate() functionality for single / multiple
image, embedding, and video support for different VLMs in vLLM.
"""

import math
from collections import defaultdict
from pathlib import PosixPath

import pytest
from packaging.version import Version
from transformers import (
    AutoModel,
    AutoModelForCausalLM,
    AutoModelForImageTextToText,
    AutoModelForTextToWaveform,
)
from transformers import __version__ as TRANSFORMERS_VERSION

from vllm.platforms import current_platform
from vllm.utils.func_utils import identity

from ....conftest import (
    IMAGE_ASSETS,
    AudioTestAssets,
    HfRunner,
    ImageTestAssets,
    VideoTestAssets,
    VllmRunner,
)
from ....utils import create_new_process_for_each_test, large_gpu_mark, multi_gpu_marks
from ...utils import check_outputs_equal
from .vlm_utils import custom_inputs, model_utils, runners
from .vlm_utils.case_filtering import get_parametrized_options
from .vlm_utils.types import (
    CustomTestOptions,
    ExpandableVLMTestArgs,
    VLMTestInfo,
    VLMTestType,
)

COMMON_BROADCAST_SETTINGS = {
    "test_type": VLMTestType.IMAGE,
    "dtype": "half",
    "max_tokens": 5,
    "tensor_parallel_size": 2,
    "hf_model_kwargs": {"device_map": "auto"},
    "image_size_factors": [(0.25, 0.5, 1.0)],
    "distributed_executor_backend": (
        "ray",
        "mp",
    ),
}
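
# How these settings are consumed (a minimal sketch based on the "*-broadcast"
# entries near the end of VLM_TEST_SETTINGS below): each broadcast entry
# unpacks this dict on top of its model-specific fields, e.g.
#
#   "llava-broadcast": VLMTestInfo(
#       models=["llava-hf/llava-1.5-7b-hf"],
#       prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
#       ...,
#       **COMMON_BROADCAST_SETTINGS,
#   )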
### Test configuration for specific models
# NOTE: The convention of the test settings below is to lead each test key
# with the name of the model arch used in the test, using underscores in place
# of hyphens; this makes it more convenient to filter tests for a specific kind
# of model. For example....
#
# To run all test types for a specific key:
# use the k flag to substring match with a leading square bracket; if the
# model arch happens to be a substring of another one, you can add a
# trailing hyphen. E.g.,
# - pytest $TEST_FILE -k "[llava-"
# prevents matching on "[llava_next-" & will match just the enabled cases
# for llava, i.e., single image, image embedding, and custom input tests.
#
# To run a test for a Test Info for just one of multiple models:
# use the k flag to substring match the model name, e.g.,
# - pytest $TEST_FILE -k OpenGVLab/InternVL2-1B
# prevents matching on OpenGVLab/InternVL2-2B.
#
# You can also combine substrings to match more granularly.
# ex 1:
# pytest $TEST_FILE -k "test_single_image and OpenGVLab/InternVL2-1B"
# will run only test_single_image* for OpenGVLab/InternVL2-1B; this would
# match both wrappers for single image tests, since it also matches
# test_single_image_heavy (which forks if we have a distributed backend)
# ex 2:
# pytest $TEST_FILE -k "[llava- or [intern_vl-"
# will run all of the tests for only llava & internvl.
#
# NOTE you can add --collect-only to any of the above commands to see
# which cases would be selected and deselected by pytest. In general,
# this is a good idea for checking your command first, since tests are slow.
VLM_TEST_SETTINGS = {
    #### Core tests to always run in the CI
    "llava": VLMTestInfo(
        models=["llava-hf/llava-1.5-7b-hf"],
        test_type=(VLMTestType.EMBEDDING, VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        convert_assets_to_embeddings=model_utils.get_llava_embeddings,
        max_model_len=4096,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        custom_test_opts=[
            CustomTestOptions(
                inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                    formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:"
                ),
                limit_mm_per_prompt={"image": 4},
            )
        ],
        vllm_runner_kwargs={"enable_mm_embeds": True},
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
    "paligemma": VLMTestInfo(
        models=["google/paligemma-3b-mix-224"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=identity,
        img_idx_to_prompt=lambda idx: "",
        # Paligemma uses its own sample prompts because the default one fails
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "caption es",
                "cherry_blossom": "What is in the picture?",
            }
        ),
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.paligemma_vllm_to_hf_output,
        dtype="bfloat16",
        marks=[
            pytest.mark.skip(reason="vLLM does not support PrefixLM attention mask")
        ],
    ),
    "qwen2_5_vl": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
        enforce_eager=False,
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
    "qwen2_5_omni": VLMTestInfo(
        models=["Qwen/Qwen2.5-Omni-3B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_bos|><|IMAGE|><|vision_eos|>",
        video_idx_to_prompt=lambda idx: "<|vision_bos|><|VIDEO|><|vision_eos|>",
        max_model_len=4096,
        max_num_seqs=2,
        num_logprobs=6 if current_platform.is_cpu() else 5,
        auto_cls=AutoModelForTextToWaveform,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        patch_hf_runner=model_utils.qwen2_5_omni_patch_hf_runner,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
    "qwen3_vl": VLMTestInfo(
        models=["Qwen/Qwen3-VL-4B-Instruct"],
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO,
        ),
        enforce_eager=False,
        needs_video_metadata=True,
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",  # noqa: E501
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        num_logprobs=20,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        patch_hf_runner=model_utils.qwen3_vl_patch_hf_runner,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[
            pytest.mark.core_model,
        ],
    ),
    "ultravox": VLMTestInfo(
        models=["fixie-ai/ultravox-v0_5-llama-3_2-1b"],
        test_type=VLMTestType.AUDIO,
        prompt_formatter=lambda audio_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{audio_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        audio_idx_to_prompt=lambda idx: "<|audio|>",
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModel,
        hf_output_post_proc=model_utils.ultravox_trunc_hf_output,
        marks=[pytest.mark.core_model, pytest.mark.cpu_model],
    ),
    #### Transformers fallback to test
    ## To reduce test burden, we only test batching arbitrary image size
    # Dynamic image length and number of patches
    "llava-onevision-transformers": VLMTestInfo(
        models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        max_model_len=16384,
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
            "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
        ),
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        image_size_factors=[(0.25, 0.5, 1.0)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
            "default_torch_num_threads": 1,
        },
        # FIXME: Investigate why the test hangs
        # when processing the 3rd prompt in vLLM
        marks=[pytest.mark.core_model, pytest.mark.skip(reason="Test hangs")],
    ),
    # Gemma3 has bidirectional mask on images
    "gemma3-transformers": VLMTestInfo(
        models=["google/gemma-3-4b-it"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda vid_prompt: f"<bos><start_of_turn>user\n{vid_prompt}<start_of_image><end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
        max_model_len=4096,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.gemma3_vllm_to_hf_output,
        image_size_factors=[(0.25, 0.5, 1.0)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
        },
        marks=[pytest.mark.core_model],
    ),
    "idefics3-transformers": VLMTestInfo(
        models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
        image_size_factors=[(0.25, 0.5, 1.0)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
        },
        marks=[pytest.mark.core_model],
    ),
    # Pixel values from processor are not 4D or 5D arrays
    "qwen2_5_vl-transformers": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(0.25, 0.2, 0.15)],
        vllm_runner_kwargs={
            "model_impl": "transformers",
        },
        marks=[large_gpu_mark(min_gb=32)],
    ),
    #### Extended model tests
    "aria": VLMTestInfo(
        models=["rhymes-ai/Aria"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<fim_prefix><|img|><fim_suffix>\n",
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "<vlm_image>Please describe the image shortly.",
                "cherry_blossom": "<vlm_image>Please infer the season with reason.",
            }
        ),
        multi_image_prompt="<vlm_image><vlm_image>Describe the two images shortly.",
        stop_str=["<|im_end|>"],
        image_size_factors=[(0.10, 0.15)],
        max_tokens=64,
        marks=[large_gpu_mark(min_gb=64)],
    ),
    "aya_vision": VLMTestInfo(
        models=["CohereForAI/aya-vision-8b"],
        test_type=(VLMTestType.IMAGE),
        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "<image>What's the content in the center of the image?",
                "cherry_blossom": "<image>What is the season?",
            }
        ),
        multi_image_prompt="<image><image>Describe the two images in detail.",
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
    ),
    "aya_vision-multi_image": VLMTestInfo(
        models=["CohereForAI/aya-vision-8b"],
        test_type=(VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|START_OF_TURN_TOKEN|><|USER_TOKEN|>{img_prompt}<|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|>",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "<image>What's the content in the center of the image?",
                "cherry_blossom": "<image>What is the season?",
            }
        ),
        multi_image_prompt="<image><image>Describe the two images in detail.",
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_runner_kwargs={"mm_processor_kwargs": {"crop_to_patches": True}},
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "blip2": VLMTestInfo(
        models=["Salesforce/blip2-opt-2.7b"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"Question: {img_prompt} Answer:",
        img_idx_to_prompt=lambda idx: "",
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.blip2_vllm_to_hf_output,
        # FIXME: https://github.com/huggingface/transformers/pull/38510
        marks=[pytest.mark.skip("Model is broken")],
    ),
    "chameleon": VLMTestInfo(
        models=["facebook/chameleon-7b"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        # For chameleon, we only compare the sequences
        vllm_output_post_proc=lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc=lambda hf_output, model: hf_output[:2],
        comparator=check_outputs_equal,
        max_tokens=8,
        dtype="bfloat16",
    ),
    "deepseek_vl_v2": VLMTestInfo(
        models=["Isotr0py/deepseek-vl2-tiny"],  # model repo using dynamic module
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|User|>: {img_prompt}\n\n<|Assistant|>:",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "<image>\nWhat's the content in the center of the image?",
                "cherry_blossom": "<image>\nPlease infer the season with reason in details.",  # noqa: E501
            }
        ),
        multi_image_prompt="image_1:<image>\nimage_2:<image>\nWhich image can we see the car and the tower?",  # noqa: E501
        patch_hf_runner=model_utils.deepseekvl2_patch_hf_runner,
        hf_output_post_proc=model_utils.deepseekvl2_trunc_hf_output,
        stop_str=["<｜end▁of▁sentence｜>", "<｜begin▁of▁sentence｜>"],
        image_size_factors=[(), (1.0,), (1.0, 1.0, 1.0), (0.1, 0.5, 1.0)],
    ),
    "fuyu": VLMTestInfo(
        models=["adept/fuyu-8b"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"{img_prompt}\n",
        img_idx_to_prompt=lambda idx: "",
        max_model_len=2048,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        use_tokenizer_eos=True,
        vllm_output_post_proc=model_utils.fuyu_vllm_to_hf_output,
        num_logprobs=10,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "gemma3": VLMTestInfo(
        models=["google/gemma-3-4b-it"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "<start_of_image>What's the content in the center of the image?",  # noqa: E501
                "cherry_blossom": "<start_of_image>What is the season?",
            }
        ),
        multi_image_prompt="<start_of_image><start_of_image>Describe the two images in detail.",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_runner_kwargs={"mm_processor_kwargs": {"do_pan_and_scan": True}},
        patch_hf_runner=model_utils.gemma3_patch_hf_runner,
        num_logprobs=10,
    ),
    "glm4v": VLMTestInfo(
        models=["zai-org/glm-4v-9b"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "<|begin_of_image|><|endoftext|><|end_of_image|>What's the content in the center of the image?",  # noqa: E501
                "cherry_blossom": "<|begin_of_image|><|endoftext|><|end_of_image|>What is the season?",  # noqa: E501
            }
        ),
        max_model_len=2048,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
        patch_hf_runner=model_utils.glm4v_patch_hf_runner,
        # The image embeddings match with HF but the outputs of the language
        # decoder are only consistent up to 2 decimal places.
        # So, we need to reduce the number of tokens for the test to pass.
        max_tokens=8,
        num_logprobs=10,
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "glm4_1v": VLMTestInfo(
        models=["zai-org/GLM-4.1V-9B-Thinking"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|assistant|>",
        img_idx_to_prompt=lambda idx: "<|begin_of_image|><|image|><|end_of_image|>",
        video_idx_to_prompt=lambda idx: "<|begin_of_video|><|video|><|end_of_video|>",
        max_model_len=2048,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [151329, 151336, 151338],
        num_logprobs=10,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        auto_cls=AutoModelForImageTextToText,
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "glm4_1v-video": VLMTestInfo(
        models=["zai-org/GLM-4.1V-9B-Thinking"],
        # GLM4.1V requires video metadata to be included in the input
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        patch_hf_runner=model_utils.glm4_1v_patch_hf_runner,
        custom_test_opts=[
            CustomTestOptions(
                inputs=custom_inputs.video_with_metadata_glm4_1v(),
                limit_mm_per_prompt={"video": 1},
            )
        ],
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "h2ovl": VLMTestInfo(
        models=[
            "h2oai/h2ovl-mississippi-800m",
            "h2oai/h2ovl-mississippi-2b",
        ],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|prompt|>{img_prompt}<|end|><|answer|>",
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "<image>\nWhat's the content in the center of the image?",
                "cherry_blossom": "<image>\nWhat is the season?",
            }
        ),
        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=8192,
        use_tokenizer_eos=True,
        num_logprobs=10,
        patch_hf_runner=model_utils.h2ovl_patch_hf_runner,
    ),
    "idefics3": VLMTestInfo(
        models=["HuggingFaceTB/SmolVLM-256M-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        hf_output_post_proc=model_utils.idefics3_trunc_hf_output,
    ),
    "intern_vl": VLMTestInfo(
        models=[
            "OpenGVLab/InternVL2-1B",
            "OpenGVLab/InternVL2-2B",
            # FIXME: Config cannot be loaded in transformers 4.52
            # "OpenGVLab/Mono-InternVL-2B",
        ],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "<image>\nWhat's the content in the center of the image?",
                "cherry_blossom": "<image>\nWhat is the season?",
            }
        ),
        multi_image_prompt="Image-1: <image>\nImage-2: <image>\nDescribe the two images in short.",  # noqa: E501
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
    ),
    "intern_vl-video": VLMTestInfo(
        models=[
            "OpenGVLab/InternVL3-1B",
        ],
        test_type=VLMTestType.VIDEO,
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
        video_idx_to_prompt=lambda idx: "<video>",
        max_model_len=8192,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
    ),
    "intern_vl-hf": VLMTestInfo(
        models=["OpenGVLab/InternVL3-1B-hf"],
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO,
        ),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<IMG_CONTEXT>",
        video_idx_to_prompt=lambda idx: "<video>",
        max_model_len=8192,
        use_tokenizer_eos=True,
        auto_cls=AutoModelForImageTextToText,
    ),
    "kimi_vl": VLMTestInfo(
        models=["moonshotai/Kimi-VL-A3B-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_user|>user<|im_middle|>{img_prompt}<|im_end|><|im_assistant|>assistant<|im_middle|>",  # noqa: E501
        img_idx_to_prompt=lambda _: "<|media_start|>image<|media_content|><|media_pad|><|media_end|>",  # noqa: E501
        max_model_len=8192,
        max_num_seqs=2,
        dtype="bfloat16",
        tensor_parallel_size=1,
        vllm_output_post_proc=model_utils.kimiv_vl_vllm_to_hf_output,
        marks=[large_gpu_mark(min_gb=48)],
    ),
    "llama4": VLMTestInfo(
        models=["meta-llama/Llama-4-Scout-17B-16E-Instruct"],
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|header_start|>user<|header_end|>\n\n{img_prompt}<|eot|><|header_start|>assistant<|header_end|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda _: "<|image|>",
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        distributed_executor_backend="mp",
        image_size_factors=[(0.25, 0.5, 1.0)],
        hf_model_kwargs={"device_map": "auto"},
        max_model_len=8192,
        max_num_seqs=4,
        dtype="bfloat16",
        auto_cls=AutoModelForImageTextToText,
        tensor_parallel_size=4,
        marks=multi_gpu_marks(num_gpus=4),
    ),
    "llava_next": VLMTestInfo(
        models=["llava-hf/llava-v1.6-mistral-7b-hf"],
        test_type=(VLMTestType.IMAGE, VLMTestType.CUSTOM_INPUTS),
        prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
        max_model_len=10240,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        custom_test_opts=[
            CustomTestOptions(
                inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                    formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]"
                ),
                limit_mm_per_prompt={"image": 4},
            )
        ],
    ),
    "llava_onevision": VLMTestInfo(
        models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
        test_type=VLMTestType.CUSTOM_INPUTS,
        prompt_formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        num_video_frames=16,
        max_model_len=16384,
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
            "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
        ),
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        custom_test_opts=[
            CustomTestOptions(
                inputs=custom_inputs.multi_video_multi_aspect_ratio_inputs(
                    formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
                ),
                limit_mm_per_prompt={"video": 4},
            )
        ],
    ),
    "llava_next_video": VLMTestInfo(
        models=["llava-hf/LLaVA-NeXT-Video-7B-hf"],
        test_type=VLMTestType.VIDEO,
        prompt_formatter=lambda vid_prompt: f"USER: {vid_prompt} ASSISTANT:",
        num_video_frames=16,
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_video_vllm_to_hf_output,
    ),
    "mantis": VLMTestInfo(
        models=["TIGER-Lab/Mantis-8B-siglip-llama3"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        max_model_len=4096,
        get_stop_token_ids=lambda tok: [128009],
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.mantis_vllm_to_hf_output,
        patch_hf_runner=model_utils.mantis_patch_hf_runner,
    ),
    "minicpmv_25": VLMTestInfo(
        models=["openbmb/MiniCPM-Llama3-V-2_5"],
        test_type=VLMTestType.IMAGE,
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: [tok.eos_id, tok.eot_id],
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
        patch_hf_runner=model_utils.minicpmv_25_patch_hf_runner,
        # FIXME: https://huggingface.co/openbmb/MiniCPM-V-2_6/discussions/55
        marks=[pytest.mark.skip("HF import fails")],
    ),
    "minicpmo_26": VLMTestInfo(
        models=["openbmb/MiniCPM-o-2_6"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
            ["<|im_end|>", "<|endoftext|>"]
        ),
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
        patch_hf_runner=model_utils.minicpmo_26_patch_hf_runner,
        # FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49
        marks=[pytest.mark.skip("HF import fails")],
    ),
    "minicpmv_26": VLMTestInfo(
        models=["openbmb/MiniCPM-V-2_6"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|begin_of_text|><|start_header_id|>user<|end_header_id|>\n\n{img_prompt}<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "(<image>./</image>)\n",
        max_model_len=4096,
        max_num_seqs=2,
        get_stop_token_ids=lambda tok: tok.convert_tokens_to_ids(
            ["<|im_end|>", "<|endoftext|>"]
        ),
        hf_output_post_proc=model_utils.minicpmv_trunc_hf_output,
        patch_hf_runner=model_utils.minicpmv_26_patch_hf_runner,
    ),
    "minimax_vl_01": VLMTestInfo(
        models=["MiniMaxAI/MiniMax-VL-01"],
        prompt_formatter=lambda img_prompt: f"<beginning_of_sentence>user: {img_prompt} assistant:<end_of_sentence>",  # noqa: E501
        img_idx_to_prompt=lambda _: "<image>",
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        max_model_len=8192,
        max_num_seqs=4,
        dtype="bfloat16",
        hf_output_post_proc=model_utils.minimax_vl_01_hf_output,
        patch_hf_runner=model_utils.minimax_vl_01_patch_hf_runner,
        auto_cls=AutoModelForImageTextToText,
        marks=[large_gpu_mark(min_gb=80)],
    ),
    "molmo": VLMTestInfo(
        models=["allenai/Molmo-7B-D-0924"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=identity,
        max_model_len=4096,
        max_num_seqs=2,
        patch_hf_runner=model_utils.molmo_patch_hf_runner,
    ),
    "ovis1_6-gemma2": VLMTestInfo(
        models=["AIDC-AI/Ovis1.6-Gemma2-9B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<bos><start_of_turn>user\n{img_prompt}<end_of_turn>\n<start_of_turn>model\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n",
        max_model_len=4096,
        max_num_seqs=2,
        dtype="half",
        # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
        hf_model_kwargs={"llm_attn_implementation": "sdpa"},
        patch_hf_runner=model_utils.ovis_patch_hf_runner,
        marks=[large_gpu_mark(min_gb=32)],
    ),
    "ovis2": VLMTestInfo(
        models=["AIDC-AI/Ovis2-1B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n",
        max_model_len=4096,
        max_num_seqs=2,
        dtype="half",
        # use sdpa mode for hf runner since ovis2 didn't work with flash_attn
        hf_model_kwargs={"llm_attn_implementation": "sdpa"},
        patch_hf_runner=model_utils.ovis_patch_hf_runner,
    ),
    "ovis2_5": VLMTestInfo(
        models=["AIDC-AI/Ovis2.5-2B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>\n",
        video_idx_to_prompt=lambda idx: "<video>\n",
        max_model_len=4096,
        max_num_seqs=2,
        dtype="half",
        num_logprobs=10,
        patch_hf_runner=model_utils.ovis2_5_patch_hf_runner,
        hf_model_kwargs={"revision": "refs/pr/5"},
    ),
    "paddleocr_vl": VLMTestInfo(
        models=["PaddlePaddle/PaddleOCR-VL"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        img_idx_to_prompt=lambda idx: (
            "<|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>"
        ),
        multi_image_prompt=(
            "Image-1: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n"
            "Image-2: <|IMAGE_START|><|IMAGE_PLACEHOLDER|><|IMAGE_END|>\n"
            "Describe these two images separately."
        ),
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForCausalLM,
        image_size_factors=[(), (0.25,)],
        marks=[
            pytest.mark.skipif(
                Version(TRANSFORMERS_VERSION) == Version("4.57.3"),
                reason="This model is broken in Transformers v4.57.3",
            )
        ],
    ),
    "phi3v": VLMTestInfo(
        models=["microsoft/Phi-3.5-vision-instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|user|>\n{img_prompt}<|end|>\n<|assistant|>\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: f"<|image_{idx}|>\n",
        max_model_len=4096,
        max_num_seqs=2,
        runner="generate",
        # use sdpa mode for hf runner since phi3v didn't work with flash_attn
        hf_model_kwargs={"_attn_implementation": "sdpa"},
        use_tokenizer_eos=True,
        vllm_output_post_proc=model_utils.phi3v_vllm_to_hf_output,
        num_logprobs=10,
    ),
    "pixtral_hf": VLMTestInfo(
        models=["nm-testing/pixtral-12b-FP8-dynamic"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<s>[INST] {img_prompt} [/INST]",
        img_idx_to_prompt=lambda idx: "[IMG]",
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        marks=[large_gpu_mark(min_gb=48)],
    ),
    "qwen_vl": VLMTestInfo(
        models=["Qwen/Qwen-VL"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=identity,
        img_idx_to_prompt=lambda idx: f"Picture {idx}: <img></img>\n",
        max_model_len=1024,
        max_num_seqs=2,
        vllm_output_post_proc=model_utils.qwen_vllm_to_hf_output,
        prompt_path_encoder=model_utils.qwen_prompt_path_encoder,
    ),
    "qwen2_vl": VLMTestInfo(
        models=["Qwen/Qwen2-VL-2B-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE, VLMTestType.VIDEO),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
        multi_image_prompt="Picture 1: <vlm_image>\nPicture 2: <vlm_image>\nDescribe these two images with one paragraph respectively.",  # noqa: E501
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[pytest.mark.cpu_model],
    ),
    "skywork_r1v": VLMTestInfo(
        models=["Skywork/Skywork-R1V-38B"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<｜begin▁of▁sentence｜><｜User｜>\n{img_prompt}<｜Assistant｜><think>\n",  # noqa: E501
        single_image_prompts=IMAGE_ASSETS.prompts(
            {
                "stop_sign": "<image>\nWhat's the content in the center of the image?",
                "cherry_blossom": "<image>\nWhat is the season?",
            }
        ),
        multi_image_prompt="<image>\n<image>\nDescribe the two images in short.",
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.skyworkr1v_patch_hf_runner,
        marks=[large_gpu_mark(min_gb=80)],
    ),
    "smolvlm": VLMTestInfo(
        models=["HuggingFaceTB/SmolVLM2-2.2B-Instruct"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"<|im_start|>User:{img_prompt}<end_of_utterance>\nAssistant:",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<image>",
        max_model_len=8192,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        hf_output_post_proc=model_utils.smolvlm_trunc_hf_output,
        num_logprobs=10,
    ),
    "tarsier": VLMTestInfo(
        models=["omni-research/Tarsier-7b"],
        test_type=(VLMTestType.IMAGE, VLMTestType.MULTI_IMAGE),
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt} ASSISTANT:",
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        patch_hf_runner=model_utils.tarsier_patch_hf_runner,
    ),
    "tarsier2": VLMTestInfo(
        models=["omni-research/Tarsier2-Recap-7b"],
        test_type=(
            VLMTestType.IMAGE,
            VLMTestType.MULTI_IMAGE,
            VLMTestType.VIDEO,
        ),
        prompt_formatter=lambda img_prompt: f"<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n{img_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
        img_idx_to_prompt=lambda idx: "<|vision_start|><|image_pad|><|vision_end|>",
        video_idx_to_prompt=lambda idx: "<|vision_start|><|video_pad|><|vision_end|>",
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        image_size_factors=[(), (0.25,), (0.25, 0.25, 0.25), (0.25, 0.2, 0.15)],
        marks=[pytest.mark.skip("Model initialization hangs")],
    ),
    ### Tensor parallel / multi-gpu broadcast tests
    "chameleon-broadcast": VLMTestInfo(
        models=["facebook/chameleon-7b"],
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=lambda vllm_output, model: vllm_output[:2],
        hf_output_post_proc=lambda hf_output, model: hf_output[:2],
        comparator=check_outputs_equal,
        marks=multi_gpu_marks(num_gpus=2),
        **COMMON_BROADCAST_SETTINGS,  # type: ignore
    ),
    "llava-broadcast": VLMTestInfo(
        models=["llava-hf/llava-1.5-7b-hf"],
        prompt_formatter=lambda img_prompt: f"USER: {img_prompt}\nASSISTANT:",
        max_model_len=4096,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        marks=multi_gpu_marks(num_gpus=2),
        **COMMON_BROADCAST_SETTINGS,  # type: ignore
    ),
    "llava_next-broadcast": VLMTestInfo(
        models=["llava-hf/llava-v1.6-mistral-7b-hf"],
        prompt_formatter=lambda img_prompt: f"[INST] {img_prompt} [/INST]",
        max_model_len=10240,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.llava_image_vllm_to_hf_output,
        marks=multi_gpu_marks(num_gpus=2),
        **COMMON_BROADCAST_SETTINGS,  # type: ignore
    ),
    ### Custom input edge-cases for specific models
    "intern_vl-diff-patches": VLMTestInfo(
        models=["OpenGVLab/InternVL2-2B"],
        prompt_formatter=lambda img_prompt: f"<|im_start|>User\n{img_prompt}<|im_end|>\n<|im_start|>Assistant\n",  # noqa: E501
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        use_tokenizer_eos=True,
        patch_hf_runner=model_utils.internvl_patch_hf_runner,
        custom_test_opts=[
            CustomTestOptions(
                inputs=inp,
                limit_mm_per_prompt={"image": 2},
            )
            for inp in custom_inputs.different_patch_input_cases_internvl()
        ],
    ),
    "llava_onevision-multiple-images": VLMTestInfo(
        models=["llava-hf/llava-onevision-qwen2-0.5b-ov-hf"],
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=16384,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        hf_model_kwargs=model_utils.llava_onevision_hf_model_kwargs(
            "llava-hf/llava-onevision-qwen2-0.5b-ov-hf"
        ),
        vllm_output_post_proc=model_utils.llava_onevision_vllm_to_hf_output,
        custom_test_opts=[
            CustomTestOptions(
                inputs=custom_inputs.multi_image_multi_aspect_ratio_inputs(
                    formatter=lambda vid_prompt: f"<|im_start|>user\n{vid_prompt}<|im_end|>\n<|im_start|>assistant\n",  # noqa: E501
                ),
                limit_mm_per_prompt={"image": 4},
            )
        ],
        marks=[
            pytest.mark.skipif(
                Version(TRANSFORMERS_VERSION) == Version("4.57.1"),
                reason="This model is broken in Transformers v4.57.1",
            )
        ],
    ),
    # regression test for https://github.com/vllm-project/vllm/issues/15122
    "qwen2_5_vl-windows-attention": VLMTestInfo(
        models=["Qwen/Qwen2.5-VL-3B-Instruct"],
        test_type=VLMTestType.CUSTOM_INPUTS,
        max_model_len=4096,
        max_num_seqs=2,
        auto_cls=AutoModelForImageTextToText,
        vllm_output_post_proc=model_utils.qwen2_vllm_to_hf_output,
        custom_test_opts=[
            CustomTestOptions(
                inputs=custom_inputs.windows_attention_image_qwen2_5_vl(),
                limit_mm_per_prompt={"image": 1},
            )
        ],
    ),
}
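
# Each entry above is expanded by get_parametrized_options(...) in the
# parametrized test wrappers at the bottom of this file into individual
# (model_type, test_case) combinations (see vlm_utils.case_filtering for the
# exact expansion logic).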


def _mark_splits(
    test_settings: dict[str, VLMTestInfo],
    *,
    num_groups: int,
) -> dict[str, VLMTestInfo]:
    """Tag each test setting with a ``pytest.mark.split(group=i)`` mark so that
    the models can be sharded into ``num_groups`` roughly equal groups.
    """
    name_by_test_info_id = {id(v): k for k, v in test_settings.items()}

    test_infos_by_model = defaultdict[str, list[VLMTestInfo]](list)
    for info in test_settings.values():
        for model in info.models:
            test_infos_by_model[model].append(info)

    models = sorted(test_infos_by_model.keys())
    split_size = math.ceil(len(models) / num_groups)

    new_test_settings = dict[str, VLMTestInfo]()
    for i in range(num_groups):
        models_in_group = models[i * split_size : (i + 1) * split_size]
        for model in models_in_group:
            for info in test_infos_by_model[model]:
                new_marks = (info.marks or []) + [pytest.mark.split(group=i)]
                new_info = info._replace(marks=new_marks)
                new_test_settings[name_by_test_info_id[id(info)]] = new_info

    missing_keys = test_settings.keys() - new_test_settings.keys()
    assert not missing_keys, f"Missing keys: {missing_keys}"
    return new_test_settings


VLM_TEST_SETTINGS = _mark_splits(VLM_TEST_SETTINGS, num_groups=2)
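
# Illustrative sketch of the split above (the model names here are
# hypothetical): with num_groups=2 and three models sorted as ["a", "b", "c"],
# split_size is math.ceil(3 / 2) == 2, so the test infos for "a" and "b" are
# tagged with pytest.mark.split(group=0) and those for "c" with
# pytest.mark.split(group=1), which lets a CI shard select a subset of models
# by filtering on that mark.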


### Test wrappers
# Wrappers around the core test running func for:
# - single image
# - multi-image
# - image embeddings
# - video
# - audio
# - custom inputs
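#
# Each wrapper below also has a "*_heavy" twin further down in this file; the
# heavy variants collect the cases registered with
# create_new_process_for_each_test=True and (for most test types) run under
# the @create_new_process_for_each_test() decorator so that each case gets a
# fresh process.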

@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
        create_new_process_for_each_test=False,
    ),
)
def test_single_image_models(
    tmp_path: PosixPath,
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_single_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
        create_new_process_for_each_test=False,
    ),
)
def test_multi_image_models(
    tmp_path: PosixPath,
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_multi_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
        create_new_process_for_each_test=False,
    ),
)
def test_image_embedding_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_embedding_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
        create_new_process_for_each_test=False,
    ),
)
def test_video_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    video_assets: VideoTestAssets,
):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_video_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        video_assets=video_assets,
    )


@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.AUDIO,
        create_new_process_for_each_test=False,
    ),
)
def test_audio_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_audio_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        audio_assets=audio_assets,
    )


@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
        create_new_process_for_each_test=False,
    ),
)
def test_custom_inputs_models(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_custom_inputs_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
    )


#### Tests filtering for things running each test as a new process
@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.IMAGE,
        create_new_process_for_each_test=True,
    ),
)
@create_new_process_for_each_test()
def test_single_image_models_heavy(
    tmp_path: PosixPath,
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_single_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.MULTI_IMAGE,
        create_new_process_for_each_test=True,
    ),
)
@create_new_process_for_each_test()
def test_multi_image_models_heavy(
    tmp_path: PosixPath,
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_multi_image_test(
        tmp_path=tmp_path,
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.EMBEDDING,
        create_new_process_for_each_test=True,
    ),
)
@create_new_process_for_each_test()
def test_image_embedding_models_heavy(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    image_assets: ImageTestAssets,
):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_embedding_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        image_assets=image_assets,
    )


@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.VIDEO,
        create_new_process_for_each_test=True,
    ),
)
def test_video_models_heavy(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    video_assets: VideoTestAssets,
):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_video_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        video_assets=video_assets,
    )


@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.AUDIO,
        create_new_process_for_each_test=True,
    ),
)
def test_audio_models_heavy(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
    audio_assets: AudioTestAssets,
):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_audio_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
        audio_assets=audio_assets,
    )


@pytest.mark.parametrize(
    "model_type,test_case",
    get_parametrized_options(
        VLM_TEST_SETTINGS,
        test_type=VLMTestType.CUSTOM_INPUTS,
        create_new_process_for_each_test=True,
    ),
)
@create_new_process_for_each_test()
def test_custom_inputs_models_heavy(
    model_type: str,
    test_case: ExpandableVLMTestArgs,
    hf_runner: type[HfRunner],
    vllm_runner: type[VllmRunner],
):
    model_test_info = VLM_TEST_SETTINGS[model_type]
    runners.run_custom_inputs_test(
        model_test_info=model_test_info,
        test_case=test_case,
        hf_runner=hf_runner,
        vllm_runner=vllm_runner,
    )