@@ -130,14 +130,14 @@ VLM_TEST_SETTINGS = {
dtype = " bfloat16 " ,
marks = [
pytest . mark . skip ( reason = " vLLM does not support PrefixLM attention mask " )
] , # noqa: E501
] ,
) ,
" qwen2_5_vl " : VLMTestInfo (
models = [ " Qwen/Qwen2.5-VL-3B-Instruct " ] ,
test_type = ( VLMTestType . IMAGE , VLMTestType . MULTI_IMAGE , VLMTestType . VIDEO ) ,
prompt_formatter = lambda img_prompt : f " <|im_start|>User \n { img_prompt } <|im_end|> \n <|im_start|>assistant \n " , # noqa: E501
img_idx_to_prompt = lambda idx : " <|vision_start|><|image_pad|><|vision_end|> " , # noqa: E501
video_idx_to_prompt = lambda idx : " <|vision_start|><|video_pad|><|vision_end|> " , # noqa: E501
img_idx_to_prompt = lambda idx : " <|vision_start|><|image_pad|><|vision_end|> " ,
video_idx_to_prompt = lambda idx : " <|vision_start|><|video_pad|><|vision_end|> " ,
max_model_len = 4096 ,
max_num_seqs = 2 ,
auto_cls = AutoModelForImageTextToText ,
@@ -149,8 +149,8 @@ VLM_TEST_SETTINGS = {
models = [ " Qwen/Qwen2.5-Omni-3B " ] ,
test_type = ( VLMTestType . IMAGE , VLMTestType . MULTI_IMAGE , VLMTestType . VIDEO ) ,
prompt_formatter = lambda img_prompt : f " <|im_start|>User \n { img_prompt } <|im_end|> \n <|im_start|>assistant \n " , # noqa: E501
img_idx_to_prompt = lambda idx : " <|vision_bos|><|IMAGE|><|vision_eos|> " , # noqa: E501
video_idx_to_prompt = lambda idx : " <|vision_bos|><|VIDEO|><|vision_eos|> " , # noqa: E501
img_idx_to_prompt = lambda idx : " <|vision_bos|><|IMAGE|><|vision_eos|> " ,
video_idx_to_prompt = lambda idx : " <|vision_bos|><|VIDEO|><|vision_eos|> " ,
max_model_len = 4096 ,
max_num_seqs = 2 ,
num_logprobs = 6 if current_platform . is_cpu ( ) else 5 ,
@@ -181,7 +181,7 @@ VLM_TEST_SETTINGS = {
max_model_len = 16384 ,
hf_model_kwargs = model_utils . llava_onevision_hf_model_kwargs (
" llava-hf/llava-onevision-qwen2-0.5b-ov-hf "
) , # noqa: E501
) ,
auto_cls = AutoModelForImageTextToText ,
vllm_output_post_proc = model_utils . llava_onevision_vllm_to_hf_output ,
image_size_factors = [ ( 0.25 , 0.5 , 1.0 ) ] ,
@@ -213,7 +213,7 @@ VLM_TEST_SETTINGS = {
models = [ " Qwen/Qwen2.5-VL-3B-Instruct " ] ,
test_type = VLMTestType . IMAGE ,
prompt_formatter = lambda img_prompt : f " <|im_start|>User \n { img_prompt } <|im_end|> \n <|im_start|>assistant \n " , # noqa: E501
img_idx_to_prompt = lambda idx : " <|vision_start|><|image_pad|><|vision_end|> " , # noqa: E501
img_idx_to_prompt = lambda idx : " <|vision_start|><|image_pad|><|vision_end|> " ,
max_model_len = 4096 ,
max_num_seqs = 2 ,
auto_cls = AutoModelForImageTextToText ,
@@ -237,10 +237,10 @@ VLM_TEST_SETTINGS = {
single_image_prompts = IMAGE_ASSETS . prompts (
{
" stop_sign " : " <vlm_image>Please describe the image shortly. " ,
" cherry_blossom " : " <vlm_image>Please infer the season with reason. " , # noqa: E501
" cherry_blossom " : " <vlm_image>Please infer the season with reason. " ,
}
) ,
multi_image_prompt = " <vlm_image><vlm_image>Describe the two images shortly. " , # noqa: E501
multi_image_prompt = " <vlm_image><vlm_image>Describe the two images shortly. " ,
stop_str = [ " <|im_end|> " ] ,
image_size_factors = [ ( 0.10 , 0.15 ) ] ,
max_tokens = 64 ,
@@ -252,11 +252,11 @@ VLM_TEST_SETTINGS = {
prompt_formatter = lambda img_prompt : f " <|START_OF_TURN_TOKEN|><|USER_TOKEN|> { img_prompt } <|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> " , # noqa: E501
single_image_prompts = IMAGE_ASSETS . prompts (
{
" stop_sign " : " <image>What ' s the content in the center of the image? " , # noqa: E501
" cherry_blossom " : " <image>What is the season? " , # noqa: E501
" stop_sign " : " <image>What ' s the content in the center of the image? " ,
" cherry_blossom " : " <image>What is the season? " ,
}
) ,
multi_image_prompt = " <image><image>Describe the two images in detail. " , # noqa: E501
multi_image_prompt = " <image><image>Describe the two images in detail. " ,
max_model_len = 4096 ,
max_num_seqs = 2 ,
auto_cls = AutoModelForImageTextToText ,
@@ -268,11 +268,11 @@ VLM_TEST_SETTINGS = {
prompt_formatter = lambda img_prompt : f " <|START_OF_TURN_TOKEN|><|USER_TOKEN|> { img_prompt } <|END_OF_TURN_TOKEN|><|START_OF_TURN_TOKEN|><|CHATBOT_TOKEN|> " , # noqa: E501
single_image_prompts = IMAGE_ASSETS . prompts (
{
" stop_sign " : " <image>What ' s the content in the center of the image? " , # noqa: E501
" cherry_blossom " : " <image>What is the season? " , # noqa: E501
" stop_sign " : " <image>What ' s the content in the center of the image? " ,
" cherry_blossom " : " <image>What is the season? " ,
}
) ,
multi_image_prompt = " <image><image>Describe the two images in detail. " , # noqa: E501
multi_image_prompt = " <image><image>Describe the two images in detail. " ,
max_model_len = 4096 ,
max_num_seqs = 2 ,
auto_cls = AutoModelForImageTextToText ,
@@ -311,14 +311,14 @@ VLM_TEST_SETTINGS = {
max_num_seqs = 2 ,
single_image_prompts = IMAGE_ASSETS . prompts (
{
" stop_sign " : " <image> \n What ' s the content in the center of the image? " , # noqa: E501
" stop_sign " : " <image> \n What ' s the content in the center of the image? " ,
" cherry_blossom " : " <image> \n Please infer the season with reason in details. " , # noqa: E501
}
) ,
multi_image_prompt = " image_1:<image> \n image_2:<image> \n Which image can we see the car and the tower? " , # noqa: E501
patch_hf_runner = model_utils . deepseekvl2_patch_hf_runner ,
hf_output_post_proc = model_utils . deepseekvl2_trunc_hf_output ,
stop_str = [ " <| end▁of▁sentence| > " , " <| begin▁of▁sentence| > " ] , # noqa: E501
stop_str = [ " <| end▁of▁sentence| > " , " <| begin▁of▁sentence| > " ] ,
image_size_factors = [ ( ) , ( 1.0 , ) , ( 1.0 , 1.0 , 1.0 ) , ( 0.1 , 0.5 , 1.0 ) ] ,
) ,
" fuyu " : VLMTestInfo (
@@ -342,7 +342,7 @@ VLM_TEST_SETTINGS = {
single_image_prompts = IMAGE_ASSETS . prompts (
{
" stop_sign " : " <start_of_image>What ' s the content in the center of the image? " , # noqa: E501
" cherry_blossom " : " <start_of_image>What is the season? " , # noqa: E501
" cherry_blossom " : " <start_of_image>What is the season? " ,
}
) ,
multi_image_prompt = " <start_of_image><start_of_image>Describe the two images in detail. " , # noqa: E501
@@ -356,7 +356,7 @@ VLM_TEST_SETTINGS = {
" glm4v " : VLMTestInfo (
models = [ " zai-org/glm-4v-9b " ] ,
test_type = VLMTestType . IMAGE ,
prompt_formatter = lambda img_prompt : f " <|user|> \n { img_prompt } <|assistant|> " , # noqa: E501
prompt_formatter = lambda img_prompt : f " <|user|> \n { img_prompt } <|assistant|> " ,
single_image_prompts = IMAGE_ASSETS . prompts (
{
" stop_sign " : " <|begin_of_image|><|endoftext|><|end_of_image|>What ' s the content in the center of the image? " , # noqa: E501
@@ -377,9 +377,9 @@ VLM_TEST_SETTINGS = {
" glm4_1v " : VLMTestInfo (
models = [ " zai-org/GLM-4.1V-9B-Thinking " ] ,
test_type = ( VLMTestType . IMAGE , VLMTestType . MULTI_IMAGE ) ,
prompt_formatter = lambda img_prompt : f " <|user|> \n { img_prompt } <|assistant|> " , # noqa: E501
img_idx_to_prompt = lambda idx : " <|begin_of_image|><|image|><|end_of_image|> " , # noqa: E501
video_idx_to_prompt = lambda idx : " <|begin_of_video|><|video|><|end_of_video|> " , # noqa: E501
prompt_formatter = lambda img_prompt : f " <|user|> \n { img_prompt } <|assistant|> " ,
img_idx_to_prompt = lambda idx : " <|begin_of_image|><|image|><|end_of_image|> " ,
video_idx_to_prompt = lambda idx : " <|begin_of_video|><|video|><|end_of_video|> " ,
max_model_len = 2048 ,
max_num_seqs = 2 ,
get_stop_token_ids = lambda tok : [ 151329 , 151336 , 151338 ] ,
@@ -410,10 +410,10 @@ VLM_TEST_SETTINGS = {
" h2oai/h2ovl-mississippi-2b " ,
] ,
test_type = ( VLMTestType . IMAGE , VLMTestType . MULTI_IMAGE ) ,
prompt_formatter = lambda img_prompt : f " <|prompt|> { img_prompt } <|end|><|answer|> " , # noqa: E501
prompt_formatter = lambda img_prompt : f " <|prompt|> { img_prompt } <|end|><|answer|> " ,
single_image_prompts = IMAGE_ASSETS . prompts (
{
" stop_sign " : " <image> \n What ' s the content in the center of the image? " , # noqa: E501
" stop_sign " : " <image> \n What ' s the content in the center of the image? " ,
" cherry_blossom " : " <image> \n What is the season? " ,
}
) ,
@@ -444,7 +444,7 @@ VLM_TEST_SETTINGS = {
prompt_formatter = lambda img_prompt : f " <|im_start|>User \n { img_prompt } <|im_end|> \n <|im_start|>Assistant \n " , # noqa: E501
single_image_prompts = IMAGE_ASSETS . prompts (
{
" stop_sign " : " <image> \n What ' s the content in the center of the image? " , # noqa: E501
" stop_sign " : " <image> \n What ' s the content in the center of the image? " ,
" cherry_blossom " : " <image> \n What is the season? " ,
}
) ,
@@ -529,7 +529,7 @@ VLM_TEST_SETTINGS = {
max_model_len = 16384 ,
hf_model_kwargs = model_utils . llava_onevision_hf_model_kwargs (
" llava-hf/llava-onevision-qwen2-0.5b-ov-hf "
) , # noqa: E501
) ,
auto_cls = AutoModelForImageTextToText ,
vllm_output_post_proc = model_utils . llava_onevision_vllm_to_hf_output ,
custom_test_opts = [
@@ -583,7 +583,7 @@ VLM_TEST_SETTINGS = {
max_num_seqs = 2 ,
get_stop_token_ids = lambda tok : tok . convert_tokens_to_ids (
[ " <|im_end|> " , " <|endoftext|> " ]
) , # noqa: E501
) ,
hf_output_post_proc = model_utils . minicpmv_trunc_hf_output ,
patch_hf_runner = model_utils . minicpmo_26_patch_hf_runner ,
# FIXME: https://huggingface.co/openbmb/MiniCPM-o-2_6/discussions/49
@@ -598,7 +598,7 @@ VLM_TEST_SETTINGS = {
max_num_seqs = 2 ,
get_stop_token_ids = lambda tok : tok . convert_tokens_to_ids (
[ " <|im_end|> " , " <|endoftext|> " ]
) , # noqa: E501
) ,
hf_output_post_proc = model_utils . minicpmv_trunc_hf_output ,
patch_hf_runner = model_utils . minicpmv_26_patch_hf_runner ,
) ,
@@ -627,7 +627,7 @@ VLM_TEST_SETTINGS = {
models = [ " AIDC-AI/Ovis1.6-Gemma2-9B " ] ,
test_type = ( VLMTestType . IMAGE , VLMTestType . MULTI_IMAGE ) ,
prompt_formatter = lambda img_prompt : f " <bos><start_of_turn>user \n { img_prompt } <end_of_turn> \n <start_of_turn>model \n " , # noqa: E501
img_idx_to_prompt = lambda idx : " <image> \n " , # noqa: E501
img_idx_to_prompt = lambda idx : " <image> \n " ,
max_model_len = 4096 ,
max_num_seqs = 2 ,
dtype = " half " ,
@@ -640,7 +640,7 @@ VLM_TEST_SETTINGS = {
models = [ " AIDC-AI/Ovis2-1B " ] ,
test_type = ( VLMTestType . IMAGE , VLMTestType . MULTI_IMAGE ) ,
prompt_formatter = lambda img_prompt : f " <|im_start|>system \n You are a helpful assistant.<|im_end|> \n <|im_start|>user \n { img_prompt } <|im_end|> \n <|im_start|>assistant \n " , # noqa: E501
img_idx_to_prompt = lambda idx : " <image> \n " , # noqa: E501
img_idx_to_prompt = lambda idx : " <image> \n " ,
max_model_len = 4096 ,
max_num_seqs = 2 ,
dtype = " half " ,
@@ -652,7 +652,7 @@ VLM_TEST_SETTINGS = {
models = [ " AIDC-AI/Ovis2.5-2B " ] ,
test_type = ( VLMTestType . IMAGE , VLMTestType . MULTI_IMAGE , VLMTestType . VIDEO ) ,
prompt_formatter = lambda img_prompt : f " <|im_start|>system \n You are a helpful assistant.<|im_end|> \n <|im_start|>user \n { img_prompt } <|im_end|> \n <|im_start|>assistant \n " , # noqa: E501
img_idx_to_prompt = lambda idx : " <image> \n " , # noqa: E501
img_idx_to_prompt = lambda idx : " <image> \n " ,
video_idx_to_prompt = lambda idx : " <video> \n " ,
max_model_len = 4096 ,
max_num_seqs = 2 ,
@@ -701,8 +701,8 @@ VLM_TEST_SETTINGS = {
models = [ " Qwen/Qwen2-VL-2B-Instruct " ] ,
test_type = ( VLMTestType . IMAGE , VLMTestType . MULTI_IMAGE , VLMTestType . VIDEO ) ,
prompt_formatter = lambda img_prompt : f " <|im_start|>User \n { img_prompt } <|im_end|> \n <|im_start|>assistant \n " , # noqa: E501
img_idx_to_prompt = lambda idx : " <|vision_start|><|image_pad|><|vision_end|> " , # noqa: E501
video_idx_to_prompt = lambda idx : " <|vision_start|><|video_pad|><|vision_end|> " , # noqa: E501
img_idx_to_prompt = lambda idx : " <|vision_start|><|image_pad|><|vision_end|> " ,
video_idx_to_prompt = lambda idx : " <|vision_start|><|video_pad|><|vision_end|> " ,
multi_image_prompt = " Picture 1: <vlm_image> \n Picture 2: <vlm_image> \n Describe these two images with one paragraph respectively. " , # noqa: E501
max_model_len = 4096 ,
max_num_seqs = 2 ,
@@ -717,11 +717,11 @@ VLM_TEST_SETTINGS = {
prompt_formatter = lambda img_prompt : f " <| begin▁of▁sentence| ><| User| > \n { img_prompt } <| Assistant| ><think> \n " , # noqa: E501
single_image_prompts = IMAGE_ASSETS . prompts (
{
" stop_sign " : " <image> \n What ' s the content in the center of the image? " , # noqa: E501
" stop_sign " : " <image> \n What ' s the content in the center of the image? " ,
" cherry_blossom " : " <image> \n What is the season? " ,
}
) ,
multi_image_prompt = " <image> \n <image> \n Describe the two images in short. " , # noqa: E501
multi_image_prompt = " <image> \n <image> \n Describe the two images in short. " ,
max_model_len = 4096 ,
use_tokenizer_eos = True ,
patch_hf_runner = model_utils . skyworkr1v_patch_hf_runner ,
@@ -754,8 +754,8 @@ VLM_TEST_SETTINGS = {
VLMTestType . VIDEO ,
) ,
prompt_formatter = lambda img_prompt : f " <|im_start|>system \n You are a helpful assistant.<|im_end|> \n <|im_start|>user \n { img_prompt } <|im_end|> \n <|im_start|>assistant \n " , # noqa: E501
img_idx_to_prompt = lambda idx : " <|vision_start|><|image_pad|><|vision_end|> " , # noqa: E501
video_idx_to_prompt = lambda idx : " <|vision_start|><|video_pad|><|vision_end|> " , # noqa: E501
img_idx_to_prompt = lambda idx : " <|vision_start|><|image_pad|><|vision_end|> " ,
video_idx_to_prompt = lambda idx : " <|vision_start|><|video_pad|><|vision_end|> " ,
max_model_len = 4096 ,
max_num_seqs = 2 ,
auto_cls = AutoModelForImageTextToText ,
@@ -816,7 +816,7 @@ VLM_TEST_SETTINGS = {
auto_cls = AutoModelForImageTextToText ,
hf_model_kwargs = model_utils . llava_onevision_hf_model_kwargs (
" llava-hf/llava-onevision-qwen2-0.5b-ov-hf "
) , # noqa: E501
) ,
vllm_output_post_proc = model_utils . llava_onevision_vllm_to_hf_output ,
custom_test_opts = [
CustomTestOptions (