# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Consolidated test for ViT attention backend functionality across multiple models.
This test validates that each multimodal model can successfully generate outputs
using different ViT attention backends. Tests are parametrized by model and backend.
"""

from typing import Any

import pytest
from transformers import AutoProcessor

from vllm import LLM, SamplingParams
from vllm.multimodal.utils import encode_image_url
from vllm.multimodal.video import sample_frames_from_video
from vllm.platforms import current_platform
from vllm.v1.attention.backends.registry import AttentionBackendEnum

from ....utils import create_new_process_for_each_test
from ...utils import dummy_hf_overrides

# Dots.OCR prompt from official repository
# https://github.com/rednote-hilab/dots.ocr/blob/d72d1d8c5bdd0362eb264f714cdbd1e5daa7cdff/dots_ocr/utils/prompts.py#L3
# ruff: noqa: E501
# Layout-extraction instruction prompt taken from the official Dots.OCR
# repository (see link above); the model is asked for a single JSON object
# describing every layout element of the page image.
DOTS_OCR_PROMPT = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
1. Bbox format: [x1, y1, x2, y2]
2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
3. Text Extraction & Formatting Rules:
    - Picture: For the 'Picture' category, the text field should be omitted.
    - Formula: Format its text as LaTeX.
    - Table: Format its text as HTML.
    - All Others (Text, Title, etc.): Format their text as Markdown.
4. Constraints:
    - The output text must be the original text from the image, with no translation.
    - All layout elements must be sorted according to human reading order.
5. Final Output: The entire output must be a single JSON object.
"""

# Qwen-style video placeholder inserted into video prompts.
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
# Model configurations.
#
# Each entry describes how one multimodal model is exercised:
#   interface          - which handler drives generation ("llm_generate",
#                        "llm_chat", or "vllm_runner")
#   sampling_params    - kwargs for SamplingParams
#   use_processor      - build the prompt via AutoProcessor.apply_chat_template
#   prompt_builder     - name of a module-level prompt builder function
#   supported_backends - restrict the test to these ViT attention backends
#   output_validator   - predicate applied to the generated text
MODEL_CONFIGS: dict[str, dict[str, Any]] = {
    "dots_ocr": {
        "model_name": "rednote-hilab/dots.ocr",
        "interface": "llm_chat",
        "max_model_len": 32768,
        "max_num_seqs": 1,
        "limit_mm_per_prompt": {"image": 1},
        "sampling_params": {
            "temperature": 0.1,
            "max_tokens": 16384,
            "top_p": 0.9,
            "stop_token_ids": None,
        },
        "use_specific_image": "stop_sign",
        "prompt_builder": "build_dots_ocr_prompt",
        # OCR of the stop-sign asset should mention the word "stop".
        "output_validator": lambda x: len(x) > 10 and "stop" in x.lower(),
    },
    "ernie45_vl": {
        "model_name": "baidu/ERNIE-4.5-VL-28B-A3B-PT",
        "interface": "llm_generate",
        "max_model_len": 16384,
        "max_num_seqs": 2,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
    "glm4_1v": {
        "model_name": "zai-org/GLM-4.1V-9B-Thinking",
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
    "glm_ocr": {
        "model_name": "zai-org/GLM-OCR",
        "interface": "llm_generate",
        "max_model_len": 131072,
        "max_num_seqs": 2,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        "use_processor": True,
        "question": "Text Recognition:",
    },
    "keye_vl": {
        "model_name": "Kwai-Keye/Keye-VL-8B-Preview",
        "interface": "llm_generate",
        "max_model_len": 8192,
        "max_num_seqs": 5,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        # Keye-VL only works with flash-attention style ViT backends.
        "supported_backends": {
            AttentionBackendEnum.FLASH_ATTN,
            AttentionBackendEnum.ROCM_AITER_FA,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
    "ovis2_5": {
        "model_name": "AIDC-AI/Ovis2.5-2B",
        "interface": "llm_generate",
        "max_model_len": 8192,
        "max_num_seqs": 2,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        "prompt_builder": "build_ovis_prompt",
        "question": "What is the content of each image?",
    },
    "qwen2_5_vl": {
        "model_name": "Qwen/Qwen2.5-VL-3B-Instruct",
        "interface": "vllm_runner",
        "media_type": "video",
        "max_model_len": 4000,
        "max_num_seqs": 1,
        "limit_mm_per_prompt": {"video": 1},
        "sampling_params": {
            "max_tokens": 128,
        },
        "runner_kwargs": {
            "runner": "generate",
            "dtype": "bfloat16",
        },
        # EVS (Efficient Video Sampling) settings: run once without pruning
        # and once with 75% of video tokens pruned.
        "video_params": {
            "num_frames": 16,
            "pruning_rates": [0.0, 0.75],
        },
    },
    "qwen2_5_omni": {
        "model_name": "Qwen/Qwen2.5-Omni-3B",
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,
        "limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3},
        "sampling_params": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "max_tokens": 16384,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
    "qwen3_omni": {
        "model_name": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,
        "limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3},
        "sampling_params": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "max_tokens": 16384,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
}
# Prompt builder functions
def build_dots_ocr_prompt(images, config):
    """Build the Dots.OCR chat messages carrying the official OCR instructions.

    Only the first image is used (the caller filters to the stop_sign asset);
    it is embedded as a data URL alongside the layout-extraction prompt.
    """
    url = encode_image_url(images[0])  # already filtered to stop_sign
    return [
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": url}},
                {
                    "type": "text",
                    # Dots.OCR expects its image tokens directly before the
                    # instruction text.
                    "text": f"<|img|><|imgpad|><|endofimg|>{DOTS_OCR_PROMPT}",
                },
            ],
        },
    ]
def build_processor_prompt(images, config):
    """Build a prompt string using AutoProcessor.apply_chat_template().

    Each image becomes an "image" content part (as a data URL) followed by the
    model's configured question; the processor renders the chat template
    without tokenizing and with the generation prompt appended.
    """
    processor = AutoProcessor.from_pretrained(
        config["model_name"], trust_remote_code=True
    )
    content = [
        {"type": "image", "image": encode_image_url(img)} for img in images
    ]
    content.append({"type": "text", "text": config["question"]})
    return processor.apply_chat_template(
        [{"role": "user", "content": content}],
        tokenize=False,
        add_generation_prompt=True,
    )
def build_ovis_prompt(images, config):
    """Build the Ovis2.5-specific prompt with its custom chat format.

    Only the *number* of images matters here: each image is represented by an
    indexed ``<image>`` placeholder. The previous implementation base64-encoded
    every image via ``encode_image_url`` and discarded the result; that dead
    (and potentially expensive) work is removed.
    """
    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i in range(1, len(images) + 1)
    )
    return (
        f"<|im_start|>user\n\n{placeholders}\n{config['question']}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
def build_qwen2_5_video_prompt():
    """Build the Qwen2.5-VL video prompt containing the EVS video placeholder."""
    system_part = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    user_part = (
        f"<|im_start|>user\n{VIDEO_PLACEHOLDER}"
        "Describe this video with a short sentence (no more than 20 words)"
        "<|im_end|><|im_start|>assistant\n"
    )
    return system_part + user_part
# Handler functions
def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
    """Exercise one model through the standard LLM.generate() interface.

    Builds a prompt (via the HF processor chat template or a model-specific
    builder), spins up an engine with dummy weights and the requested ViT
    attention backend, generates once over all image assets, and validates
    the output text.
    """
    images = [asset.pil_image for asset in image_assets]

    # Prompt construction: either the processor's chat template, or a
    # module-level builder function looked up by name.
    if config.get("use_processor"):
        prompt = build_processor_prompt(images, config)
    else:
        builder = globals()[config.get("prompt_builder", "build_ovis_prompt")]
        prompt = builder(images, config)

    # Engine with dummy weights — we only care that generation runs.
    llm = LLM(
        model=config["model_name"],
        trust_remote_code=True,
        max_model_len=config["max_model_len"],
        max_num_seqs=config["max_num_seqs"],
        limit_mm_per_prompt=config.get(
            "limit_mm_per_prompt", {"image": len(images)}
        ),
        mm_encoder_attn_backend=mm_encoder_attn_backend,
        hf_overrides=dummy_hf_overrides,
        load_format="dummy",
        seed=42,
    )

    outputs = llm.generate(
        {
            "prompt": prompt,
            "multi_modal_data": {"image": images},
        },
        sampling_params=SamplingParams(**config["sampling_params"]),
    )

    # Validate every completion with the model's validator (default: any
    # non-trivial text).
    for output in outputs:
        text = output.outputs[0].text
        validator = config.get("output_validator", lambda x: len(x) > 10)
        assert validator(text), (
            f"Validation failed for {config['model_name']}: {text}"
        )
def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
    """Exercise one model through the LLM.chat() interface (used by Dots.OCR).

    Uses only the stop_sign image asset, builds the Dots.OCR chat messages,
    runs a dummy-weight engine with the requested ViT attention backend, and
    validates the generated text.
    """
    # Dots.OCR is validated against the stop_sign asset specifically.
    stop_sign_image = [
        asset.pil_image for asset in image_assets if asset.name == "stop_sign"
    ][0]

    messages = build_dots_ocr_prompt([stop_sign_image], config)

    llm = LLM(
        model=config["model_name"],
        trust_remote_code=True,
        max_model_len=config["max_model_len"],
        max_num_seqs=config["max_num_seqs"],
        limit_mm_per_prompt=config["limit_mm_per_prompt"],
        mm_encoder_attn_backend=mm_encoder_attn_backend,
        hf_overrides=dummy_hf_overrides,
        load_format="dummy",
        seed=42,
    )

    outputs = llm.chat(
        messages=messages,
        sampling_params=SamplingParams(**config["sampling_params"]),
    )

    for output in outputs:
        text = output.outputs[0].text
        validator = config.get("output_validator", lambda x: len(x) > 10)
        assert validator(text), (
            f"Validation failed for {config['model_name']}: {text}"
        )
def run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner):
    """Video test with EVS (Efficient Video Sampling) handler.

    Runs the model once per configured pruning rate and checks that a single
    non-empty text output is produced each time.

    Frame sampling, the prompt, and the video list do not depend on the
    pruning rate, so they are computed once up front instead of being rebuilt
    on every loop iteration (the original re-sampled the frames per rate).
    """
    video_params = config["video_params"]
    num_frames = video_params["num_frames"]

    # Loop-invariant setup: sample frames and build the prompt once.
    sampled_vids = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]
    prompts = [build_qwen2_5_video_prompt()]
    videos = [sampled_vids[0]]  # single video per prompt

    for pruning_rate in video_params["pruning_rates"]:
        # Fresh engine per pruning rate via the vllm_runner context manager.
        with vllm_runner(
            config["model_name"],
            max_model_len=config["max_model_len"],
            max_num_seqs=config["max_num_seqs"],
            limit_mm_per_prompt=config["limit_mm_per_prompt"],
            tensor_parallel_size=1,
            video_pruning_rate=pruning_rate,
            mm_encoder_attn_backend=mm_encoder_attn_backend,
            hf_overrides=dummy_hf_overrides,
            load_format="dummy",
            **config["runner_kwargs"],
        ) as vllm_model:
            outputs = vllm_model.generate_greedy(
                prompts,
                config["sampling_params"]["max_tokens"],
                videos=videos,
            )

        # Validate output: exactly one non-empty string completion.
        assert len(outputs) == 1, f"Expected 1 output, got {len(outputs)}"
        output_ids, output_text = outputs[0]
        assert len(output_ids) > 0, "Generated no output IDs"
        assert len(output_text) > 0, "Generated empty text"
        assert isinstance(output_text, str), (
            f"Output is not string: {type(output_text)}"
        )
# Main test function
@pytest.mark.parametrize("model_key", list(MODEL_CONFIGS.keys()))
@pytest.mark.parametrize(
    "mm_encoder_attn_backend",
    [None] + current_platform.get_supported_vit_attn_backends(),
)
@pytest.mark.skip(reason="Broken test due to memory segmentation fault")
@create_new_process_for_each_test()
def test_vit_backend_functionality(
    model_key: str,
    mm_encoder_attn_backend: AttentionBackendEnum | None,
    image_assets,
    video_assets,
    vllm_runner,
    request,
):
    """Test ViT attention backend functionality for multimodal models.

    This test validates that each model can successfully generate outputs
    using different ViT attention backends. The test:
    1. Filters unsupported backends per model
    2. Applies appropriate GPU marks
    3. Routes to the correct test handler based on interface
    4. Validates output meets minimum requirements
    """
    config = MODEL_CONFIGS[model_key]

    # Step 1: skip backends the model explicitly does not support.
    if (
        "supported_backends" in config
        and mm_encoder_attn_backend is not None
        and mm_encoder_attn_backend not in config["supported_backends"]
    ):
        pytest.skip(
            f"{model_key} does not support {mm_encoder_attn_backend} backend now."
        )

    # Step 2: apply any GPU marks declared by the config.
    for mark in config.get("gpu_marks", ()):
        request.applymarker(mark)

    # Step 3: dispatch to the handler matching the configured interface.
    if config.get("media_type") == "video":
        run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner)
    elif config["interface"] == "llm_chat":
        run_llm_chat_test(config, mm_encoder_attn_backend, image_assets)
    elif config["interface"] == "llm_generate":
        run_llm_generate_test(config, mm_encoder_attn_backend, image_assets)
    else:
        raise ValueError(f"Unknown interface: {config['interface']}")