# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Consolidated test for ViT attention backend functionality across multiple models.
This test validates that each multimodal model can successfully generate outputs
using different ViT attention backends. Tests are parametrized by model and backend.
"""

from typing import Any

import pytest
from transformers import AutoProcessor

from vllm import LLM, SamplingParams
from vllm.multimodal.utils import encode_image_url
from vllm.multimodal.video import sample_frames_from_video
from vllm.platforms import current_platform
from vllm.v1.attention.backends.registry import AttentionBackendEnum

from ....utils import create_new_process_for_each_test
from ...utils import dummy_hf_overrides

# Dots.OCR prompt from official repository
# https://github.com/rednote-hilab/dots.ocr/blob/d72d1d8c5bdd0362eb264f714cdbd1e5daa7cdff/dots_ocr/utils/prompts.py#L3
# ruff: noqa: E501
# Layout-extraction instruction prompt taken from the official Dots.OCR
# repository (see link above); the model is asked for a single JSON object
# describing every layout element of the page image.
DOTS_OCR_PROMPT = """Please output the layout information from the PDF image, including each layout element's bbox, its category, and the corresponding text content within the bbox.
1. Bbox format: [x1, y1, x2, y2]
2. Layout Categories: The possible categories are ['Caption', 'Footnote', 'Formula', 'List-item', 'Page-footer', 'Page-header', 'Picture', 'Section-header', 'Table', 'Text', 'Title'].
3. Text Extraction & Formatting Rules:
    - Picture: For the 'Picture' category, the text field should be omitted.
    - Formula: Format its text as LaTeX.
    - Table: Format its text as HTML.
    - All Others (Text, Title, etc.): Format their text as Markdown.
4. Constraints:
    - The output text must be the original text from the image, with no translation.
    - All layout elements must be sorted according to human reading order.
5. Final Output: The entire output must be a single JSON object.
"""

# Qwen-style video placeholder inserted into video prompts.
VIDEO_PLACEHOLDER = "<|vision_start|><|video_pad|><|vision_end|>"
# Model configurations.
#
# Each entry describes how one multimodal model is exercised:
#   interface          - which handler drives generation ("llm_generate",
#                        "llm_chat", or "vllm_runner")
#   sampling_params    - kwargs for SamplingParams
#   use_processor      - build the prompt via AutoProcessor.apply_chat_template
#   prompt_builder     - name of a module-level prompt builder function
#   supported_backends - restrict the test to these ViT attention backends
#   output_validator   - predicate applied to the generated text
MODEL_CONFIGS: dict[str, dict[str, Any]] = {
    "dots_ocr": {
        "model_name": "rednote-hilab/dots.ocr",
        "interface": "llm_chat",
        "max_model_len": 32768,
        "max_num_seqs": 1,
        "limit_mm_per_prompt": {"image": 1},
        "sampling_params": {
            "temperature": 0.1,
            "max_tokens": 16384,
            "top_p": 0.9,
            "stop_token_ids": None,
        },
        "use_specific_image": "stop_sign",
        "prompt_builder": "build_dots_ocr_prompt",
        # OCR of the stop-sign asset should mention the word "stop".
        "output_validator": lambda x: len(x) > 10 and "stop" in x.lower(),
    },
    "ernie45_vl": {
        "model_name": "baidu/ERNIE-4.5-VL-28B-A3B-PT",
        "interface": "llm_generate",
        "max_model_len": 16384,
        "max_num_seqs": 2,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
    "glm4_1v": {
        "model_name": "zai-org/GLM-4.1V-9B-Thinking",
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
    "glm_ocr": {
        "model_name": "zai-org/GLM-OCR",
        "interface": "llm_generate",
        "max_model_len": 131072,
        "max_num_seqs": 2,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        "use_processor": True,
        "question": "Text Recognition:",
    },
    "keye_vl": {
        "model_name": "Kwai-Keye/Keye-VL-8B-Preview",
        "interface": "llm_generate",
        "max_model_len": 8192,
        "max_num_seqs": 5,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        # Keye-VL only works with flash-attention style ViT backends.
        "supported_backends": {
            AttentionBackendEnum.FLASH_ATTN,
            AttentionBackendEnum.ROCM_AITER_FA,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
    "ovis2_5": {
        "model_name": "AIDC-AI/Ovis2.5-2B",
        "interface": "llm_generate",
        "max_model_len": 8192,
        "max_num_seqs": 2,
        "sampling_params": {
            "temperature": 0.0,
            "max_tokens": 256,
            "stop_token_ids": None,
        },
        "prompt_builder": "build_ovis_prompt",
        "question": "What is the content of each image?",
    },
    "qwen2_5_vl": {
        "model_name": "Qwen/Qwen2.5-VL-3B-Instruct",
        "interface": "vllm_runner",
        "media_type": "video",
        "max_model_len": 4000,
        "max_num_seqs": 1,
        "limit_mm_per_prompt": {"video": 1},
        "sampling_params": {
            "max_tokens": 128,
        },
        "runner_kwargs": {
            "runner": "generate",
            "dtype": "bfloat16",
        },
        # EVS (Efficient Video Sampling) settings: run once without pruning
        # and once with 75% of video tokens pruned.
        "video_params": {
            "num_frames": 16,
            "pruning_rates": [0.0, 0.75],
        },
    },
    "qwen2_5_omni": {
        "model_name": "Qwen/Qwen2.5-Omni-3B",
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,
        "limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3},
        "sampling_params": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "max_tokens": 16384,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
    "qwen3_omni": {
        "model_name": "Qwen/Qwen3-Omni-30B-A3B-Instruct",
        "interface": "llm_generate",
        "max_model_len": 32768,
        "max_num_seqs": 2,
        "limit_mm_per_prompt": {"image": 3, "video": 3, "audio": 3},
        "sampling_params": {
            "temperature": 0.6,
            "top_p": 0.95,
            "top_k": 20,
            "max_tokens": 16384,
        },
        "use_processor": True,
        "question": "What is the content of each image?",
    },
}
# Prompt builder functions
def build_dots_ocr_prompt(images, config):
    """Build the Dots.OCR chat messages carrying the official OCR instructions.

    Only the first image is used (the caller filters to the stop_sign asset);
    it is embedded as a data URL alongside the layout-extraction prompt.
    """
    url = encode_image_url(images[0])  # already filtered to stop_sign
    return [
        {
            "role": "user",
            "content": [
                {"type": "image_url", "image_url": {"url": url}},
                {
                    "type": "text",
                    # Dots.OCR expects its image tokens directly before the
                    # instruction text.
                    "text": f"<|img|><|imgpad|><|endofimg|>{DOTS_OCR_PROMPT}",
                },
            ],
        },
    ]
def build_processor_prompt(images, config):
    """Build a prompt string using AutoProcessor.apply_chat_template().

    Each image becomes an "image" content part (as a data URL) followed by the
    model's configured question; the processor renders the chat template
    without tokenizing and with the generation prompt appended.
    """
    processor = AutoProcessor.from_pretrained(
        config["model_name"], trust_remote_code=True
    )
    content = [
        {"type": "image", "image": encode_image_url(img)} for img in images
    ]
    content.append({"type": "text", "text": config["question"]})
    return processor.apply_chat_template(
        [{"role": "user", "content": content}],
        tokenize=False,
        add_generation_prompt=True,
    )
def build_ovis_prompt(images, config):
    """Build the Ovis2.5-specific prompt with its custom chat format.

    Only the *number* of images matters here: each image is represented by an
    indexed ``<image>`` placeholder. The previous implementation base64-encoded
    every image via ``encode_image_url`` and discarded the result; that dead
    (and potentially expensive) work is removed.
    """
    placeholders = "\n".join(
        f"Image-{i}: <image>\n" for i in range(1, len(images) + 1)
    )
    return (
        f"<|im_start|>user\n\n{placeholders}\n{config['question']}<|im_end|>\n"
        "<|im_start|>assistant\n"
    )
def build_qwen2_5_video_prompt():
    """Build the Qwen2.5-VL video prompt containing the EVS video placeholder."""
    system_part = "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n"
    user_part = (
        f"<|im_start|>user\n{VIDEO_PLACEHOLDER}"
        "Describe this video with a short sentence (no more than 20 words)"
        "<|im_end|><|im_start|>assistant\n"
    )
    return system_part + user_part
# Handler functions
def run_llm_generate_test(config, mm_encoder_attn_backend, image_assets):
    """Exercise one model through the standard LLM.generate() interface.

    Builds a prompt (via the HF processor chat template or a model-specific
    builder), spins up an engine with dummy weights and the requested ViT
    attention backend, generates once over all image assets, and validates
    the output text.
    """
    images = [asset.pil_image for asset in image_assets]

    # Prompt construction: either the processor's chat template, or a
    # module-level builder function looked up by name.
    if config.get("use_processor"):
        prompt = build_processor_prompt(images, config)
    else:
        builder = globals()[config.get("prompt_builder", "build_ovis_prompt")]
        prompt = builder(images, config)

    # Engine with dummy weights — we only care that generation runs.
    llm = LLM(
        model=config["model_name"],
        trust_remote_code=True,
        max_model_len=config["max_model_len"],
        max_num_seqs=config["max_num_seqs"],
        limit_mm_per_prompt=config.get(
            "limit_mm_per_prompt", {"image": len(images)}
        ),
        mm_encoder_attn_backend=mm_encoder_attn_backend,
        hf_overrides=dummy_hf_overrides,
        load_format="dummy",
        seed=42,
    )

    outputs = llm.generate(
        {
            "prompt": prompt,
            "multi_modal_data": {"image": images},
        },
        sampling_params=SamplingParams(**config["sampling_params"]),
    )

    # Validate every completion with the model's validator (default: any
    # non-trivial text).
    for output in outputs:
        text = output.outputs[0].text
        validator = config.get("output_validator", lambda x: len(x) > 10)
        assert validator(text), (
            f"Validation failed for {config['model_name']}: {text}"
        )
def run_llm_chat_test(config, mm_encoder_attn_backend, image_assets):
    """Exercise one model through the LLM.chat() interface (used by Dots.OCR).

    Uses only the stop_sign image asset, builds the Dots.OCR chat messages,
    runs a dummy-weight engine with the requested ViT attention backend, and
    validates the generated text.
    """
    # Dots.OCR is validated against the stop_sign asset specifically.
    stop_sign_image = [
        asset.pil_image for asset in image_assets if asset.name == "stop_sign"
    ][0]

    messages = build_dots_ocr_prompt([stop_sign_image], config)

    llm = LLM(
        model=config["model_name"],
        trust_remote_code=True,
        max_model_len=config["max_model_len"],
        max_num_seqs=config["max_num_seqs"],
        limit_mm_per_prompt=config["limit_mm_per_prompt"],
        mm_encoder_attn_backend=mm_encoder_attn_backend,
        hf_overrides=dummy_hf_overrides,
        load_format="dummy",
        seed=42,
    )

    outputs = llm.chat(
        messages=messages,
        sampling_params=SamplingParams(**config["sampling_params"]),
    )

    for output in outputs:
        text = output.outputs[0].text
        validator = config.get("output_validator", lambda x: len(x) > 10)
        assert validator(text), (
            f"Validation failed for {config['model_name']}: {text}"
        )
def run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner):
    """Video test with EVS (Efficient Video Sampling) handler.

    Runs the model once per configured pruning rate and checks that a single
    non-empty text output is produced each time.

    Frame sampling, the prompt, and the video list do not depend on the
    pruning rate, so they are computed once up front instead of being rebuilt
    on every loop iteration (the original re-sampled the frames per rate).
    """
    video_params = config["video_params"]
    num_frames = video_params["num_frames"]

    # Loop-invariant setup: sample frames and build the prompt once.
    sampled_vids = [
        sample_frames_from_video(asset.np_ndarrays, num_frames)
        for asset in video_assets
    ]
    prompts = [build_qwen2_5_video_prompt()]
    videos = [sampled_vids[0]]  # single video per prompt

    for pruning_rate in video_params["pruning_rates"]:
        # Fresh engine per pruning rate via the vllm_runner context manager.
        with vllm_runner(
            config["model_name"],
            max_model_len=config["max_model_len"],
            max_num_seqs=config["max_num_seqs"],
            limit_mm_per_prompt=config["limit_mm_per_prompt"],
            tensor_parallel_size=1,
            video_pruning_rate=pruning_rate,
            mm_encoder_attn_backend=mm_encoder_attn_backend,
            hf_overrides=dummy_hf_overrides,
            load_format="dummy",
            **config["runner_kwargs"],
        ) as vllm_model:
            outputs = vllm_model.generate_greedy(
                prompts,
                config["sampling_params"]["max_tokens"],
                videos=videos,
            )

        # Validate output: exactly one non-empty string completion.
        assert len(outputs) == 1, f"Expected 1 output, got {len(outputs)}"
        output_ids, output_text = outputs[0]
        assert len(output_ids) > 0, "Generated no output IDs"
        assert len(output_text) > 0, "Generated empty text"
        assert isinstance(output_text, str), (
            f"Output is not string: {type(output_text)}"
        )
# Main test function
@pytest.mark.parametrize("model_key", list(MODEL_CONFIGS.keys()))
@pytest.mark.parametrize(
    "mm_encoder_attn_backend",
    [None] + current_platform.get_supported_vit_attn_backends(),
)
@pytest.mark.skip(reason="Broken test due to memory segmentation fault")
@create_new_process_for_each_test()
def test_vit_backend_functionality(
    model_key: str,
    mm_encoder_attn_backend: AttentionBackendEnum | None,
    image_assets,
    video_assets,
    vllm_runner,
    request,
):
    """Test ViT attention backend functionality for multimodal models.

    This test validates that each model can successfully generate outputs
    using different ViT attention backends. The test:
    1. Filters unsupported backends per model
    2. Applies appropriate GPU marks
    3. Routes to the correct test handler based on interface
    4. Validates output meets minimum requirements
    """
    config = MODEL_CONFIGS[model_key]

    # Step 1: skip backends the model explicitly does not support.
    if (
        "supported_backends" in config
        and mm_encoder_attn_backend is not None
        and mm_encoder_attn_backend not in config["supported_backends"]
    ):
        pytest.skip(
            f"{model_key} does not support {mm_encoder_attn_backend} backend now."
        )

    # Step 2: apply any GPU marks declared by the config.
    for mark in config.get("gpu_marks", ()):
        request.applymarker(mark)

    # Step 3: dispatch to the handler matching the configured interface.
    if config.get("media_type") == "video":
        run_video_test(config, mm_encoder_attn_backend, video_assets, vllm_runner)
    elif config["interface"] == "llm_chat":
        run_llm_chat_test(config, mm_encoder_attn_backend, image_assets)
    elif config["interface"] == "llm_generate":
        run_llm_generate_test(config, mm_encoder_attn_backend, image_assets)
    else:
        raise ValueError(f"Unknown interface: {config['interface']}")