tests/lora/test_qwenvl.py

# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass

from packaging.version import Version
from transformers import __version__ as TRANSFORMERS_VERSION

import vllm
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest
from vllm.sampling_params import BeamSearchParams


@dataclass
class TestConfig:
    model_path: str
    lora_path: str
    max_num_seqs: int = 2
    max_loras: int = 2
    max_lora_rank: int = 32
    enable_tower_connector_lora: bool = False
    max_model_len: int = 8192
    gpu_memory_utilization: float = 0.85
    mm_processor_kwargs: dict[str, object] | None = None
    mm_processor_cache_gb: float = 4

    def __post_init__(self):
        if self.mm_processor_kwargs is None:
            # There is a bug in transformers v4 where size is ignored by
            # `Qwen2VLProcessor.__call__`
            if Version(TRANSFORMERS_VERSION) < Version("5.2.0"):
                self.mm_processor_kwargs = {
                    "min_pixels": 28 * 28,
                    "max_pixels": 1280 * 28 * 28,
                }
            else:
                self.mm_processor_kwargs = {
                    "size": {
                        "shortest_edge": 28 * 28,
                        "longest_edge": 1280 * 28 * 28,
                    }
                }


class Qwen2VLTester:
    """Test helper for Qwen2 VL models with LoRA"""

    PROMPT_TEMPLATE = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
        "\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
        "What is in the image?<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    def __init__(self, config: TestConfig):
        self.config = config
        self.llm = self._initialize_llm()

    def _initialize_llm(self) -> vllm.LLM:
        """Initialize the LLM with given configuration"""
        return vllm.LLM(
            model=self.config.model_path,
            max_num_seqs=self.config.max_num_seqs,
            enable_lora=True,
            max_loras=self.config.max_loras,
            max_lora_rank=self.config.max_lora_rank,
            enable_tower_connector_lora=self.config.enable_tower_connector_lora,
            trust_remote_code=True,
            gpu_memory_utilization=self.config.gpu_memory_utilization,
            mm_processor_kwargs=self.config.mm_processor_kwargs,
            mm_processor_cache_gb=self.config.mm_processor_cache_gb,
            max_model_len=self.config.max_model_len,
        )

    def run_test(
        self,
        images: list[ImageAsset],
        expected_outputs: list[str],
        lora_id: int | None = None,
        lora_name: str | None = None,
        temperature: float = 0,
        max_tokens: int = 5,
    ):
        sampling_params = vllm.SamplingParams(
            temperature=temperature,
            max_tokens=max_tokens,
        )
        inputs = [
            {
                "prompt": self.PROMPT_TEMPLATE,
                "multi_modal_data": {"image": asset.pil_image},
            }
            for asset in images
        ]

        lora_request = LoRARequest(
            lora_name if lora_name else str(lora_id), lora_id, self.config.lora_path
        )
        outputs = self.llm.generate(inputs, sampling_params, lora_request=lora_request)
        generated_texts = [output.outputs[0].text.strip() for output in outputs]
        # Validate outputs
        for generated, expected in zip(generated_texts, expected_outputs):
            assert expected.startswith(generated), (
                f"Generated text {generated} doesn't match expected pattern {expected}"
            )

    def run_beam_search_test(
        self,
        images: list[ImageAsset],
        expected_outputs: list[list[str]],
        lora_id: int | None = None,
        temperature: float = 0,
        beam_width: int = 2,
        max_tokens: int = 5,
    ):
        beam_search_params = BeamSearchParams(
            beam_width=beam_width, max_tokens=max_tokens, temperature=temperature
        )

        inputs = [
            {
                "prompt": self.PROMPT_TEMPLATE,
                "multi_modal_data": {"image": asset.pil_image},
            }
            for asset in images
        ]

        lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path)
        outputs = self.llm.beam_search(
            inputs, beam_search_params, lora_request=lora_request
        )

        for output_obj, expected_texts in zip(outputs, expected_outputs):
            output_texts = [seq.text for seq in output_obj.sequences]

            for output_text, expected_text in zip(output_texts, expected_texts):
                # NOTE beam search .text contains the whole text including inputs
                assert output_text.endswith(expected_text), (
                    f"Generated {output_text} does not match expected {expected_text}"
                )


TEST_IMAGES = [
    ImageAsset("stop_sign"),
    ImageAsset("cherry_blossom"),
]

EXPECTED_OUTPUTS = [
    "A red stop sign stands prominently in the foreground, with a traditional Chinese gate and a black SUV in the background, illustrating a blend of modern and cultural elements.",  # noqa: E501
    "A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.",  # noqa: E501
]

EXPECTED_OUTPUTS_LANGUAGE = [
    "A stop sign is shown in an Asian city, with buildings and a car in the "
    "background.",
    "The Tokyo Skytree can be seen behind the pink blossoms of the cherry trees.",
]

EXPECTED_OUTPUTS_VISION = [
    "A stop sign in front of oriental buildings.",
    "A tree with pink flowers in front of it and a blue sky behind the flowers.",
]

EXPECTED_OUTPUTS_VISION_NO_CONNECTOR = [
    "A stop sign is located on the street of a Chinese neighborhood.",
    "A closeup shot of the Tokyo Skytree with pink flowers in the foreground.",
]

EXPECTED_BEAM_SEARCH_OUTPUTS = [
    [
        "A majestic skyscraper stands",
        "A majestic tower stands tall",
    ],
]

QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"
QWEN3VL_MODEL_PATH = "Qwen/Qwen3-VL-4B-Instruct"


def test_qwen2vl_lora(qwen2vl_lora_files):
    """Test Qwen 2.0 VL model with LoRA"""
    config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
    tester = Qwen2VLTester(config)

    # Test with different LoRA IDs
    for lora_id in [1, 2]:
        tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)


def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
    """Test Qwen 2.0 VL model with LoRA through beam search."""
    config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
    tester = Qwen2VLTester(config)

    # Test with different LoRA IDs
    for lora_id in [1, 2]:
        # NOTE currently, we only test cherry blossom since stop sign
        # output is slightly different for v1; - the root cause is likely
        # independent of the intent of this test, which is to ensure beam
        # search passes through lora through correctly.
        tester.run_beam_search_test(
            [ImageAsset("cherry_blossom")],
            expected_outputs=EXPECTED_BEAM_SEARCH_OUTPUTS,
            lora_id=lora_id,
        )


def test_qwen25vl_lora(qwen25vl_lora_files):
    """Test Qwen 2.5 VL model with LoRA"""
    config = TestConfig(model_path=QWEN25VL_MODEL_PATH, lora_path=qwen25vl_lora_files)
    tester = Qwen2VLTester(config)

    # Test with different LoRA IDs
    for lora_id in [1, 2]:
        tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)


def test_qwen25vl_vision_lora(qwen25vl_vision_lora_files):
    config = TestConfig(
        model_path=QWEN25VL_MODEL_PATH,
        lora_path=qwen25vl_vision_lora_files,
        # Currently, tower_connector_lora is incompatible with
        # the multi-modal processor cache.
        # TODO: Remove this restriction
        mm_processor_cache_gb=0,
        enable_tower_connector_lora=True,
    )
    tester = Qwen2VLTester(config)
    for lora_id in [1, 2]:
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS,
            lora_id=lora_id,
        )


def test_qwen3vl_vision_lora(qwen3vl_vision_lora_files):
    config = TestConfig(
        model_path=QWEN3VL_MODEL_PATH,
        lora_path=qwen3vl_vision_lora_files,
        # Currently, tower_connector_lora is incompatible with
        # the multi-modal processor cache.
        # TODO: Remove this restriction
        mm_processor_cache_gb=0,
        enable_tower_connector_lora=True,
    )
    tester = Qwen2VLTester(config)
    for lora_id in [1, 2]:
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS,
            lora_id=lora_id,
        )


def test_qwen2vl_multiple_lora_types(
    qwen2vl_language_lora_files,
    qwen2vl_vision_tower_connector_lora_files,
    qwen2vl_vision_tower_lora_files,
):
    """
    Test multiple LoRA adapter types (language, vision tower + connector,
    vision tower only) using the same LLM instance to verify mm_encoder_cache
    behavior with different LoRA requests.

    By reusing the same LLM instance across different LoRA requests, we ensure that
    the multimodal encoder cache correctly manages state transitions between
    language-only and vision-enabled LoRA adapters.
    """
    config = TestConfig(
        model_path=QWEN2VL_MODEL_PATH,
        # We'll override the lora_path for each specific test, but need to provide
        # an initial path for initialization
        lora_path=qwen2vl_language_lora_files,
        # Currently, tower_connector_lora is incompatible with
        # the multi-modal processor cache.
        # TODO: Remove this restriction
        mm_processor_cache_gb=0,
        enable_tower_connector_lora=True,
    )
    tester = Qwen2VLTester(config)

    # Test 1: Language-only LoRA adapter
    tester.config.lora_path = qwen2vl_language_lora_files
    for lora_id in [1, 2]:
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS_LANGUAGE,
            lora_id=lora_id,
            lora_name="language_only",
        )

    # Test 2: Vision tower + connector LoRA adapter
    tester.config.lora_path = qwen2vl_vision_tower_connector_lora_files
    for lora_id in [3, 4]:
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS_VISION,
            lora_id=lora_id,
            lora_name="vision_tower_connector",
        )

    # Test 3: Vision tower only LoRA adapter (no connector)
    tester.config.lora_path = qwen2vl_vision_tower_lora_files
    for lora_id in [5, 6]:
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS_VISION_NO_CONNECTOR,
            lora_id=lora_id,
            lora_name="vision_tower",
        )