[Core] Initialize LoRA support for tower and connector in multi-modal models (#26674)
Signed-off-by: bk-201 <joy25810@foxmail.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: prashanth058 <prashanth.dannamaneni@uipath.com>
Co-authored-by: bk-201 <joy25810@foxmail.com>
Co-authored-by: prashanth058 <prashanth.dannamaneni@uipath.com>
Co-authored-by: Anexdeus <5142168@mail.ru>
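Below is a minimal usage sketch distilled from the test added in this commit, showing how the new `enable_tower_connector_lora` engine flag is passed to `vllm.LLM` together with a `LoRARequest`. The model name matches the test; the adapter path is a placeholder:

```python
import vllm
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest

# Vision tower/connector LoRA is currently incompatible with the
# multi-modal processor cache, so the cache is disabled here.
llm = vllm.LLM(
    model="Qwen/Qwen2.5-VL-3B-Instruct",
    enable_lora=True,
    max_lora_rank=32,
    enable_tower_connector_lora=True,  # new flag introduced by this PR
    mm_processor_cache_gb=0,
    trust_remote_code=True,
)

# LoRARequest(name, id, path); the path points at a LoRA adapter
# checkpoint that includes tower/connector weights.
lora_request = LoRARequest("vision_tower_connector", 1, "/path/to/lora")
outputs = llm.generate(
    {
        "prompt": "<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
        "What is in the image?<|im_end|>\n<|im_start|>assistant\n",
        "multi_modal_data": {"image": ImageAsset("stop_sign").pil_image},
    },
    vllm.SamplingParams(max_tokens=32),
    lora_request=lora_request,
)
```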
tests/lora/test_qwenvl.py (298 lines, new file)
@@ -0,0 +1,298 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
from dataclasses import dataclass

import vllm
from vllm.assets.image import ImageAsset
from vllm.lora.request import LoRARequest
from vllm.sampling_params import BeamSearchParams


@dataclass
class TestConfig:
    model_path: str
    lora_path: str
    max_num_seqs: int = 2
    max_loras: int = 2
    max_lora_rank: int = 32
    enable_tower_connector_lora: bool = False
    max_model_len: int = 8192
    gpu_memory_utilization: float = 0.85
    mm_processor_kwargs: dict[str, int] | None = None
    mm_processor_cache_gb: float = 4

    def __post_init__(self):
        if self.mm_processor_kwargs is None:
            self.mm_processor_kwargs = {
                "min_pixels": 28 * 28,
                "max_pixels": 1280 * 28 * 28,
            }


class Qwen2VLTester:
    """Test helper for Qwen2 VL models with LoRA"""

    PROMPT_TEMPLATE = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
        "\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
        "What is in the image?<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    def __init__(self, config: TestConfig):
        self.config = config
        self.llm = self._initialize_llm()

    def _initialize_llm(self) -> vllm.LLM:
        """Initialize the LLM with given configuration"""
        return vllm.LLM(
            model=self.config.model_path,
            max_num_seqs=self.config.max_num_seqs,
            enable_lora=True,
            max_loras=self.config.max_loras,
            max_lora_rank=self.config.max_lora_rank,
            enable_tower_connector_lora=self.config.enable_tower_connector_lora,
            trust_remote_code=True,
            gpu_memory_utilization=self.config.gpu_memory_utilization,
            mm_processor_kwargs=self.config.mm_processor_kwargs,
            mm_processor_cache_gb=self.config.mm_processor_cache_gb,
            max_model_len=self.config.max_model_len,
        )

    def run_test(
        self,
        images: list[ImageAsset],
        expected_outputs: list[str],
        lora_id: int | None = None,
        lora_name: str | None = None,
        temperature: float = 0,
        max_tokens: int = 5,
    ):
        sampling_params = vllm.SamplingParams(
            temperature=temperature,
            max_tokens=max_tokens,
        )
        inputs = [
            {
                "prompt": self.PROMPT_TEMPLATE,
                "multi_modal_data": {"image": asset.pil_image},
            }
            for asset in images
        ]

        lora_request = LoRARequest(
            lora_name if lora_name else str(lora_id), lora_id, self.config.lora_path
        )
        outputs = self.llm.generate(inputs, sampling_params, lora_request=lora_request)
        generated_texts = [output.outputs[0].text.strip() for output in outputs]
        # Validate outputs
        for generated, expected in zip(generated_texts, expected_outputs):
            assert expected.startswith(generated), (
                f"Generated text {generated} doesn't "
                f"match expected pattern {expected}"
            )

    def run_beam_search_test(
        self,
        images: list[ImageAsset],
        expected_outputs: list[list[str]],
        lora_id: int | None = None,
        temperature: float = 0,
        beam_width: int = 2,
        max_tokens: int = 5,
    ):
        beam_search_params = BeamSearchParams(
            beam_width=beam_width, max_tokens=max_tokens, temperature=temperature
        )

        inputs = [
            {
                "prompt": self.PROMPT_TEMPLATE,
                "multi_modal_data": {"image": asset.pil_image},
            }
            for asset in images
        ]

        lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path)
        outputs = self.llm.beam_search(
            inputs, beam_search_params, lora_request=lora_request
        )

        for output_obj, expected_outs in zip(outputs, expected_outputs):
            output_texts = [seq.text for seq in output_obj.sequences]
            assert output_texts == expected_outs, (
                f"Generated texts {output_texts} do not match expected {expected_outs}"
            )  # noqa: E501


TEST_IMAGES = [
    ImageAsset("stop_sign"),
    ImageAsset("cherry_blossom"),
]

EXPECTED_OUTPUTS = [
    "A red stop sign stands prominently in the foreground, with a traditional Chinese gate and a black SUV in the background, illustrating a blend of modern and cultural elements.",  # noqa: E501
    "A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.",  # noqa: E501
]

EXPECTED_OUTPUTS_LANGUAGE = [
    "A stop sign is shown in an Asian city, with buildings and a car in the "
    "background.",
    "The Tokyo Skytree can be seen behind the pink blossoms of the cherry trees.",
]

EXPECTED_OUTPUTS_VISION = [
    "A stop sign in front of oriental buildings.",
    "A tree with pink flowers in front of it and a blue sky behind the flowers.",
]

EXPECTED_OUTPUTS_VISION_NO_CONNECTOR = [
    "A stop sign is located on the street of a Chinese neighborhood.",
    "A closeup shot of the Tokyo Skytree with pink flowers in the foreground.",
]

# NOTE - beam search .text contains the whole text
EXPECTED_BEAM_SEARCH_OUTPUTS = [
    [
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is in the image?<|im_end|>\n<|im_start|>assistant\nA majestic skyscraper stands",  # noqa: E501
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>What is in the image?<|im_end|>\n<|im_start|>assistant\nA majestic tower stands tall",  # noqa: E501
    ],
]

QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"
QWEN3VL_MODEL_PATH = "Qwen/Qwen3-VL-4B-Instruct"


def test_qwen2vl_lora(qwen2vl_lora_files):
    """Test Qwen 2.0 VL model with LoRA"""
    config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
    tester = Qwen2VLTester(config)

    # Test with different LoRA IDs
    for lora_id in [1, 2]:
        tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)


def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
    """Test Qwen 2.0 VL model with LoRA through beam search."""
    config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
    tester = Qwen2VLTester(config)

    # Test with different LoRA IDs
    for lora_id in [1, 2]:
        # NOTE: currently, we only test cherry blossom since the stop sign
        # output is slightly different for V1; the root cause is likely
        # independent of the intent of this test, which is to ensure beam
        # search passes LoRA through correctly.
        tester.run_beam_search_test(
            [ImageAsset("cherry_blossom")],
            expected_outputs=EXPECTED_BEAM_SEARCH_OUTPUTS,
            lora_id=lora_id,
        )


def test_qwen25vl_lora(qwen25vl_lora_files):
    """Test Qwen 2.5 VL model with LoRA"""
    config = TestConfig(model_path=QWEN25VL_MODEL_PATH, lora_path=qwen25vl_lora_files)
    tester = Qwen2VLTester(config)

    # Test with different LoRA IDs
    for lora_id in [1, 2]:
        tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)


def test_qwen25vl_vision_lora(qwen25vl_vision_lora_files):
    config = TestConfig(
        model_path=QWEN25VL_MODEL_PATH,
        lora_path=qwen25vl_vision_lora_files,
        # Currently, tower_connector_lora is incompatible with
        # the multi-modal processor cache.
        # TODO: Remove this restriction
        mm_processor_cache_gb=0,
        enable_tower_connector_lora=True,
    )
    tester = Qwen2VLTester(config)
    for lora_id in [1, 2]:
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS,
            lora_id=lora_id,
        )


def test_qwen3vl_vision_lora(qwen3vl_vision_lora_files):
    config = TestConfig(
        model_path=QWEN3VL_MODEL_PATH,
        lora_path=qwen3vl_vision_lora_files,
        # Currently, tower_connector_lora is incompatible with
        # the multi-modal processor cache.
        # TODO: Remove this restriction
        mm_processor_cache_gb=0,
        enable_tower_connector_lora=True,
    )
    tester = Qwen2VLTester(config)
    for lora_id in [1, 2]:
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS,
            lora_id=lora_id,
        )


def test_qwen2vl_multiple_lora_types(
    qwen2vl_language_lora_files,
    qwen2vl_vision_tower_connector_lora_files,
    qwen2vl_vision_tower_lora_files,
):
    """
    Test multiple LoRA adapter types (language, vision tower + connector,
    vision tower only) using the same LLM instance to verify mm_encoder_cache
    behavior with different LoRA requests.

    By reusing the same LLM instance across different LoRA requests, we ensure that
    the multimodal encoder cache correctly manages state transitions between
    language-only and vision-enabled LoRA adapters.
    """
    config = TestConfig(
        model_path=QWEN2VL_MODEL_PATH,
        # We'll override the lora_path for each specific test, but need to provide
        # an initial path for initialization
        lora_path=qwen2vl_language_lora_files,
        # Currently, tower_connector_lora is incompatible with
        # the multi-modal processor cache.
        # TODO: Remove this restriction
        mm_processor_cache_gb=0,
        enable_tower_connector_lora=True,
    )
    tester = Qwen2VLTester(config)

    # Test 1: Language-only LoRA adapter
    tester.config.lora_path = qwen2vl_language_lora_files
    for lora_id in [1, 2]:
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS_LANGUAGE,
            lora_id=lora_id,
            lora_name="language_only",
        )

    # Test 2: Vision tower + connector LoRA adapter
    tester.config.lora_path = qwen2vl_vision_tower_connector_lora_files
    for lora_id in [3, 4]:
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS_VISION,
            lora_id=lora_id,
            lora_name="vision_tower_connector",
        )

    # Test 3: Vision tower only LoRA adapter (no connector)
    tester.config.lora_path = qwen2vl_vision_tower_lora_files
    for lora_id in [5, 6]:
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS_VISION_NO_CONNECTOR,
            lora_id=lora_id,
            lora_name="vision_tower",
        )