2025-02-02 14:58:18 -05:00
# SPDX-License-Identifier: Apache-2.0
2025-06-03 11:20:17 -07:00
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
2025-02-20 10:37:55 +08:00
from dataclasses import dataclass
2024-12-24 17:56:10 +08:00
2026-02-26 21:05:46 +08:00
from packaging . version import Version
from transformers import __version__ as TRANSFORMERS_VERSION
2024-12-24 17:56:10 +08:00
import vllm
from vllm . assets . image import ImageAsset
from vllm . lora . request import LoRARequest
2025-05-28 09:58:24 -06:00
from vllm . sampling_params import BeamSearchParams
2024-12-24 17:56:10 +08:00
2025-02-20 10:37:55 +08:00
@dataclass
class TestConfig :
model_path : str
lora_path : str
max_num_seqs : int = 2
max_loras : int = 2
2025-12-26 20:48:20 +08:00
max_lora_rank : int = 32
enable_tower_connector_lora : bool = False
max_model_len : int = 8192
gpu_memory_utilization : float = 0.85
2026-02-26 21:05:46 +08:00
mm_processor_kwargs : dict [ str , object ] | None = None
2025-12-26 20:48:20 +08:00
mm_processor_cache_gb : float = 4
2025-02-20 10:37:55 +08:00
def __post_init__ ( self ) :
if self . mm_processor_kwargs is None :
2026-02-26 21:05:46 +08:00
# There is a bug in transformers v4 where size is ignored by
# `Qwen2VLProcessor.__call__`
if Version ( TRANSFORMERS_VERSION ) < Version ( " 5.2.0 " ) :
self . mm_processor_kwargs = {
" min_pixels " : 28 * 28 ,
" max_pixels " : 1280 * 28 * 28 ,
}
else :
self . mm_processor_kwargs = {
" size " : {
" shortest_edge " : 28 * 28 ,
" longest_edge " : 1280 * 28 * 28 ,
}
}
2025-02-20 10:37:55 +08:00
class Qwen2VLTester:
    """Test helper for Qwen2 VL models with LoRA.

    Builds one ``vllm.LLM`` from a ``TestConfig`` and offers two checks
    against it: plain generation (``run_test``) and beam search
    (``run_beam_search_test``).
    """

    PROMPT_TEMPLATE = (
        "<|im_start|>system\nYou are a helpful assistant.<|im_end|>"
        "\n<|im_start|>user\n<|vision_start|><|image_pad|><|vision_end|>"
        "What is in the image?<|im_end|>\n"
        "<|im_start|>assistant\n"
    )

    def __init__(self, config: TestConfig):
        """Keep ``config`` and build the LLM from it."""
        self.config = config
        self.llm = self._initialize_llm()

    def _initialize_llm(self) -> vllm.LLM:
        """Initialize the LLM with given configuration."""
        cfg = self.config
        return vllm.LLM(
            model=cfg.model_path,
            max_num_seqs=cfg.max_num_seqs,
            enable_lora=True,
            max_loras=cfg.max_loras,
            max_lora_rank=cfg.max_lora_rank,
            enable_tower_connector_lora=cfg.enable_tower_connector_lora,
            trust_remote_code=True,
            gpu_memory_utilization=cfg.gpu_memory_utilization,
            mm_processor_kwargs=cfg.mm_processor_kwargs,
            mm_processor_cache_gb=cfg.mm_processor_cache_gb,
            max_model_len=cfg.max_model_len,
        )

    def _build_inputs(self, images: list[ImageAsset]) -> list[dict[str, object]]:
        """Pair ``PROMPT_TEMPLATE`` with the PIL image of each asset."""
        return [
            {
                "prompt": self.PROMPT_TEMPLATE,
                "multi_modal_data": {"image": asset.pil_image},
            }
            for asset in images
        ]

    def _make_lora_request(
        self, lora_id: int | None, lora_name: str | None = None
    ) -> LoRARequest | None:
        """Build the request for ``lora_id`` from the current ``lora_path``.

        Returns ``None`` when ``lora_id`` is ``None`` so the default argument
        of the public methods does not construct a ``LoRARequest`` without an
        integer ID. The adapter name falls back to ``str(lora_id)`` when
        ``lora_name`` is empty or ``None``.
        """
        if lora_id is None:
            return None
        name = lora_name if lora_name else str(lora_id)
        return LoRARequest(name, lora_id, self.config.lora_path)

    def run_test(
        self,
        images: list[ImageAsset],
        expected_outputs: list[str],
        lora_id: int | None = None,
        lora_name: str | None = None,
        temperature: float = 0,
        max_tokens: int = 5,
    ):
        """Generate once per image and compare against ``expected_outputs``.

        Each stripped generation must be a non-empty prefix of the expected
        string at the same position.

        Args:
            images: Assets whose PIL images are attached to the prompt.
            expected_outputs: One expected string per image, in order.
            lora_id: Integer adapter ID; ``None`` sends no LoRA request.
            lora_name: Adapter name; defaults to ``str(lora_id)``.
            temperature: Sampling temperature.
            max_tokens: Maximum number of generated tokens.

        Raises:
            AssertionError: If the number of generations differs from the
                number of expected strings, or a generation is empty or is
                not a prefix of its expected string.
        """
        sampling_params = vllm.SamplingParams(
            temperature=temperature,
            max_tokens=max_tokens,
        )
        outputs = self.llm.generate(
            self._build_inputs(images),
            sampling_params,
            lora_request=self._make_lora_request(lora_id, lora_name),
        )
        generated_texts = [output.outputs[0].text.strip() for output in outputs]

        # A plain zip() would silently skip unmatched entries.
        assert len(generated_texts) == len(expected_outputs), (
            f"Got {len(generated_texts)} generations for "
            f"{len(expected_outputs)} expected outputs"
        )
        # Validate outputs
        for generated, expected in zip(generated_texts, expected_outputs, strict=True):
            # An empty string is a prefix of everything, so reject it explicitly.
            assert generated and expected.startswith(generated), (
                f"Generated text {generated} doesn't match expected pattern {expected}"
            )

    def run_beam_search_test(
        self,
        images: list[ImageAsset],
        expected_outputs: list[list[str]],
        lora_id: int | None = None,
        temperature: float = 0,
        beam_width: int = 2,
        max_tokens: int = 5,
    ):
        """Run beam search per image and compare beams to ``expected_outputs``.

        Args:
            images: Assets whose PIL images are attached to the prompt.
            expected_outputs: For each image, the expected text suffix of
                each returned beam, in order.
            lora_id: Integer adapter ID; ``None`` sends no LoRA request.
            temperature: Beam-search temperature.
            beam_width: Number of beams.
            max_tokens: Maximum number of generated tokens.

        Raises:
            AssertionError: If result counts do not line up with
                ``expected_outputs`` or a beam does not end with its expected
                text.
        """
        beam_search_params = BeamSearchParams(
            beam_width=beam_width, max_tokens=max_tokens, temperature=temperature
        )
        outputs = self.llm.beam_search(
            self._build_inputs(images),
            beam_search_params,
            lora_request=self._make_lora_request(lora_id),
        )

        assert len(outputs) == len(expected_outputs), (
            f"Got {len(outputs)} beam search results for "
            f"{len(expected_outputs)} expected entries"
        )
        for output_obj, expected_texts in zip(outputs, expected_outputs, strict=True):
            output_texts = [seq.text for seq in output_obj.sequences]
            # Every expected text needs a beam to be compared against.
            assert len(output_texts) >= len(expected_texts), (
                f"Got {len(output_texts)} beams for "
                f"{len(expected_texts)} expected texts"
            )
            for output_text, expected_text in zip(output_texts, expected_texts):
                # NOTE beam search .text contains the whole text including inputs
                assert output_text.endswith(expected_text), (
                    f"Generated {output_text} does not match expected {expected_text}"
                )
2025-02-20 10:37:55 +08:00
# Image assets fed to ``Qwen2VLTester.run_test``; the expected-output lists in
# this module are ordered to match.
TEST_IMAGES = [
    ImageAsset("stop_sign"),
    ImageAsset("cherry_blossom"),
]
2025-02-20 10:37:55 +08:00
# Full expected sentences, one per entry of TEST_IMAGES. ``run_test`` only
# requires the (stripped) generation to be a prefix of the matching entry, so
# the literals must not carry leading or trailing whitespace.
EXPECTED_OUTPUTS = [
    (
        "A red stop sign stands prominently in the foreground, with a "
        "traditional Chinese gate and a black SUV in the background, "
        "illustrating a blend of modern and cultural elements."
    ),
    (
        "A majestic skyscraper stands tall, partially obscured by a vibrant "
        "canopy of cherry blossoms, against a clear blue sky."
    ),
]

# Expected sentences for the language-only adapter in
# ``test_qwen2vl_multiple_lora_types``.
EXPECTED_OUTPUTS_LANGUAGE = [
    "A stop sign is shown in an Asian city, with buildings and a car in the "
    "background.",
    "The Tokyo Skytree can be seen behind the pink blossoms of the cherry trees.",
]

# Expected sentences for the vision tower + connector adapter.
EXPECTED_OUTPUTS_VISION = [
    "A stop sign in front of oriental buildings.",
    "A tree with pink flowers in front of it and a blue sky behind the flowers.",
]

# Expected sentences for the vision-tower-only adapter (no connector).
EXPECTED_OUTPUTS_VISION_NO_CONNECTOR = [
    "A stop sign is located on the street of a Chinese neighborhood.",
    "A closeup shot of the Tokyo Skytree with pink flowers in the foreground.",
]

# One inner list per image; ``run_beam_search_test`` checks that each returned
# beam ends with the entry at the same position.
EXPECTED_BEAM_SEARCH_OUTPUTS = [
    [
        "A majestic skyscraper stands",
        "A majestic tower stands tall",
    ],
]

# Model identifiers passed as ``TestConfig.model_path``.
QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"
QWEN3VL_MODEL_PATH = "Qwen/Qwen3-VL-4B-Instruct"
2024-12-24 17:56:10 +08:00
def test_qwen2vl_lora(qwen2vl_lora_files):
    """Check Qwen 2.0 VL generation with a LoRA adapter under two IDs."""
    tester = Qwen2VLTester(
        TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
    )
    # The same adapter files are requested under two different LoRA IDs.
    for adapter_id in (1, 2):
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS,
            lora_id=adapter_id,
        )
2025-05-28 09:58:24 -06:00
def test_qwen2vl_lora_beam_search(qwen2vl_lora_files):
    """Test Qwen 2.0 VL model with LoRA through beam search."""
    config = TestConfig(model_path=QWEN2VL_MODEL_PATH, lora_path=qwen2vl_lora_files)
    tester = Qwen2VLTester(config)

    # NOTE currently, we only test cherry blossom since stop sign
    # output is slightly different for v1; - the root cause is likely
    # independent of the intent of this test, which is to ensure beam
    # search passes through lora through correctly.
    # The image list does not depend on the LoRA ID, so build it once.
    images = [ImageAsset("cherry_blossom")]

    # Test with different LoRA IDs
    for lora_id in [1, 2]:
        tester.run_beam_search_test(
            images,
            expected_outputs=EXPECTED_BEAM_SEARCH_OUTPUTS,
            lora_id=lora_id,
        )
2025-02-20 10:37:55 +08:00
def test_qwen25vl_lora(qwen25vl_lora_files):
    """Check Qwen 2.5 VL generation with a LoRA adapter under two IDs."""
    tester = Qwen2VLTester(
        TestConfig(model_path=QWEN25VL_MODEL_PATH, lora_path=qwen25vl_lora_files)
    )
    # The same adapter files are requested under two different LoRA IDs.
    for adapter_id in (1, 2):
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS,
            lora_id=adapter_id,
        )
2025-12-26 20:48:20 +08:00
def test_qwen25vl_vision_lora(qwen25vl_vision_lora_files):
    """Check Qwen 2.5 VL generation with tower/connector LoRA enabled."""
    tester = Qwen2VLTester(
        TestConfig(
            model_path=QWEN25VL_MODEL_PATH,
            lora_path=qwen25vl_vision_lora_files,
            enable_tower_connector_lora=True,
            # Currently, tower_connector_lora is incompatible with
            # the multi-modal processor cache.
            # TODO: Remove this restriction
            mm_processor_cache_gb=0,
        )
    )
    for adapter_id in (1, 2):
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS,
            lora_id=adapter_id,
        )
def test_qwen3vl_vision_lora(qwen3vl_vision_lora_files):
    """Check Qwen 3 VL generation with tower/connector LoRA enabled."""
    tester = Qwen2VLTester(
        TestConfig(
            model_path=QWEN3VL_MODEL_PATH,
            lora_path=qwen3vl_vision_lora_files,
            enable_tower_connector_lora=True,
            # Currently, tower_connector_lora is incompatible with
            # the multi-modal processor cache.
            # TODO: Remove this restriction
            mm_processor_cache_gb=0,
        )
    )
    for adapter_id in (1, 2):
        tester.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS,
            lora_id=adapter_id,
        )
def test_qwen2vl_multiple_lora_types(
    qwen2vl_language_lora_files,
    qwen2vl_vision_tower_connector_lora_files,
    qwen2vl_vision_tower_lora_files,
):
    """
    Test multiple LoRA adapter types (language, vision tower + connector,
    vision tower only) using the same LLM instance to verify mm_encoder_cache
    behavior with different LoRA requests.

    By reusing the same LLM instance across different LoRA requests, we ensure that
    the multimodal encoder cache correctly manages state transitions between
    language-only and vision-enabled LoRA adapters.
    """
    config = TestConfig(
        model_path=QWEN2VL_MODEL_PATH,
        # We'll override the lora_path for each specific test, but need to provide
        # an initial path for initialization
        lora_path=qwen2vl_language_lora_files,
        # Currently, tower_connector_lora is incompatible with
        # the multi-modal processor cache.
        # TODO: Remove this restriction
        mm_processor_cache_gb=0,
        enable_tower_connector_lora=True,
    )
    tester = Qwen2VLTester(config)

    # Each phase: (adapter files, LoRA IDs, expected outputs, adapter name).
    phases = [
        # Test 1: Language-only LoRA adapter
        (
            qwen2vl_language_lora_files,
            (1, 2),
            EXPECTED_OUTPUTS_LANGUAGE,
            "language_only",
        ),
        # Test 2: Vision tower + connector LoRA adapter
        (
            qwen2vl_vision_tower_connector_lora_files,
            (3, 4),
            EXPECTED_OUTPUTS_VISION,
            "vision_tower_connector",
        ),
        # Test 3: Vision tower only LoRA adapter (no connector)
        (
            qwen2vl_vision_tower_lora_files,
            (5, 6),
            EXPECTED_OUTPUTS_VISION_NO_CONNECTOR,
            "vision_tower",
        ),
    ]

    for lora_path, lora_ids, expected_outputs, lora_name in phases:
        # run_test reads the adapter path from the shared config at call time.
        tester.config.lora_path = lora_path
        for lora_id in lora_ids:
            tester.run_test(
                TEST_IMAGES,
                expected_outputs=expected_outputs,
                lora_id=lora_id,
                lora_name=lora_name,
            )