[Core] Initialize LoRA support for tower and connector in multi-modal models (#26674)

Signed-off-by: bk-201 <joy25810@foxmail.com>
Signed-off-by: Jee Jee Li <pandaleefree@gmail.com>
Signed-off-by: prashanth058 <prashanth.dannamaneni@uipath.com>
Co-authored-by: bk-201 <joy25810@foxmail.com>
Co-authored-by: prashanth058 <prashanth.dannamaneni@uipath.com>
Co-authored-by: Anexdeus <5142168@mail.ru>
This commit is contained in:
Jee Jee Li
2025-12-26 20:48:20 +08:00
committed by GitHub
parent 0b544e6476
commit ce1eafd1a5
20 changed files with 635 additions and 80 deletions

View File

@@ -208,6 +208,31 @@ def qwen25vl_lora_files():
return snapshot_download(repo_id="jeeejeee/qwen25-vl-lora-pokemon")
@pytest.fixture(scope="session")
def qwen2vl_language_lora_files():
    """Session-cached snapshot of the Qwen2-VL language-only LoRA adapter."""
    repo = "prashanth058/qwen2vl-flickr-lora-language"
    return snapshot_download(repo_id=repo)
@pytest.fixture(scope="session")
def qwen2vl_vision_tower_connector_lora_files():
    """Session-cached snapshot of the Qwen2-VL tower + connector LoRA adapter."""
    repo = "prashanth058/qwen2vl-flickr-lora-tower-connector"
    return snapshot_download(repo_id=repo)
@pytest.fixture(scope="session")
def qwen2vl_vision_tower_lora_files():
    """Session-cached snapshot of the Qwen2-VL tower-only LoRA adapter."""
    repo = "prashanth058/qwen2vl-flickr-lora-tower"
    return snapshot_download(repo_id=repo)
@pytest.fixture(scope="session")
def qwen25vl_vision_lora_files():
    """Session-cached snapshot of the Qwen2.5-VL vision/connector LoRA adapter."""
    repo = "EpochEcho/qwen2.5-3b-vl-lora-vision-connector"
    return snapshot_download(repo_id=repo)
@pytest.fixture(scope="session")
def qwen3vl_vision_lora_files():
    """Session-cached snapshot of the Qwen3-VL vision/connector LoRA adapter."""
    repo = "EpochEcho/qwen3-4b-vl-lora-vision-connector"
    return snapshot_download(repo_id=repo)
@pytest.fixture(scope="session")
def tinyllama_lora_files():
    """Session-cached snapshot of the TinyLlama colorist LoRA adapter."""
    repo = "jashing/tinyllama-colorist-lora"
    return snapshot_download(repo_id=repo)

View File

@@ -18,6 +18,7 @@ from vllm.lora.layers import (
from vllm.lora.lora_model import LoRAModel
from vllm.lora.lora_weights import LoRALayerWeights, PackedLoRALayerWeights
from vllm.lora.model_manager import (
DEFAULT_LANGUAGE_WRAPPER_KEY,
LoRAMapping,
LoRAModelManager,
LRUCacheLoRAModelManager,
@@ -183,9 +184,11 @@ def test_lora_model_manager(dist_init, dummy_model, device):
assert manager.activate_adapter(2)
assert manager.lora_index_to_id[0] == 3
assert manager.lora_index_to_id[1] == 2
assert manager.device == device
assert manager.punica_wrapper.device == device
assert (
manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device
== device
)
assert hasattr(manager, "supported_lora_modules")
assert sorted(manager.supported_lora_modules) == [
"dense1",
@@ -278,8 +281,10 @@ def test_lora_lru_cache_model_manager(dist_init, dummy_model, device):
assert manager.remove_adapter(3)
with pytest.raises(ValueError):
assert manager.pin_adapter(3)
assert manager.punica_wrapper.device == device
assert (
manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device
== device
)
assert manager.device == device
@@ -402,7 +407,10 @@ def test_lru_lora_model_manager(dist_init, dummy_model, device):
assert manager.remove_oldest_adapter()
assert set(manager.list_adapters()) == {1}
assert manager.punica_wrapper.device == device
assert (
manager.punica_wrapper_mapping.get(DEFAULT_LANGUAGE_WRAPPER_KEY).device
== device
)
assert manager.device == device
@@ -514,7 +522,10 @@ def test_lru_cache_worker_adapter_manager(dist_init, dummy_model, device, tmp_pa
)
assert worker_adapter_manager.device == device
assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device
punica_wrapper = worker_adapter_manager._adapter_manager.punica_wrapper_mapping.get(
DEFAULT_LANGUAGE_WRAPPER_KEY
)
assert punica_wrapper.device == device
@pytest.mark.parametrize("device", DEVICES)
@@ -618,7 +629,10 @@ def test_worker_adapter_manager(dist_init, dummy_model_gate_up, device, tmp_path
)
assert worker_adapter_manager.device == device
assert worker_adapter_manager._adapter_manager.punica_wrapper.device == device
punica_wrapper = worker_adapter_manager._adapter_manager.punica_wrapper_mapping.get(
DEFAULT_LANGUAGE_WRAPPER_KEY
)
assert punica_wrapper.device == device
@pytest.mark.parametrize("device", DEVICES)

View File

@@ -14,9 +14,12 @@ class TestConfig:
lora_path: str
max_num_seqs: int = 2
max_loras: int = 2
max_lora_rank: int = 16
max_model_len: int = 4096
max_lora_rank: int = 32
enable_tower_connector_lora: bool = False
max_model_len: int = 8192
gpu_memory_utilization: float = 0.85
mm_processor_kwargs: dict[str, int] | None = None
mm_processor_cache_gb: float = 4
def __post_init__(self):
if self.mm_processor_kwargs is None:
@@ -48,8 +51,11 @@ class Qwen2VLTester:
enable_lora=True,
max_loras=self.config.max_loras,
max_lora_rank=self.config.max_lora_rank,
enable_tower_connector_lora=self.config.enable_tower_connector_lora,
trust_remote_code=True,
gpu_memory_utilization=self.config.gpu_memory_utilization,
mm_processor_kwargs=self.config.mm_processor_kwargs,
mm_processor_cache_gb=self.config.mm_processor_cache_gb,
max_model_len=self.config.max_model_len,
)
@@ -58,6 +64,7 @@ class Qwen2VLTester:
images: list[ImageAsset],
expected_outputs: list[str],
lora_id: int | None = None,
lora_name: str | None = None,
temperature: float = 0,
max_tokens: int = 5,
):
@@ -73,10 +80,11 @@ class Qwen2VLTester:
for asset in images
]
lora_request = LoRARequest(str(lora_id), lora_id, self.config.lora_path)
lora_request = LoRARequest(
lora_name if lora_name else str(lora_id), lora_id, self.config.lora_path
)
outputs = self.llm.generate(inputs, sampling_params, lora_request=lora_request)
generated_texts = [output.outputs[0].text.strip() for output in outputs]
# Validate outputs
for generated, expected in zip(generated_texts, expected_outputs):
assert expected.startswith(generated), (
@@ -127,6 +135,22 @@ EXPECTED_OUTPUTS = [
"A majestic skyscraper stands tall, partially obscured by a vibrant canopy of cherry blossoms, against a clear blue sky.", # noqa: E501
]
# Reference generations for the language-only LoRA adapter; one entry per
# test image, compared prefix-wise by run_test.
EXPECTED_OUTPUTS_LANGUAGE = [
"A stop sign is shown in an Asian city, with buildings and a car in the "
"background.",
"The Tokyo Skytree can be seen behind the pink blossoms of the cherry trees.",
]
# Reference generations when the LoRA covers the vision tower AND the
# multimodal connector.
EXPECTED_OUTPUTS_VISION = [
"A stop sign in front of oriental buildings.",
"A tree with pink flowers in front of it and a blue sky behind the flowers.",
]
# Reference generations when the LoRA covers only the vision tower
# (connector left at base weights).
EXPECTED_OUTPUTS_VISION_NO_CONNECTOR = [
"A stop sign is located on the street of a Chinese neighborhood.",
"A closeup shot of the Tokyo Skytree with pink flowers in the foreground.",
]
# NOTE - beam search .text contains the whole text
EXPECTED_BEAM_SEARCH_OUTPUTS = [
[
@@ -137,6 +161,7 @@ EXPECTED_BEAM_SEARCH_OUTPUTS = [
# Hugging Face model ids of the base models exercised by the LoRA tests below.
QWEN2VL_MODEL_PATH = "Qwen/Qwen2-VL-2B-Instruct"
QWEN25VL_MODEL_PATH = "Qwen/Qwen2.5-VL-3B-Instruct"
QWEN3VL_MODEL_PATH = "Qwen/Qwen3-VL-4B-Instruct"
def test_qwen2vl_lora(qwen2vl_lora_files):
@@ -175,3 +200,99 @@ def test_qwen25vl_lora(qwen25vl_lora_files):
# Test with different LoRA IDs
for lora_id in [1, 2]:
tester.run_test(TEST_IMAGES, expected_outputs=EXPECTED_OUTPUTS, lora_id=lora_id)
def test_qwen25vl_vision_lora(qwen25vl_vision_lora_files):
    """Run the Qwen2.5-VL vision/connector LoRA adapter over the test images."""
    cfg = TestConfig(
        model_path=QWEN25VL_MODEL_PATH,
        lora_path=qwen25vl_vision_lora_files,
        # Currently, tower_connector_lora is incompatible with
        # the multi-modal processor cache.
        # TODO: Remove this restriction
        mm_processor_cache_gb=0,
        enable_tower_connector_lora=True,
    )
    runner = Qwen2VLTester(cfg)
    # Two distinct adapter ids exercise adapter registration twice.
    for adapter_id in (1, 2):
        runner.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS,
            lora_id=adapter_id,
        )
def test_qwen3vl_vision_lora(qwen3vl_vision_lora_files):
    """Run the Qwen3-VL vision/connector LoRA adapter over the test images."""
    cfg = TestConfig(
        model_path=QWEN3VL_MODEL_PATH,
        lora_path=qwen3vl_vision_lora_files,
        # Currently, tower_connector_lora is incompatible with
        # the multi-modal processor cache.
        # TODO: Remove this restriction
        mm_processor_cache_gb=0,
        enable_tower_connector_lora=True,
    )
    runner = Qwen2VLTester(cfg)
    # Two distinct adapter ids exercise adapter registration twice.
    for adapter_id in (1, 2):
        runner.run_test(
            TEST_IMAGES,
            expected_outputs=EXPECTED_OUTPUTS,
            lora_id=adapter_id,
        )
def test_qwen2vl_multiple_lora_types(
    qwen2vl_language_lora_files,
    qwen2vl_vision_tower_connector_lora_files,
    qwen2vl_vision_tower_lora_files,
):
    """
    Exercise three LoRA adapter types (language-only, vision tower + connector,
    vision tower only) against a single shared LLM instance.

    Reusing one engine across the different adapter kinds verifies that the
    mm_encoder_cache correctly manages state transitions between language-only
    and vision-enabled LoRA requests.
    """
    cfg = TestConfig(
        model_path=QWEN2VL_MODEL_PATH,
        # We'll override the lora_path for each specific test, but need to provide
        # an initial path for initialization
        lora_path=qwen2vl_language_lora_files,
        # Currently, tower_connector_lora is incompatible with
        # the multi-modal processor cache.
        # TODO: Remove this restriction
        mm_processor_cache_gb=0,
        enable_tower_connector_lora=True,
    )
    tester = Qwen2VLTester(cfg)

    # One entry per adapter type: (adapter files, expected captions, adapter
    # name, lora ids). Ids are unique across scenarios so every request
    # registers as a fresh adapter in the engine.
    scenarios = [
        (
            qwen2vl_language_lora_files,
            EXPECTED_OUTPUTS_LANGUAGE,
            "language_only",
            (1, 2),
        ),
        (
            qwen2vl_vision_tower_connector_lora_files,
            EXPECTED_OUTPUTS_VISION,
            "vision_tower_connector",
            (3, 4),
        ),
        (
            qwen2vl_vision_tower_lora_files,
            EXPECTED_OUTPUTS_VISION_NO_CONNECTOR,
            "vision_tower",
            (5, 6),
        ),
    ]
    for lora_files, expected, adapter_name, lora_ids in scenarios:
        tester.config.lora_path = lora_files
        for lora_id in lora_ids:
            tester.run_test(
                TEST_IMAGES,
                expected_outputs=expected,
                lora_id=lora_id,
                lora_name=adapter_name,
            )