[Feat] allow inplace loading lora (#31326)

Signed-off-by: Jackmin801 <ongjackm@gmail.com>
Signed-off-by: Jackmin801 <56836461+Jackmin801@users.noreply.github.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
This commit is contained in:
Jackmin801
2026-01-19 18:15:20 -08:00
committed by GitHub
parent 05dc4bfab6
commit 12dab78f49
10 changed files with 262 additions and 7 deletions

View File

@@ -233,6 +233,18 @@ def qwen3vl_vision_lora_files():
return snapshot_download(repo_id="EpochEcho/qwen3-4b-vl-lora-vision-connector")
@pytest.fixture(scope="session")
def qwen3_meowing_lora_files():
    """Session-scoped fixture: download the Qwen3 "Meow" LoRA adapter once."""
    repo = "Jackmin108/Qwen3-0.6B-Meow-LoRA"
    return snapshot_download(repo_id=repo)
@pytest.fixture(scope="session")
def qwen3_woofing_lora_files():
    """Session-scoped fixture: download the Qwen3 "Woof" LoRA adapter once."""
    repo = "Jackmin108/Qwen3-0.6B-Woof-LoRA"
    return snapshot_download(repo_id=repo)
@pytest.fixture(scope="session")
def tinyllama_lora_files():
    """Download TinyLlama colorist LoRA files once per test session."""
    return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")

View File

@@ -30,9 +30,11 @@ LORA_TEST_EXPECTED = [
]
def format_chatml_messages(
    prompt: str, system_prompt: str = "You are a helpful assistant."
) -> list[dict[str, str]]:
    """Build a two-message ChatML conversation.

    Args:
        prompt: The user turn's content.
        system_prompt: The system turn's content; defaults to the generic
            assistant prompt so existing single-argument callers are unchanged.

    Returns:
        A list of two ``{"role", "content"}`` message dicts: system, then user.
    """
    # Defect fixed: the diff span interleaved the old and new signatures and
    # both system-content lines; this is the post-change function, reformatted.
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
@@ -185,3 +187,110 @@ def test_multiple_lora_requests():
single_lora_request = lora_request[0]
outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
assert len(PROMPTS) == len(outputs)
def test_load_inplace_offline_reload(
    qwen3_meowing_lora_files: str, qwen3_woofing_lora_files: str
) -> None:
    """
    Test that load_inplace=True allows reloading LoRA adapters with the same ID
    in offline mode (using LLM class directly).
    """
    llm = LLM(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=2,
        max_lora_rank=LORA_RANK,
        max_model_len=512,
        gpu_memory_utilization=0.5,
        enforce_eager=True,
    )
    adapter_id = 1
    messages = format_chatml_messages(
        "Make your favorite animal noise.",
        system_prompt="Follow the instructions to make animal noises",
    )
    sampling_params = SamplingParams(temperature=0, max_tokens=10)
    # Initial load of the meowing LoRA. Note: load_inplace is NOT set here
    # (the previous comment incorrectly claimed load_inplace=True); this is a
    # plain first-time load of adapter ID 1.
    meowing_request = LoRARequest(
        lora_name="test-adapter",
        lora_int_id=adapter_id,
        lora_path=qwen3_meowing_lora_files,
    )
    outputs = llm.chat([messages], sampling_params, lora_request=meowing_request)
    first_output = outputs[0].outputs[0].text.strip()
    assert "Meow Meow Meow" in first_output, (
        f"Expected meowing output, got: {first_output}"
    )
    # Reload with woofing LoRA (same ID, different weights, load_inplace=True)
    woofing_request = LoRARequest(
        lora_name="test-adapter-woof",
        lora_int_id=adapter_id,  # Same ID
        lora_path=qwen3_woofing_lora_files,  # Different weights
        load_inplace=True,  # Force reload
    )
    outputs = llm.chat([messages], sampling_params, lora_request=woofing_request)
    second_output = outputs[0].outputs[0].text.strip()
    assert "Woof Woof Woof" in second_output, (
        f"Expected woofing output, got: {second_output}"
    )
def test_load_inplace_false_no_reload(
    qwen3_meowing_lora_files: str, qwen3_woofing_lora_files: str
) -> None:
    """
    Test that load_inplace=False prevents reloading when an adapter
    with the same ID already exists.
    """
    llm = LLM(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=2,
        max_lora_rank=LORA_RANK,
        max_model_len=512,
        gpu_memory_utilization=0.5,
        enforce_eager=True,
    )
    adapter_id = 2
    messages = format_chatml_messages(
        "Make your favorite animal noise.",
        system_prompt="Follow the instructions to make animal noises",
    )
    sampling_params = SamplingParams(temperature=0, max_tokens=10)
    # Initial load of the meowing LoRA. Note: load_inplace is NOT set here
    # (the previous comment incorrectly claimed load_inplace=True); this is a
    # plain first-time load of adapter ID 2.
    meowing_request_initial = LoRARequest(
        lora_name="test-adapter-2",
        lora_int_id=adapter_id,
        lora_path=qwen3_meowing_lora_files,
    )
    outputs = llm.chat(
        [messages], sampling_params, lora_request=meowing_request_initial
    )
    first_output = outputs[0].outputs[0].text.strip()
    assert "Meow Meow Meow" in first_output, (
        f"Expected meowing output, got: {first_output}"
    )
    # Try to load woofing LoRA with same ID but load_inplace left at its
    # default (False). This should NOT reload (adapter 2 already exists).
    woofing_request_no_reload = LoRARequest(
        lora_name="test-adapter-2-woof",
        lora_int_id=adapter_id,  # Same ID
        lora_path=qwen3_woofing_lora_files,
    )
    outputs = llm.chat(
        [messages], sampling_params, lora_request=woofing_request_no_reload
    )
    second_output = outputs[0].outputs[0].text.strip()
    # Should still get meowing output because it didn't reload
    assert "Meow Meow Meow" in second_output, (
        f"Expected meowing output (no reload), got: {second_output}"
    )