[Feat] allow inplace loading lora (#31326)
Signed-off-by: Jackmin801 <ongjackm@gmail.com> Signed-off-by: Jackmin801 <56836461+Jackmin801@users.noreply.github.com> Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
This commit is contained in:
@@ -233,6 +233,18 @@ def qwen3vl_vision_lora_files():
|
||||
return snapshot_download(repo_id="EpochEcho/qwen3-4b-vl-lora-vision-connector")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def qwen3_meowing_lora_files():
    """Download Qwen3 Meow LoRA files once per test session."""
    # session scope: the HF snapshot is fetched once and shared by all tests.
    return snapshot_download(repo_id="Jackmin108/Qwen3-0.6B-Meow-LoRA")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def qwen3_woofing_lora_files():
    """Download Qwen3 Woof LoRA files once per test session."""
    # session scope: the HF snapshot is fetched once and shared by all tests.
    return snapshot_download(repo_id="Jackmin108/Qwen3-0.6B-Woof-LoRA")
|
||||
|
||||
|
||||
@pytest.fixture(scope="session")
def tinyllama_lora_files():
    """Download TinyLlama colorist LoRA files once per test session."""
    return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
|
||||
|
||||
@@ -30,9 +30,11 @@ LORA_TEST_EXPECTED = [
|
||||
]
|
||||
|
||||
|
||||
def format_chatml_messages(
    prompt: str, system_prompt: str = "You are a helpful assistant."
) -> list[dict[str, str]]:
    """Build a minimal two-turn ChatML conversation.

    Args:
        prompt: The user-turn content.
        system_prompt: The system-turn content. Defaults to the generic
            assistant instruction, keeping the original single-argument
            call sites backward compatible.

    Returns:
        A list of ``{"role": ..., "content": ...}`` dicts: the system
        message followed by the user message.
    """
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
|
||||
|
||||
@@ -185,3 +187,110 @@ def test_multiple_lora_requests():
|
||||
single_lora_request = lora_request[0]
|
||||
outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
|
||||
assert len(PROMPTS) == len(outputs)
|
||||
|
||||
|
||||
def test_load_inplace_offline_reload(
    qwen3_meowing_lora_files: str, qwen3_woofing_lora_files: str
) -> None:
    """
    Test that load_inplace=True allows reloading LoRA adapters with the same ID
    in offline mode (using LLM class directly).
    """
    engine = LLM(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=2,
        max_lora_rank=LORA_RANK,
        max_model_len=512,
        gpu_memory_utilization=0.5,
        enforce_eager=True,
    )
    adapter_id = 1
    chat = format_chatml_messages(
        "Make your favorite animal noise.",
        system_prompt="Follow the instructions to make animal noises",
    )
    params = SamplingParams(temperature=0, max_tokens=10)

    def _generate(request: LoRARequest) -> str:
        # One greedy chat completion under the given adapter; return the
        # stripped generated text.
        results = engine.chat([chat], params, lora_request=request)
        return results[0].outputs[0].text.strip()

    # Initial load: register the meowing adapter under adapter_id.
    first_output = _generate(
        LoRARequest(
            lora_name="test-adapter",
            lora_int_id=adapter_id,
            lora_path=qwen3_meowing_lora_files,
        )
    )
    assert "Meow Meow Meow" in first_output, (
        f"Expected meowing output, got: {first_output}"
    )

    # Reload under the SAME integer ID with different weights;
    # load_inplace=True forces the engine to swap them in.
    second_output = _generate(
        LoRARequest(
            lora_name="test-adapter-woof",
            lora_int_id=adapter_id,
            lora_path=qwen3_woofing_lora_files,
            load_inplace=True,
        )
    )
    assert "Woof Woof Woof" in second_output, (
        f"Expected woofing output, got: {second_output}"
    )
|
||||
|
||||
|
||||
def test_load_inplace_false_no_reload(
    qwen3_meowing_lora_files: str, qwen3_woofing_lora_files: str
) -> None:
    """
    Test that load_inplace=False prevents reloading when an adapter
    with the same ID already exists.
    """
    engine = LLM(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=2,
        max_lora_rank=LORA_RANK,
        max_model_len=512,
        gpu_memory_utilization=0.5,
        enforce_eager=True,
    )
    adapter_id = 2
    chat = format_chatml_messages(
        "Make your favorite animal noise.",
        system_prompt="Follow the instructions to make animal noises",
    )
    params = SamplingParams(temperature=0, max_tokens=10)

    def _generate(request: LoRARequest) -> str:
        # One greedy chat completion under the given adapter; return the
        # stripped generated text.
        results = engine.chat([chat], params, lora_request=request)
        return results[0].outputs[0].text.strip()

    # Initial load: register the meowing adapter under adapter_id.
    first_output = _generate(
        LoRARequest(
            lora_name="test-adapter-2",
            lora_int_id=adapter_id,
            lora_path=qwen3_meowing_lora_files,
        )
    )
    assert "Meow Meow Meow" in first_output, (
        f"Expected meowing output, got: {first_output}"
    )

    # Request the woofing weights under the SAME integer ID without
    # load_inplace: the engine should keep the already-registered adapter
    # and ignore the new path, so the output stays "meowing".
    second_output = _generate(
        LoRARequest(
            lora_name="test-adapter-2-woof",
            lora_int_id=adapter_id,
            lora_path=qwen3_woofing_lora_files,
        )
    )
    assert "Meow Meow Meow" in second_output, (
        f"Expected meowing output (no reload), got: {second_output}"
    )
|
||||
|
||||
Reference in New Issue
Block a user