[Feat] allow inplace loading lora (#31326)

Signed-off-by: Jackmin801 <ongjackm@gmail.com>
Signed-off-by: Jackmin801 <56836461+Jackmin801@users.noreply.github.com>
Co-authored-by: Jee Jee Li <pandaleefree@gmail.com>
This commit is contained in:
Jackmin801
2026-01-19 18:15:20 -08:00
committed by GitHub
parent 05dc4bfab6
commit 12dab78f49
10 changed files with 262 additions and 7 deletions

View File

@@ -233,6 +233,18 @@ def qwen3vl_vision_lora_files():
return snapshot_download(repo_id="EpochEcho/qwen3-4b-vl-lora-vision-connector")
@pytest.fixture(scope="session")
def qwen3_meowing_lora_files():
    """Session-scoped fixture: download the Qwen3 "Meow" LoRA adapter once."""
    repo = "Jackmin108/Qwen3-0.6B-Meow-LoRA"
    return snapshot_download(repo_id=repo)
@pytest.fixture(scope="session")
def qwen3_woofing_lora_files():
    """Session-scoped fixture: download the Qwen3 "Woof" LoRA adapter once."""
    repo = "Jackmin108/Qwen3-0.6B-Woof-LoRA"
    return snapshot_download(repo_id=repo)
@pytest.fixture(scope="session")
def tinyllama_lora_files():
    """Download TinyLlama colorist LoRA files once per test session."""
    return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")

View File

@@ -30,9 +30,11 @@ LORA_TEST_EXPECTED = [
]
def format_chatml_messages(
    prompt: str, system_prompt: str = "You are a helpful assistant."
) -> list[dict[str, str]]:
    """Build a two-message ChatML conversation.

    Args:
        prompt: The user turn's content.
        system_prompt: The system turn's content; defaults to the generic
            assistant prompt so existing single-argument callers are unchanged.

    Returns:
        A list of two ``{"role", "content"}`` message dicts: system, then user.
    """
    # Defect fixed: the diff span interleaved the old and new signatures and
    # both system-content lines; this is the post-change function, reformatted.
    return [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
@@ -185,3 +187,110 @@ def test_multiple_lora_requests():
single_lora_request = lora_request[0]
outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
assert len(PROMPTS) == len(outputs)
def test_load_inplace_offline_reload(
    qwen3_meowing_lora_files: str, qwen3_woofing_lora_files: str
) -> None:
    """
    Test that load_inplace=True allows reloading LoRA adapters with the same ID
    in offline mode (using LLM class directly).
    """
    llm = LLM(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=2,
        max_lora_rank=LORA_RANK,
        max_model_len=512,
        gpu_memory_utilization=0.5,
        enforce_eager=True,
    )
    adapter_id = 1
    messages = format_chatml_messages(
        "Make your favorite animal noise.",
        system_prompt="Follow the instructions to make animal noises",
    )
    sampling_params = SamplingParams(temperature=0, max_tokens=10)
    # Initial load of the meowing LoRA. Note: load_inplace is NOT set here
    # (the previous comment incorrectly claimed load_inplace=True); this is a
    # plain first-time load of adapter ID 1.
    meowing_request = LoRARequest(
        lora_name="test-adapter",
        lora_int_id=adapter_id,
        lora_path=qwen3_meowing_lora_files,
    )
    outputs = llm.chat([messages], sampling_params, lora_request=meowing_request)
    first_output = outputs[0].outputs[0].text.strip()
    assert "Meow Meow Meow" in first_output, (
        f"Expected meowing output, got: {first_output}"
    )
    # Reload with woofing LoRA (same ID, different weights, load_inplace=True)
    woofing_request = LoRARequest(
        lora_name="test-adapter-woof",
        lora_int_id=adapter_id,  # Same ID
        lora_path=qwen3_woofing_lora_files,  # Different weights
        load_inplace=True,  # Force reload
    )
    outputs = llm.chat([messages], sampling_params, lora_request=woofing_request)
    second_output = outputs[0].outputs[0].text.strip()
    assert "Woof Woof Woof" in second_output, (
        f"Expected woofing output, got: {second_output}"
    )
def test_load_inplace_false_no_reload(
    qwen3_meowing_lora_files: str, qwen3_woofing_lora_files: str
) -> None:
    """
    Test that load_inplace=False prevents reloading when an adapter
    with the same ID already exists.
    """
    llm = LLM(
        model=MODEL_PATH,
        enable_lora=True,
        max_loras=2,
        max_lora_rank=LORA_RANK,
        max_model_len=512,
        gpu_memory_utilization=0.5,
        enforce_eager=True,
    )
    adapter_id = 2
    messages = format_chatml_messages(
        "Make your favorite animal noise.",
        system_prompt="Follow the instructions to make animal noises",
    )
    sampling_params = SamplingParams(temperature=0, max_tokens=10)
    # Initial load of the meowing LoRA. Note: load_inplace is NOT set here
    # (the previous comment incorrectly claimed load_inplace=True); this is a
    # plain first-time load of adapter ID 2.
    meowing_request_initial = LoRARequest(
        lora_name="test-adapter-2",
        lora_int_id=adapter_id,
        lora_path=qwen3_meowing_lora_files,
    )
    outputs = llm.chat(
        [messages], sampling_params, lora_request=meowing_request_initial
    )
    first_output = outputs[0].outputs[0].text.strip()
    assert "Meow Meow Meow" in first_output, (
        f"Expected meowing output, got: {first_output}"
    )
    # Try to load woofing LoRA with same ID but load_inplace left at its
    # default (False). This should NOT reload (adapter 2 already exists).
    woofing_request_no_reload = LoRARequest(
        lora_name="test-adapter-2-woof",
        lora_int_id=adapter_id,  # Same ID
        lora_path=qwen3_woofing_lora_files,
    )
    outputs = llm.chat(
        [messages], sampling_params, lora_request=woofing_request_no_reload
    )
    second_output = outputs[0].outputs[0].text.strip()
    # Should still get meowing output because it didn't reload
    assert "Meow Meow Meow" in second_output, (
        f"Expected meowing output (no reload), got: {second_output}"
    )