diff --git a/docs/features/lora.md b/docs/features/lora.md
index dda6b4768..1a30ad7b0 100644
--- a/docs/features/lora.md
+++ b/docs/features/lora.md
@@ -210,6 +210,24 @@ Alternatively, follow these example steps to implement your own plugin:
 
 For more details, refer to the [vLLM's Plugins System](../design/plugin_system.md).
 
+### In-Place LoRA Reloading
+
+When dynamically loading LoRA adapters, you may need to replace an existing adapter with updated weights while keeping the same name. The `load_inplace` parameter of the load request enables this. The need commonly arises in asynchronous reinforcement learning setups, where adapters are continuously updated and swapped in without interrupting ongoing inference.
+
+When `load_inplace` is `true`, vLLM replaces the existing adapter with the new one. When it is `false` (the default), loading an adapter whose name is already loaded returns an error.
+
+Example request to load or replace a LoRA adapter with the same name:
+
+```bash
+curl -X POST http://localhost:8000/v1/load_lora_adapter \
+-H "Content-Type: application/json" \
+-d '{
+    "lora_name": "my-adapter",
+    "lora_path": "/path/to/adapter/v2",
+    "load_inplace": true
+}'
+```
+
 ## New format for `--lora-modules`
 
 In the previous version, users would provide LoRA modules via the following format, either as a key-value pair or in JSON format. 
For example:
diff --git a/tests/entrypoints/conftest.py b/tests/entrypoints/conftest.py
index 9ab50c44a..c2e9a1de3 100644
--- a/tests/entrypoints/conftest.py
+++ b/tests/entrypoints/conftest.py
@@ -195,6 +195,22 @@ def qwen3_lora_files():
     return snapshot_download(repo_id="charent/self_cognition_Alice")
 
 
+@pytest.fixture(scope="session")
+def qwen3_meowing_lora_files():
+    """Download Qwen3 Meow LoRA files once per test session."""
+    from huggingface_hub import snapshot_download
+
+    return snapshot_download(repo_id="Jackmin108/Qwen3-0.6B-Meow-LoRA")
+
+
+@pytest.fixture(scope="session")
+def qwen3_woofing_lora_files():
+    """Download Qwen3 Woof LoRA files once per test session."""
+    from huggingface_hub import snapshot_download
+
+    return snapshot_download(repo_id="Jackmin108/Qwen3-0.6B-Woof-LoRA")
+
+
 @pytest.fixture(scope="session")
 def opt125_lora_files() -> str:
     """Download opt-125m LoRA files once per test session."""
diff --git a/tests/entrypoints/openai/test_lora_adapters.py b/tests/entrypoints/openai/test_lora_adapters.py
index 22461f470..aa664f6d7 100644
--- a/tests/entrypoints/openai/test_lora_adapters.py
+++ b/tests/entrypoints/openai/test_lora_adapters.py
@@ -104,6 +104,82 @@ async def test_dynamic_lora_lineage(client: openai.AsyncOpenAI, qwen3_lora_files
     assert dynamic_lora_model.id == "qwen3-lora-3"
 
 
+@pytest.mark.asyncio
+async def test_load_lora_adapter_with_same_name_replaces_inplace(
+    client: openai.AsyncOpenAI, qwen3_meowing_lora_files, qwen3_woofing_lora_files
+):
+    """Test that loading a LoRA adapter with the same name replaces it inplace."""
+    adapter_name = "replaceable-adapter"
+    messages = [
+        {"content": "Follow the instructions to make animal noises", "role": "system"},
+        {"content": "Make your favorite animal noise.", "role": "user"},
+    ]
+
+    # Load LoRA that makes model meow
+    response = await client.post(
+        "load_lora_adapter",
+        cast_to=str,
+        body={"lora_name": adapter_name, "lora_path": qwen3_meowing_lora_files},
+    )
+    assert "success" in response.lower()
+
+    completion = await client.chat.completions.create(
+        model=adapter_name,
+        messages=messages,
+        max_tokens=10,
+    )
+    assert "Meow Meow Meow" in completion.choices[0].message.content
+
+    # Load LoRA that makes model woof
+    response = await client.post(
+        "load_lora_adapter",
+        cast_to=str,
+        body={
+            "lora_name": adapter_name,
+            "lora_path": qwen3_woofing_lora_files,
+            "load_inplace": True,
+        },
+    )
+    assert "success" in response.lower()
+
+    completion = await client.chat.completions.create(
+        model=adapter_name,
+        messages=messages,
+        max_tokens=10,
+    )
+    assert "Woof Woof Woof" in completion.choices[0].message.content
+
+
+@pytest.mark.asyncio
+async def test_load_lora_adapter_with_load_inplace_false_errors(
+    client: openai.AsyncOpenAI, qwen3_meowing_lora_files
+):
+    """Test that load_inplace=False returns an error when adapter already exists."""
+    adapter_name = "test-load-inplace-false"
+
+    # Load LoRA adapter first time (should succeed)
+    response = await client.post(
+        "load_lora_adapter",
+        cast_to=str,
+        body={"lora_name": adapter_name, "lora_path": qwen3_meowing_lora_files},
+    )
+    assert "success" in response.lower()
+
+    # Try to load the same adapter again with load_inplace=False (should fail)
+    with pytest.raises(openai.BadRequestError) as exc_info:
+        await client.post(
+            "load_lora_adapter",
+            cast_to=str,
+            body={
+                "lora_name": adapter_name,
+                "lora_path": qwen3_meowing_lora_files,
+            },
+        )
+
+    # Verify the error message
+    assert "already been loaded" in str(exc_info.value)
+
+
 @pytest.mark.asyncio
 async def test_dynamic_lora_not_found(client: openai.AsyncOpenAI):
     with pytest.raises(openai.NotFoundError):
diff --git a/tests/lora/conftest.py b/tests/lora/conftest.py
index 928d64827..deb1ab92d 100644
--- a/tests/lora/conftest.py
+++ b/tests/lora/conftest.py
@@ -233,6 +233,18 @@ def qwen3vl_vision_lora_files():
     return snapshot_download(repo_id="EpochEcho/qwen3-4b-vl-lora-vision-connector")
 
 
+@pytest.fixture(scope="session")
+def qwen3_meowing_lora_files():
+    """Download Qwen3 Meow LoRA files once per test session."""
+    return snapshot_download(repo_id="Jackmin108/Qwen3-0.6B-Meow-LoRA")
+
+
+@pytest.fixture(scope="session")
+def qwen3_woofing_lora_files():
+    """Download Qwen3 Woof LoRA files once per test session."""
+    return snapshot_download(repo_id="Jackmin108/Qwen3-0.6B-Woof-LoRA")
+
+
 @pytest.fixture(scope="session")
 def tinyllama_lora_files():
     return snapshot_download(repo_id="jashing/tinyllama-colorist-lora")
diff --git a/tests/lora/test_llm_with_multi_loras.py b/tests/lora/test_llm_with_multi_loras.py
index 269a1ade7..56bac026b 100644
--- a/tests/lora/test_llm_with_multi_loras.py
+++ b/tests/lora/test_llm_with_multi_loras.py
@@ -30,9 +30,11 @@ LORA_TEST_EXPECTED = [
 ]
 
 
-def format_chatml_messages(prompt: str):
+def format_chatml_messages(
+    prompt: str, system_prompt: str = "You are a helpful assistant."
+) -> list[dict[str, str]]:
     return [
-        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "system", "content": system_prompt},
         {"role": "user", "content": prompt},
     ]
 
@@ -185,3 +187,110 @@ def test_multiple_lora_requests():
     single_lora_request = lora_request[0]
     outputs = llm.generate(PROMPTS, lora_request=single_lora_request)
     assert len(PROMPTS) == len(outputs)
+
+
+def test_load_inplace_offline_reload(
+    qwen3_meowing_lora_files: str, qwen3_woofing_lora_files: str
+) -> None:
+    """
+    Test that load_inplace=True allows reloading LoRA adapters with the same ID
+    in offline mode (using LLM class directly).
+    """
+    llm = LLM(
+        model=MODEL_PATH,
+        enable_lora=True,
+        max_loras=2,
+        max_lora_rank=LORA_RANK,
+        max_model_len=512,
+        gpu_memory_utilization=0.5,
+        enforce_eager=True,
+    )
+    adapter_id = 1
+    messages = format_chatml_messages(
+        "Make your favorite animal noise.",
+        system_prompt="Follow the instructions to make animal noises",
+    )
+    sampling_params = SamplingParams(temperature=0, max_tokens=10)
+
+    # Initial load of the meowing LoRA (load_inplace left at its default, False)
+    meowing_request = LoRARequest(
+        lora_name="test-adapter",
+        lora_int_id=adapter_id,
+        lora_path=qwen3_meowing_lora_files,
+    )
+
+    outputs = llm.chat([messages], sampling_params, lora_request=meowing_request)
+    first_output = outputs[0].outputs[0].text.strip()
+    assert "Meow Meow Meow" in first_output, (
+        f"Expected meowing output, got: {first_output}"
+    )
+
+    # Reload with woofing LoRA (same ID, different weights, load_inplace=True)
+    woofing_request = LoRARequest(
+        lora_name="test-adapter-woof",
+        lora_int_id=adapter_id,  # Same ID
+        lora_path=qwen3_woofing_lora_files,  # Different weights
+        load_inplace=True,  # Force reload
+    )
+
+    outputs = llm.chat([messages], sampling_params, lora_request=woofing_request)
+    second_output = outputs[0].outputs[0].text.strip()
+    assert "Woof Woof Woof" in second_output, (
+        f"Expected woofing output, got: {second_output}"
+    )
+
+
+def test_load_inplace_false_no_reload(
+    qwen3_meowing_lora_files: str, qwen3_woofing_lora_files: str
+) -> None:
+    """
+    Test that load_inplace=False prevents reloading when an adapter
+    with the same ID already exists.
+    """
+    llm = LLM(
+        model=MODEL_PATH,
+        enable_lora=True,
+        max_loras=2,
+        max_lora_rank=LORA_RANK,
+        max_model_len=512,
+        gpu_memory_utilization=0.5,
+        enforce_eager=True,
+    )
+    adapter_id = 2
+    messages = format_chatml_messages(
+        "Make your favorite animal noise.",
+        system_prompt="Follow the instructions to make animal noises",
+    )
+    sampling_params = SamplingParams(temperature=0, max_tokens=10)
+
+    # Initial load of the meowing LoRA (load_inplace left at its default, False)
+    meowing_request_initial = LoRARequest(
+        lora_name="test-adapter-2",
+        lora_int_id=adapter_id,
+        lora_path=qwen3_meowing_lora_files,
+    )
+
+    outputs = llm.chat(
+        [messages], sampling_params, lora_request=meowing_request_initial
+    )
+    first_output = outputs[0].outputs[0].text.strip()
+    assert "Meow Meow Meow" in first_output, (
+        f"Expected meowing output, got: {first_output}"
+    )
+
+    # Try to load woofing LoRA with same ID but load_inplace=False
+    # This should NOT reload (adapter 2 already exists)
+    woofing_request_no_reload = LoRARequest(
+        lora_name="test-adapter-2-woof",
+        lora_int_id=adapter_id,  # Same ID
+        lora_path=qwen3_woofing_lora_files,
+    )
+
+    outputs = llm.chat(
+        [messages], sampling_params, lora_request=woofing_request_no_reload
+    )
+    second_output = outputs[0].outputs[0].text.strip()
+    # Should still get meowing output because it didn't reload
+    assert "Meow Meow Meow" in second_output, (
+        f"Expected meowing output (no reload), got: {second_output}"
+    )
diff --git a/vllm/entrypoints/openai/models/serving.py b/vllm/entrypoints/openai/models/serving.py
index 2d8cf8f33..a4b92e5ec 100644
--- a/vllm/entrypoints/openai/models/serving.py
+++ b/vllm/entrypoints/openai/models/serving.py
@@ -132,9 +132,16 @@ class OpenAIServingModels:
                 return error_check_ret
 
             lora_path = request.lora_path
-            unique_id = self.lora_id_counter.inc(1)
+            lora_int_id = (
+                self.lora_requests[lora_name].lora_int_id
+                if lora_name in self.lora_requests
+                else self.lora_id_counter.inc(1)
+            )
             lora_request = LoRARequest(
-                lora_name=lora_name, lora_int_id=unique_id, lora_path=lora_path
+                lora_name=lora_name,
+                lora_int_id=lora_int_id,
+                lora_path=lora_path,
+                load_inplace=request.load_inplace,
             )
             if base_model_name is not None and self.is_base_model(base_model_name):
                 lora_request.base_model_name = base_model_name
@@ -187,11 +194,13 @@ class OpenAIServingModels:
                 status_code=HTTPStatus.BAD_REQUEST,
             )
 
+        # Skipped for in-place reloads (load_inplace=True), which reuse the name:
         # Check if the lora adapter with the given name already exists
-        if request.lora_name in self.lora_requests:
+        if not request.load_inplace and request.lora_name in self.lora_requests:
             return create_error_response(
                 message=f"The lora adapter '{request.lora_name}' has already been "
-                "loaded.",
+                "loaded. If you want to load the adapter in place, set 'load_inplace'"
+                " to True.",
                 err_type="InvalidUserInput",
                 status_code=HTTPStatus.BAD_REQUEST,
             )
diff --git a/vllm/entrypoints/serve/lora/api_router.py b/vllm/entrypoints/serve/lora/api_router.py
index 51bfc755f..057bf5c2e 100644
--- a/vllm/entrypoints/serve/lora/api_router.py
+++ b/vllm/entrypoints/serve/lora/api_router.py
@@ -36,6 +36,7 @@ def attach_router(app: FastAPI):
         request_shape={
             "lora_name": "body.name",
             "lora_path": "body.src",
+            "load_inplace": "body.load_inplace || `false`",
         },
     )
     @router.post("/v1/load_lora_adapter", dependencies=[Depends(validate_json_request)])
diff --git a/vllm/entrypoints/serve/lora/protocol.py b/vllm/entrypoints/serve/lora/protocol.py
index e39f35f38..3e3a30cf3 100644
--- a/vllm/entrypoints/serve/lora/protocol.py
+++ b/vllm/entrypoints/serve/lora/protocol.py
@@ -7,6 +7,7 @@ from pydantic import BaseModel, Field
 class LoadLoRAAdapterRequest(BaseModel):
     lora_name: str
     lora_path: str
+    load_inplace: bool = False
 
 
 class UnloadLoRAAdapterRequest(BaseModel):
diff --git a/vllm/lora/request.py b/vllm/lora/request.py
index 2811fee1d..008ade5e5 100644
--- a/vllm/lora/request.py
+++ b/vllm/lora/request.py
@@ -15,6 +15,11 @@ class LoRARequest(
 
     lora_int_id must be globally unique for a given adapter.
     This is currently not enforced in vLLM.
+
+    load_inplace: If True, forces reloading the adapter even if one
+    with the same lora_int_id already exists in the cache. This replaces
+    the existing adapter in-place. If False (default), only loads if the
+    adapter is not already loaded.
     """
 
     lora_name: str
@@ -22,6 +27,7 @@ class LoRARequest(
     lora_path: str = ""
     base_model_name: str | None = msgspec.field(default=None)
     tensorizer_config_dict: dict | None = None
+    load_inplace: bool = False
 
     def __post_init__(self):
         if self.lora_int_id < 1:
diff --git a/vllm/lora/worker_manager.py b/vllm/lora/worker_manager.py
index 277e462a3..598c10407 100644
--- a/vllm/lora/worker_manager.py
+++ b/vllm/lora/worker_manager.py
@@ -254,13 +254,20 @@ class LRUCacheWorkerLoRAManager(WorkerLoRAManager):
         # This is ok because it's currently only called from
         # the single-threaded core engine loop.
 
-        if lora_request.lora_int_id not in self.list_adapters():
+        if (
+            lora_request.lora_int_id not in self.list_adapters()
+            or lora_request.load_inplace
+        ):
             # Load the new adapter first to ensure it is actually valid, before
             # evicting any existing adapters.
             # This may cause the # of loaded lora adapters to very temporarily
             # exceed `--max-cpu-loras`.
             lora = self._load_adapter(lora_request)
 
+            # Drop whatever is registered under this id so the new weights replace it
+            # (in-place reload). NOTE(review): also runs on first load - confirm no-op.
+            self._adapter_manager.remove_adapter(lora.id)
+
             # Loading succeeded, now check if we will exceed cache capacity and
             # evict if the oldest adapter if so
             if len(self._adapter_manager) + 1 > self._adapter_manager.capacity: