[V1] LoRA - Enable Serving Usecase (#12883)

Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
2025-02-14 11:51:12 +05:30
parent f0b2da72a8
commit cbc40128eb
7 changed files with 210 additions and 7 deletions
--- a/vllm/v1/engine/async_llm.py
+++ b/vllm/v1/engine/async_llm.py
@@ -361,6 +361,10 @@ class AsyncLLM(EngineClient):
    async def reset_prefix_cache(self) -> None:
        await self.engine_core.reset_prefix_cache_async()

+    async def add_lora(self, lora_request: LoRARequest) -> None:
+        """Load a new LoRA adapter into the engine for future requests."""
+        await self.engine_core.add_lora_async(lora_request)
+
    @property
    def is_running(self) -> bool:
        return True
@@ -376,7 +380,3 @@ class AsyncLLM(EngineClient):
    @property
    def dead_error(self) -> BaseException:
        return Exception()  # TODO: implement
-
-    async def add_lora(self, lora_request: LoRARequest) -> None:
-        """Load a new LoRA adapter into the engine for future requests."""
-        raise NotImplementedError("LoRA not yet supported in V1")