[V1] LoRA - Enable Serving Usecase (#12883)

Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
This commit is contained in:
Varun Sundar Rabindranath
2025-02-14 11:51:12 +05:30
committed by GitHub
parent f0b2da72a8
commit cbc40128eb
7 changed files with 210 additions and 7 deletions

View File

@@ -13,6 +13,7 @@ import zmq.asyncio
from vllm.config import VllmConfig
from vllm.logger import init_logger
from vllm.lora.request import LoRARequest
from vllm.transformers_utils.config import (
maybe_register_config_serialize_by_value)
from vllm.utils import get_exception_traceback, zmq_socket_ctx
@@ -146,6 +147,9 @@ class EngineCore:
def reset_prefix_cache(self):
    """Ask the scheduler to reset its prefix cache.

    Delegates to ``self.scheduler.reset_prefix_cache()``; whatever the
    scheduler returns is discarded, so this method returns ``None``.
    """
    scheduler = self.scheduler
    scheduler.reset_prefix_cache()
def add_lora(self, lora_request: LoRARequest) -> None:
    """Forward a LoRA request to the model executor.

    Args:
        lora_request: The request handed unchanged to
            ``self.model_executor.add_lora``.

    The executor's return value is discarded; this method returns
    ``None``.
    """
    executor = self.model_executor
    executor.add_lora(lora_request)
class EngineCoreProc(EngineCore):
"""ZMQ-wrapper for running EngineCore in background process."""
@@ -262,12 +266,15 @@ class EngineCoreProc(EngineCore):
self.reset_prefix_cache()
elif request_type == EngineCoreRequestType.PROFILE:
self.model_executor.profile(request)
elif request_type == EngineCoreRequestType.ADD_LORA:
self.model_executor.add_lora(request)
def process_input_socket(self, input_path: str):
"""Input socket IO thread."""
# Msgpack serialization decoding.
add_request_decoder = MsgpackDecoder(EngineCoreRequest)
add_lora_decoder = MsgpackDecoder(LoRARequest)
generic_decoder = MsgpackDecoder()
with zmq_socket_ctx(input_path, zmq.constants.PULL) as socket:
@@ -277,9 +284,14 @@ class EngineCoreProc(EngineCore):
request_type = EngineCoreRequestType(bytes(type_frame.buffer))
# Deserialize the request data.
decoder = add_request_decoder if (
request_type
== EngineCoreRequestType.ADD) else generic_decoder
decoder = None
if request_type == EngineCoreRequestType.ADD:
decoder = add_request_decoder
elif request_type == EngineCoreRequestType.ADD_LORA:
decoder = add_lora_decoder
else:
decoder = generic_decoder
request = decoder.decode(data_frame.buffer)
# Push to input queue for core busy loop.