[V1] LoRA - Enable Serving Usecase (#12883)
Signed-off-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
Co-authored-by: Varun Sundar Rabindranath <varun@neuralmagic.com>
This commit is contained in:
committed by
GitHub
parent
f0b2da72a8
commit
cbc40128eb
@@ -13,6 +13,7 @@ import zmq.asyncio
|
||||
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.logger import init_logger
|
||||
from vllm.lora.request import LoRARequest
|
||||
from vllm.transformers_utils.config import (
|
||||
maybe_register_config_serialize_by_value)
|
||||
from vllm.utils import get_exception_traceback, zmq_socket_ctx
|
||||
@@ -146,6 +147,9 @@ class EngineCore:
|
||||
# Delegates to self.scheduler.reset_prefix_cache(); takes no arguments and
# has no explicit return value.
def reset_prefix_cache(self):
|
||||
self.scheduler.reset_prefix_cache()
|
||||
|
||||
# Passes the given LoRARequest unchanged to self.model_executor.add_lora();
# annotated as returning None.
def add_lora(self, lora_request: LoRARequest) -> None:
|
||||
self.model_executor.add_lora(lora_request)
|
||||
|
||||
|
||||
class EngineCoreProc(EngineCore):
|
||||
"""ZMQ-wrapper for running EngineCore in background process."""
|
||||
@@ -262,12 +266,15 @@ class EngineCoreProc(EngineCore):
|
||||
self.reset_prefix_cache()
|
||||
elif request_type == EngineCoreRequestType.PROFILE:
|
||||
self.model_executor.profile(request)
|
||||
elif request_type == EngineCoreRequestType.ADD_LORA:
|
||||
self.model_executor.add_lora(request)
|
||||
|
||||
def process_input_socket(self, input_path: str):
|
||||
"""Input socket IO thread."""
|
||||
|
||||
# Msgpack serialization decoding.
|
||||
add_request_decoder = MsgpackDecoder(EngineCoreRequest)
|
||||
add_lora_decoder = MsgpackDecoder(LoRARequest)
|
||||
generic_decoder = MsgpackDecoder()
|
||||
|
||||
with zmq_socket_ctx(input_path, zmq.constants.PULL) as socket:
|
||||
@@ -277,9 +284,14 @@ class EngineCoreProc(EngineCore):
|
||||
request_type = EngineCoreRequestType(bytes(type_frame.buffer))
|
||||
|
||||
# Deserialize the request data.
|
||||
decoder = add_request_decoder if (
|
||||
request_type
|
||||
== EngineCoreRequestType.ADD) else generic_decoder
|
||||
decoder = None
|
||||
if request_type == EngineCoreRequestType.ADD:
|
||||
decoder = add_request_decoder
|
||||
elif request_type == EngineCoreRequestType.ADD_LORA:
|
||||
decoder = add_lora_decoder
|
||||
else:
|
||||
decoder = generic_decoder
|
||||
|
||||
request = decoder.decode(data_frame.buffer)
|
||||
|
||||
# Push to input queue for core busy loop.
|
||||
|
||||
Reference in New Issue
Block a user