[core] Add tags parameter to wake_up() (#15500)
Signed-off-by: Eric <erictang000@gmail.com>
This commit is contained in:
@@ -1200,26 +1200,35 @@ class LLM:
|
||||
The caller should guarantee that no requests are being processed
|
||||
during the sleep period, before `wake_up` is called.
|
||||
|
||||
:param level: The sleep level. Level 1 sleep will offload the model
|
||||
weights and discard the kv cache. The content of kv cache is
|
||||
forgotten. Level 1 sleep is good for sleeping and waking up the
|
||||
engine to run the same model again. The model weights are backed
|
||||
up in CPU memory. Please make sure there's enough CPU memory to
|
||||
store the model weights. Level 2 sleep will discard both the model
|
||||
weights and the kv cache. The content of both the model weights
|
||||
and kv cache is forgotten. Level 2 sleep is good for sleeping and
|
||||
waking up the engine to run a different model or update the model,
|
||||
where previous model weights are not needed. It reduces CPU memory
|
||||
pressure.
|
||||
Args:
|
||||
level: The sleep level. Level 1 sleep will offload the model
|
||||
weights and discard the kv cache. The content of kv cache
|
||||
is forgotten. Level 1 sleep is good for sleeping and waking
|
||||
up the engine to run the same model again. The model weights
|
||||
are backed up in CPU memory. Please make sure there's enough
|
||||
CPU memory to store the model weights. Level 2 sleep will
|
||||
discard both the model weights and the kv cache. The content
|
||||
of both the model weights and kv cache is forgotten. Level 2
|
||||
sleep is good for sleeping and waking up the engine to run a
|
||||
different model or update the model, where previous model
|
||||
weights are not needed. It reduces CPU memory pressure.
|
||||
"""
|
||||
self.reset_prefix_cache()
|
||||
self.llm_engine.sleep(level=level)
|
||||
|
||||
def wake_up(self):
|
||||
def wake_up(self, tags: Optional[list[str]] = None):
|
||||
"""
|
||||
Wake up the engine from sleep mode. See the :meth:`sleep` method
|
||||
for more details."""
|
||||
self.llm_engine.wake_up()
|
||||
for more details.
|
||||
|
||||
Args:
|
||||
tags: An optional list of tags to reallocate the engine memory
|
||||
for specific memory allocations. Values must be in
|
||||
("weights", "kv_cache"). If None, all memory is reallocated.
|
||||
wake_up should be called with all tags (or None) before the
|
||||
engine is used again.
|
||||
"""
|
||||
self.llm_engine.wake_up(tags)
|
||||
|
||||
# LEGACY
|
||||
def _convert_v1_inputs(
|
||||
|
||||
@@ -705,7 +705,6 @@ if envs.VLLM_SERVER_DEV_MODE:
|
||||
async def sleep(raw_request: Request):
|
||||
# get POST params
|
||||
level = raw_request.query_params.get("level", "1")
|
||||
logger.info("sleep the engine with level %s", level)
|
||||
await engine_client(raw_request).sleep(int(level))
|
||||
# FIXME: in v0 with frontend multiprocessing, the sleep command
|
||||
# is sent but does not finish yet when we return a response.
|
||||
@@ -713,8 +712,12 @@ if envs.VLLM_SERVER_DEV_MODE:
|
||||
|
||||
@router.post("/wake_up")
|
||||
async def wake_up(raw_request: Request):
|
||||
logger.info("wake up the engine")
|
||||
await engine_client(raw_request).wake_up()
|
||||
tags = raw_request.query_params.getlist("tags")
|
||||
if tags == []:
|
||||
# set to None to wake up all tags if no tags are provided
|
||||
tags = None
|
||||
logger.info("wake up the engine with tags: %s", tags)
|
||||
await engine_client(raw_request).wake_up(tags)
|
||||
# FIXME: in v0 with frontend multiprocessing, the wake-up command
|
||||
# is sent but does not finish yet when we return a response.
|
||||
return Response(status_code=200)
|
||||
|
||||
Reference in New Issue
Block a user