[core] Add tags parameter to wake_up() (#15500)

Signed-off-by: Eric <erictang000@gmail.com>
This commit is contained in:
Eric Tang
2025-04-02 01:59:27 -07:00
committed by GitHub
parent 90969fb39a
commit ddb94c2605
18 changed files with 144 additions and 71 deletions

View File

@@ -1200,26 +1200,35 @@ class LLM:
The caller should guarantee that no requests are being processed
during the sleep period, before `wake_up` is called.
:param level: The sleep level. Level 1 sleep will offload the model
weights and discard the kv cache. The content of kv cache is
forgotten. Level 1 sleep is good for sleeping and waking up the
engine to run the same model again. The model weights are backed
up in CPU memory. Please make sure there's enough CPU memory to
store the model weights. Level 2 sleep will discard both the model
weights and the kv cache. The content of both the model weights
and kv cache is forgotten. Level 2 sleep is good for sleeping and
waking up the engine to run a different model or update the model,
where previous model weights are not needed. It reduces CPU memory
pressure.
Args:
level: The sleep level. Level 1 sleep will offload the model
weights and discard the kv cache. The content of kv cache
is forgotten. Level 1 sleep is good for sleeping and waking
up the engine to run the same model again. The model weights
are backed up in CPU memory. Please make sure there's enough
CPU memory to store the model weights. Level 2 sleep will
discard both the model weights and the kv cache. The content
of both the model weights and kv cache is forgotten. Level 2
sleep is good for sleeping and waking up the engine to run a
different model or update the model, where previous model
weights are not needed. It reduces CPU memory pressure.
"""
self.reset_prefix_cache()
self.llm_engine.sleep(level=level)
def wake_up(self, tags: Optional[list[str]] = None):
    """Wake up the engine from sleep mode.

    See the :meth:`sleep` method for more details.

    Args:
        tags: An optional list of tags to reallocate the engine memory
            for specific memory allocations. Values must be in
            ``("weights", "kv_cache")``. If None, all memory is
            reallocated. wake_up should be called with all tags (or
            None) before the engine is used again.

    Raises:
        ValueError: If any entry of ``tags`` is not an allowed tag.
    """
    if tags is not None:
        # Validate up front so a typo fails loudly here rather than
        # deep inside the engine's memory-allocation machinery.
        allowed_tags = ("weights", "kv_cache")
        invalid_tags = [t for t in tags if t not in allowed_tags]
        if invalid_tags:
            raise ValueError(
                f"Invalid tags {invalid_tags}; "
                f"allowed tags are {allowed_tags}.")
    self.llm_engine.wake_up(tags)
# LEGACY
def _convert_v1_inputs(

View File

@@ -705,7 +705,6 @@ if envs.VLLM_SERVER_DEV_MODE:
async def sleep(raw_request: Request):
# get POST params
level = raw_request.query_params.get("level", "1")
logger.info("sleep the engine with level %s", level)
await engine_client(raw_request).sleep(int(level))
# FIXME: in v0 with frontend multiprocessing, the sleep command
# is sent but does not finish yet when we return a response.
@@ -713,8 +712,12 @@ if envs.VLLM_SERVER_DEV_MODE:
@router.post("/wake_up")
async def wake_up(raw_request: Request):
    """Wake the engine from sleep mode.

    Reads the repeated ``tags`` query parameter to select which memory
    allocations to restore; with no ``tags`` given, all engine memory
    is reallocated.
    """
    tags = raw_request.query_params.getlist("tags")
    # getlist always returns a list; an empty one means no tags were
    # provided, so pass None to wake up all allocations.
    if not tags:
        # set to None to wake up all tags if no tags are provided
        tags = None
    logger.info("wake up the engine with tags: %s", tags)
    await engine_client(raw_request).wake_up(tags)
    # FIXME: in v0 with frontend multiprocessing, the wake-up command
    # is sent but does not finish yet when we return a response.
    return Response(status_code=200)