[Core] Support fully transparent sleep mode (#11743)
Signed-off-by: youkaichao <youkaichao@gmail.com>
This commit is contained in:
@@ -1132,6 +1132,29 @@ class LLM:
|
||||
def stop_profile(self) -> None:
    """Stop profiling by forwarding the request to the underlying engine."""
    engine = self.llm_engine
    engine.stop_profile()
|
||||
|
||||
def sleep(self, level: int = 1) -> None:
    """
    Put the engine to sleep. The engine should not process any requests.

    The caller should guarantee that no requests are being processed
    during the sleep period, i.e. from this call until `wake_up` is
    called.

    :param level: The sleep level.

        - Level 1 offloads the model weights and discards the kv cache.
          The content of the kv cache is forgotten. The model weights
          are backed up in CPU memory, so make sure there is enough CPU
          memory to store them. Level 1 is good for sleeping and waking
          up the engine to run the same model again.
        - Level 2 discards both the model weights and the kv cache. The
          content of both is forgotten. Level 2 is good for sleeping and
          waking up the engine to run a different model or to update the
          model, where the previous model weights are not needed. It
          reduces CPU memory pressure.
    """
    # The level is passed through unchanged; this wrapper does not validate it.
    self.llm_engine.sleep(level=level)
|
||||
|
||||
def wake_up(self) -> None:
    """
    Wake the engine up from sleep mode.

    This is the counterpart of `sleep`: the caller should not submit
    requests between a `sleep` call and this call. The wake-up itself is
    delegated to the underlying engine.
    """
    self.llm_engine.wake_up()
|
||||
|
||||
# LEGACY
|
||||
def _convert_v1_inputs(
|
||||
self,
|
||||
|
||||
Reference in New Issue
Block a user