[Core] Support fully transparent sleep mode (#11743)

Signed-off-by: youkaichao <youkaichao@gmail.com>
2025-01-22 14:39:32 +08:00
parent 4004f144f3
commit 68ad4e3a8d
14 changed files with 877 additions and 40 deletions
--- a/vllm/entrypoints/llm.py
+++ b/vllm/entrypoints/llm.py
@@ -1132,6 +1132,29 @@ class LLM:
    def stop_profile(self) -> None:
        self.llm_engine.stop_profile()

+    def sleep(self, level: int = 1):
+        """
+        Put the engine to sleep. The engine should not process any requests.
+        The caller should guarantee that no requests are being processed
+        during the sleep period, before `wake_up` is called.
+
+        :param level: The sleep level. Level 1 sleep will offload the model 
+            weights and discard the kv cache. The content of kv cache is 
+            forgotten. Level 1 sleep is good for sleeping and waking up the 
+            engine to run the same model again. The model weights are backed 
+            up in CPU memory. Please make sure there's enough CPU memory to 
+            store the model weights. Level 2 sleep will discard both the model 
+            weights and the kv cache. The content of both the model weights 
+            and kv cache is forgotten. Level 2 sleep is good for sleeping and 
+            waking up the engine to run a different model or update the model, 
+            where previous model weights are not needed. It reduces CPU memory 
+            pressure.
+        """
+        self.llm_engine.sleep(level=level)
+
+    def wake_up(self):
+        self.llm_engine.wake_up()
+
    # LEGACY
    def _convert_v1_inputs(
        self,