From 4a9ce1784c4497d71b9a4497619f60d97271bd00 Mon Sep 17 00:00:00 2001
From: Lionel Villard <villard@us.ibm.com>
Date: Tue, 1 Apr 2025 01:58:58 -0400
Subject: [PATCH] [sleep mode] clear pytorch cache after sleep (#15248)

Signed-off-by: Lionel Villard <villard@us.ibm.com>
---
 vllm/device_allocator/cumem.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index 0291fd9e1..f666c18c1 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -8,6 +8,7 @@
 # not sure why, they are created from a different context.
 # the only successful approach is to call cuda driver API in C.
 import dataclasses
+import gc
 import os
 from contextlib import contextmanager
 from typing import Any, Callable, Dict, Optional, Tuple, Union
@@ -175,7 +176,7 @@ class CuMemAllocator:
                                          str]] = None) -> None:
         """
         Put the allocator in sleep mode.
-        All data in the memory allocation with the specified tag will be 
+        All data in the memory allocation with the specified tag will be
         offloaded to CPU memory, and others will be discarded.
 
         :param offload_tags: The tags of the memory allocation that will be
@@ -204,10 +205,13 @@ class CuMemAllocator:
                 data.cpu_backup_tensor = cpu_backup_tensor
             unmap_and_release(handle)
 
+        gc.collect()
+        torch.cuda.empty_cache()
+
     def wake_up(self):
         """
         Wake up the allocator from sleep mode.
-        All data that is previously offloaded will be loaded back to GPU 
+        All data that was previously offloaded will be loaded back to GPU
         memory, and the rest of the data will have empty memory."""
         for ptr, data in self.pointer_to_data.items():
             handle = data.handle
@@ -225,7 +229,7 @@ class CuMemAllocator:
     def use_memory_pool(self, tag: Optional[str] = None):
         """
         A context manager to use the memory pool.
-        All memory allocation created inside the context will be allocated 
+        All memory allocation created inside the context will be allocated
         in the memory pool, and has the specified tag.
 
         :param tag: The tag of the memory allocation. If None, the default tag