From 4a9ce1784c4497d71b9a4497619f60d97271bd00 Mon Sep 17 00:00:00 2001
From: Lionel Villard
Date: Tue, 1 Apr 2025 01:58:58 -0400
Subject: [PATCH] [sleep mode] clear pytorch cache after sleep (#15248)

Signed-off-by:
---
 vllm/device_allocator/cumem.py | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/vllm/device_allocator/cumem.py b/vllm/device_allocator/cumem.py
index 0291fd9e1..f666c18c1 100644
--- a/vllm/device_allocator/cumem.py
+++ b/vllm/device_allocator/cumem.py
@@ -8,6 +8,7 @@
 # not sure why, they are created from a different context.
 # the only successful approach is to call cuda driver API in C.
 import dataclasses
+import gc
 import os
 from contextlib import contextmanager
 from typing import Any, Callable, Dict, Optional, Tuple, Union
@@ -175,7 +176,7 @@ class CuMemAllocator:
                                          str]] = None) -> None:
         """
         Put the allocator in sleep mode.
-        All data in the memory allocation with the specified tag will be 
+        All data in the memory allocation with the specified tag will be
         offloaded to CPU memory, and others will be discarded.
 
         :param offload_tags: The tags of the memory allocation that will be
@@ -204,10 +205,13 @@ class CuMemAllocator:
                 data.cpu_backup_tensor = cpu_backup_tensor
             unmap_and_release(handle)
 
+        gc.collect()
+        torch.cuda.empty_cache()
+
     def wake_up(self):
         """
         Wake up the allocator from sleep mode.
-        All data that is previously offloaded will be loaded back to GPU 
+        All data that is previously offloaded will be loaded back to GPU
         memory, and the rest of the data will have empty memory."""
         for ptr, data in self.pointer_to_data.items():
             handle = data.handle
@@ -225,7 +229,7 @@ class CuMemAllocator:
     def use_memory_pool(self, tag: Optional[str] = None):
         """
         A context manager to use the memory pool.
-        All memory allocation created inside the context will be allocated 
+        All memory allocation created inside the context will be allocated
         in the memory pool, and has the specified tag.
 
         :param tag: The tag of the memory allocation. If None, the default tag