Signed-off-by: Han <zh950713@gmail.com>
This commit is contained in:
@@ -54,6 +54,9 @@ class Worker(WorkerBase):
|
|||||||
from vllm.utils import init_cached_hf_modules
|
from vllm.utils import init_cached_hf_modules
|
||||||
init_cached_hf_modules()
|
init_cached_hf_modules()
|
||||||
|
|
||||||
|
# Buffers saved before sleep
|
||||||
|
self._sleep_saved_buffers: dict[str, torch.Tensor] = {}
|
||||||
|
|
||||||
# Torch profiler. Enabled and configured through env vars:
|
# Torch profiler. Enabled and configured through env vars:
|
||||||
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
|
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
|
||||||
if envs.VLLM_TORCH_PROFILER_DIR:
|
if envs.VLLM_TORCH_PROFILER_DIR:
|
||||||
@@ -73,6 +76,15 @@ class Worker(WorkerBase):
|
|||||||
|
|
||||||
def sleep(self, level: int = 1) -> None:
|
def sleep(self, level: int = 1) -> None:
|
||||||
free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
|
free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
|
||||||
|
|
||||||
|
# Save the buffers before level 2 sleep
|
||||||
|
if level == 2:
|
||||||
|
model = self.model_runner.model
|
||||||
|
self._sleep_saved_buffers = {
|
||||||
|
name: buffer.cpu().clone()
|
||||||
|
for name, buffer in model.named_buffers()
|
||||||
|
}
|
||||||
|
|
||||||
allocator = CuMemAllocator.get_instance()
|
allocator = CuMemAllocator.get_instance()
|
||||||
allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple())
|
allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple())
|
||||||
free_bytes_after_sleep, total = torch.cuda.mem_get_info()
|
free_bytes_after_sleep, total = torch.cuda.mem_get_info()
|
||||||
@@ -88,6 +100,14 @@ class Worker(WorkerBase):
|
|||||||
allocator = CuMemAllocator.get_instance()
|
allocator = CuMemAllocator.get_instance()
|
||||||
allocator.wake_up(tags)
|
allocator.wake_up(tags)
|
||||||
|
|
||||||
|
# Restore the buffers after level 2 sleep
|
||||||
|
if len(self._sleep_saved_buffers):
|
||||||
|
model = self.model_runner.model
|
||||||
|
for name, buffer in model.named_buffers():
|
||||||
|
if name in self._sleep_saved_buffers:
|
||||||
|
buffer.data.copy_(self._sleep_saved_buffers[name].data)
|
||||||
|
self._sleep_saved_buffers = {}
|
||||||
|
|
||||||
def init_device(self):
|
def init_device(self):
|
||||||
if self.device_config.device.type == "cuda":
|
if self.device_config.device.type == "cuda":
|
||||||
# torch.distributed.all_reduce does not free the input tensor until
|
# torch.distributed.all_reduce does not free the input tensor until
|
||||||
|
|||||||
@@ -95,6 +95,9 @@ class Worker(LocalOrDistributedWorkerBase):
|
|||||||
self.gpu_cache: Optional[List[List[torch.Tensor]]] = None
|
self.gpu_cache: Optional[List[List[torch.Tensor]]] = None
|
||||||
self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {}
|
self._seq_group_metadata_cache: Dict[str, SequenceGroupMetadata] = {}
|
||||||
|
|
||||||
|
# Buffers saved before sleep
|
||||||
|
self._sleep_saved_buffers: Dict[str, torch.Tensor] = {}
|
||||||
|
|
||||||
# Torch profiler. Enabled and configured through env vars:
|
# Torch profiler. Enabled and configured through env vars:
|
||||||
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
|
# VLLM_TORCH_PROFILER_DIR=/path/to/save/trace
|
||||||
if envs.VLLM_TORCH_PROFILER_DIR:
|
if envs.VLLM_TORCH_PROFILER_DIR:
|
||||||
@@ -124,6 +127,15 @@ class Worker(LocalOrDistributedWorkerBase):
|
|||||||
|
|
||||||
def sleep(self, level: int = 1) -> None:
|
def sleep(self, level: int = 1) -> None:
|
||||||
free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
|
free_bytes_before_sleep = torch.cuda.mem_get_info()[0]
|
||||||
|
|
||||||
|
# Save the buffers before level 2 sleep
|
||||||
|
if level == 2:
|
||||||
|
model = self.model_runner.model
|
||||||
|
self._sleep_saved_buffers = {
|
||||||
|
name: buffer.cpu().clone()
|
||||||
|
for name, buffer in model.named_buffers()
|
||||||
|
}
|
||||||
|
|
||||||
allocator = CuMemAllocator.get_instance()
|
allocator = CuMemAllocator.get_instance()
|
||||||
allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple())
|
allocator.sleep(offload_tags=("weights", ) if level == 1 else tuple())
|
||||||
free_bytes_after_sleep, total = torch.cuda.mem_get_info()
|
free_bytes_after_sleep, total = torch.cuda.mem_get_info()
|
||||||
@@ -139,6 +151,14 @@ class Worker(LocalOrDistributedWorkerBase):
|
|||||||
allocator = CuMemAllocator.get_instance()
|
allocator = CuMemAllocator.get_instance()
|
||||||
allocator.wake_up(tags=tags)
|
allocator.wake_up(tags=tags)
|
||||||
|
|
||||||
|
# Restore the buffers after level 2 sleep
|
||||||
|
if len(self._sleep_saved_buffers):
|
||||||
|
model = self.model_runner.model
|
||||||
|
for name, buffer in model.named_buffers():
|
||||||
|
if name in self._sleep_saved_buffers:
|
||||||
|
buffer.data.copy_(self._sleep_saved_buffers[name].data)
|
||||||
|
self._sleep_saved_buffers = {}
|
||||||
|
|
||||||
def init_device(self) -> None:
|
def init_device(self) -> None:
|
||||||
if self.device_config.device.type == "cuda":
|
if self.device_config.device.type == "cuda":
|
||||||
# torch.distributed.all_reduce does not free the input tensor until
|
# torch.distributed.all_reduce does not free the input tensor until
|
||||||
|
|||||||
Reference in New Issue
Block a user