[V1][Core] Fix memory issue with logits & sampling (#14508)

Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Varun Sundar Rabindranath <3337719+varun-sundar-rabindranath@users.noreply.github.com>
Author: Roger Wang
Date: 2025-03-10 21:03:41 -07:00
Commit: 1fc973c0b5 (parent: c982ac5722), committed via GitHub
5 changed files with 139 additions and 91 deletions

@@ -119,6 +119,8 @@ class Worker(WorkerBase):
         self.model_runner: GPUModelRunner = GPUModelRunner(
             self.vllm_config, self.device)
 
+    # FIXME(youkaichao & ywang96): Use TorchDispatchMode instead of memory pool
+    # to hijack tensor allocation.
     def load_model(self) -> None:
         if self.vllm_config.model_config.enable_sleep_mode:
             allocator = CuMemAllocator.get_instance()
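
The FIXME above points at a cleaner mechanism than hijacking the allocator through a memory pool: a `TorchDispatchMode` sees every ATen op, so it can observe or redirect the tensors an op produces. A minimal sketch of that idea (not vLLM code; the `AllocationInterceptor` name is made up for illustration):

```python
import torch
from torch.utils._python_dispatch import TorchDispatchMode

class AllocationInterceptor(TorchDispatchMode):
    """Hypothetical interceptor that logs every CUDA tensor an op creates."""

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        out = func(*args, **(kwargs or {}))
        if isinstance(out, torch.Tensor) and out.device.type == "cuda":
            # A real implementation could reroute the allocation here
            # instead of merely logging it.
            print(f"{func}: shape={tuple(out.shape)}, "
                  f"{out.numel() * out.element_size()} bytes")
        return out

with AllocationInterceptor():
    x = torch.empty(4, 1024, device="cuda")  # intercepted and logged
```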
@@ -211,6 +213,27 @@ class Worker(WorkerBase):
             self.model_runner._dummy_run(size)
         if not self.model_config.enforce_eager:
             self.model_runner.capture_model()
+
+        # Warm up sampler and preallocate memory buffer for logits and other
+        # sampling related tensors of max possible shape to avoid memory
+        # fragmentation issue.
+        # NOTE: This is called after `capture_model` on purpose to prevent
+        # memory buffers from being cleared by `torch.cuda.empty_cache`.
+        try:
+            max_num_reqs = min(self.scheduler_config.max_num_seqs,
+                               self.scheduler_config.max_num_batched_tokens)
+            self.model_runner._dummy_sampler_run(
+                hidden_states=self.model_runner._dummy_run(
+                    num_tokens=max_num_reqs))
+        except RuntimeError as e:
+            if 'out of memory' in str(e):
+                raise RuntimeError(
+                    "CUDA out of memory occurred when warming up sampler. "
+                    "Please try lowering `gpu_memory_utilization` when "
+                    "initializing the engine.") from None
+            else:
+                raise e
+
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)
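
For context on the warm-up above: the fragmentation it avoids comes from allocating logits-sized tensors anew on every step, so the dummy forward + sampler pass at `max_num_reqs` makes the caching allocator reserve max-shape blocks once, up front. Running it after `capture_model` matters because, per the NOTE in the diff, a `torch.cuda.empty_cache()` issued around graph capture would otherwise clear those buffers. A minimal sketch of the same reservation idea in plain PyTorch, with `MAX_NUM_REQS` and `VOCAB_SIZE` as assumed placeholders rather than vLLM names:

```python
import torch

MAX_NUM_REQS = 1024   # assumed scheduler limit (placeholder)
VOCAB_SIZE = 32000    # assumed model vocabulary size (placeholder)

# Reserve the maximum-shape logits storage once and keep it referenced,
# so later steps never ask the allocator for differently-sized blocks.
logits_buffer = torch.empty(MAX_NUM_REQS, VOCAB_SIZE, device="cuda")

def logits_for_batch(num_reqs: int) -> torch.Tensor:
    # A view into the reserved buffer: no new allocation per step.
    return logits_buffer[:num_reqs]
```

The commit takes the dummy-run route rather than keeping a persistent buffer: one pass at the maximum batch size leaves the caching allocator holding blocks of every intermediate shape, which later real batches can reuse.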