[V1][Core] Fix memory issue with logits & sampling (#14508)

Signed-off-by: Roger Wang <ywang@roblox.com>
Co-authored-by: Varun Sundar Rabindranath <3337719+varun-sundar-rabindranath@users.noreply.github.com>
Author: Roger Wang
Date: 2025-03-10 21:03:41 -07:00
Commit: 1fc973c0b5 (parent: c982ac5722), committed via GitHub
5 changed files with 139 additions and 91 deletions

@@ -119,6 +119,8 @@ class Worker(WorkerBase):
         self.model_runner: GPUModelRunner = GPUModelRunner(
             self.vllm_config, self.device)
 
+    # FIXME(youkaichao & ywang96): Use TorchDispatchMode instead of memory pool
+    # to hijack tensor allocation.
     def load_model(self) -> None:
         if self.vllm_config.model_config.enable_sleep_mode:
             allocator = CuMemAllocator.get_instance()
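
The FIXME above points at a cleaner mechanism than hijacking the allocator through a memory pool: a `TorchDispatchMode` sees every ATen op, so it can observe or redirect the tensors an op produces. A minimal sketch of that idea (not vLLM code; the `AllocationInterceptor` name is made up for illustration):

```python
import torch
from torch.utils._python_dispatch import TorchDispatchMode

class AllocationInterceptor(TorchDispatchMode):
    """Hypothetical interceptor that logs every CUDA tensor an op creates."""

    def __torch_dispatch__(self, func, types, args=(), kwargs=None):
        out = func(*args, **(kwargs or {}))
        if isinstance(out, torch.Tensor) and out.device.type == "cuda":
            # A real implementation could reroute the allocation here
            # instead of merely logging it.
            print(f"{func}: shape={tuple(out.shape)}, "
                  f"{out.numel() * out.element_size()} bytes")
        return out

with AllocationInterceptor():
    x = torch.empty(4, 1024, device="cuda")  # intercepted and logged
```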
@@ -211,6 +213,27 @@ class Worker(WorkerBase):
             self.model_runner._dummy_run(size)
         if not self.model_config.enforce_eager:
             self.model_runner.capture_model()
+
+        # Warm up sampler and preallocate memory buffer for logits and other
+        # sampling related tensors of max possible shape to avoid memory
+        # fragmentation issue.
+        # NOTE: This is called after `capture_model` on purpose to prevent
+        # memory buffers from being cleared by `torch.cuda.empty_cache`.
+        try:
+            max_num_reqs = min(self.scheduler_config.max_num_seqs,
+                               self.scheduler_config.max_num_batched_tokens)
+            self.model_runner._dummy_sampler_run(
+                hidden_states=self.model_runner._dummy_run(
+                    num_tokens=max_num_reqs))
+        except RuntimeError as e:
+            if 'out of memory' in str(e):
+                raise RuntimeError(
+                    "CUDA out of memory occurred when warming up sampler. "
+                    "Please try lowering `gpu_memory_utilization` when "
+                    "initializing the engine.") from None
+            else:
+                raise e
+
         # Reset the seed to ensure that the random state is not affected by
         # the model initialization and profiling.
         set_random_seed(self.model_config.seed)
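
For context on the warm-up above: the fragmentation it avoids comes from allocating logits-sized tensors anew on every step, so the dummy forward + sampler pass at `max_num_reqs` makes the caching allocator reserve max-shape blocks once, up front. Running it after `capture_model` matters because, per the NOTE in the diff, a `torch.cuda.empty_cache()` issued around graph capture would otherwise clear those buffers. A minimal sketch of the same reservation idea in plain PyTorch, with `MAX_NUM_REQS` and `VOCAB_SIZE` as assumed placeholders rather than vLLM names:

```python
import torch

MAX_NUM_REQS = 1024   # assumed scheduler limit (placeholder)
VOCAB_SIZE = 32000    # assumed model vocabulary size (placeholder)

# Reserve the maximum-shape logits storage once and keep it referenced,
# so later steps never ask the allocator for differently-sized blocks.
logits_buffer = torch.empty(MAX_NUM_REQS, VOCAB_SIZE, device="cuda")

def logits_for_batch(num_reqs: int) -> torch.Tensor:
    # A view into the reserved buffer: no new allocation per step.
    return logits_buffer[:num_reqs]
```

The commit takes the dummy-run route rather than keeping a persistent buffer: one pass at the maximum batch size leaves the caching allocator holding blocks of every intermediate shape, which later real batches can reuse.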