Revert "[V1][Core] Fix memory issue with logits & sampling" (#14504)

Signed-off-by: Roger Wang <ywang@roblox.com> Co-authored-by: Roger Wang <ywang@roblox.com>
2025-03-08 20:43:37 -05:00
parent eb8b5eb183
commit 5f0b53c6ea
3 changed files with 30 additions and 70 deletions
--- a/vllm/v1/worker/gpu_worker.py
+++ b/vllm/v1/worker/gpu_worker.py
@@ -119,8 +119,6 @@ class Worker(WorkerBase):
        self.model_runner: GPUModelRunner = GPUModelRunner(
            self.vllm_config, self.device)

-    # FIXME(youkaichao & ywang96): Use TorchDispatchMode instead of memory pool
-    # to hijack tensor allocation.
    def load_model(self) -> None:
        if self.vllm_config.model_config.enable_sleep_mode:
            allocator = CuMemAllocator.get_instance()
@@ -213,25 +211,6 @@ class Worker(WorkerBase):
            self.model_runner._dummy_run(size)
        if not self.model_config.enforce_eager:
            self.model_runner.capture_model()
-
-        # Warm up sampler and preallocate memory buffer for logits and other
-        # sampling related tensors of max possible shape to avoid memory
-        # fragmentation issue.
-        # NOTE: This is called after `capture_model` on purpose to prevent
-        # memory buffers from being cleared by `torch.cuda.empty_cache`.
-        try:
-            self.model_runner._dummy_sampler_run(
-                hidden_states=self.model_runner._dummy_run(
-                    num_tokens=self.scheduler_config.max_num_seqs))
-        except RuntimeError as e:
-            if 'out of memory' in str(e):
-                raise RuntimeError(
-                    "CUDA out of memory occurred when warming up sampler. "
-                    "Please try lowering `gpu_memory_utilization` when "
-                    "initializing the engine.") from None
-            else:
-                raise e
-
        # Reset the seed to ensure that the random state is not affected by
        # the model initialization and profiling.
        set_random_seed(self.model_config.seed)