[V1] Fix Compilation config & Enable CUDA graph by default (#10528)
Signed-off-by: Woosuk Kwon <woosuk.kwon@berkeley.edu>
@@ -2370,7 +2370,7 @@ class VllmConfig:
 
         if self.compilation_config is None:
             self.compilation_config = CompilationConfig()
-        if envs.VLLM_USE_V1:
+        if envs.VLLM_USE_V1 and not self.model_config.enforce_eager:
            # NOTE(woosuk): Currently, we use inductor because the piecewise
            # CUDA graphs do not work properly with the custom CUDA kernels.
            # FIXME(woosuk): Disable inductor to reduce the compilation time
@@ -2380,6 +2380,7 @@ class VllmConfig:
             self.compilation_config.use_inductor = True
             self.compilation_config.pass_config.enable_fusion = False
             self.compilation_config.pass_config.enable_reshape = False
+            self.compilation_config.level = CompilationLevel.PIECEWISE
 
         current_platform.check_and_update_config(self)
 
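The two hunks above make piecewise compilation, and with it CUDA graph capture, the default for the V1 engine unless the user asked for eager execution. Below is a minimal, self-contained sketch of that decision logic; the `PassConfig`/`CompilationConfig` stand-ins, the `PIECEWISE` constant, and the `resolve_compilation_config` helper are illustrative assumptions, not vLLM's actual classes or API.

```python
from dataclasses import dataclass, field
from typing import Optional

PIECEWISE = 3  # stands in for CompilationLevel.PIECEWISE in this sketch


@dataclass
class PassConfig:  # illustrative stand-in, not vLLM's class
    enable_fusion: bool = True
    enable_reshape: bool = True


@dataclass
class CompilationConfig:  # illustrative stand-in, not vLLM's class
    use_inductor: bool = False
    level: int = 0
    pass_config: PassConfig = field(default_factory=PassConfig)


def resolve_compilation_config(
        use_v1: bool, enforce_eager: bool,
        cfg: Optional[CompilationConfig]) -> CompilationConfig:
    # Mirrors the diff: only V1 without enforce_eager gets piecewise
    # compilation (and therefore CUDA graphs) by default.
    cfg = cfg or CompilationConfig()
    if use_v1 and not enforce_eager:
        cfg.use_inductor = True
        cfg.pass_config.enable_fusion = False
        cfg.pass_config.enable_reshape = False
        cfg.level = PIECEWISE
    return cfg


print(resolve_compilation_config(True, enforce_eager=False, cfg=None).level)  # 3
print(resolve_compilation_config(True, enforce_eager=True, cfg=None).level)   # 0
```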
@@ -1,3 +1,4 @@
+import gc
 import time
 from dataclasses import dataclass
 from typing import TYPE_CHECKING, Dict, List, Optional, Set, Tuple
@@ -515,7 +516,25 @@ class GPUModelRunner:
         logger.info("Loading model weights took %.4f GB",
                     self.model_memory_usage / float(2**30))
 
-    def _dummy_run(self, model: nn.Module, num_tokens: int) -> None:
+    @torch.inference_mode()
+    def _dummy_run(
+        self,
+        model: nn.Module,
+        num_tokens: int,
+        kv_caches: List[torch.Tensor],
+    ) -> torch.Tensor:
+        with set_forward_context(None):
+            hidden_states = model(
+                input_ids=None,
+                positions=self.positions[:num_tokens],
+                kv_caches=kv_caches,
+                attn_metadata=None,
+                inputs_embeds=self.inputs_embeds[:num_tokens])
+        return hidden_states
+
+    def profile_run(self) -> None:
+        # TODO(woosuk): Profile the max memory usage of the encoder and
+        # the encoder cache.
         # use an empty tensor instead of `None`` to force Dynamo to pass
         # it by reference, rather by specializing on the value `None`.
         # the `dtype` argument does not matter, and we use `float32` as
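The comments at the end of this hunk explain why the profiling and capture inputs use empty tensors instead of `None`: Dynamo would otherwise specialize the graph on the value `None` rather than passing the caches by reference. A hedged, standalone sketch of building such placeholder caches follows; the `num_attn_layers` value and the CPU fallback are assumptions for illustration only.

```python
import torch

# Assumed values for illustration only.
num_attn_layers = 32
device = "cuda" if torch.cuda.is_available() else "cpu"

# One empty tensor per attention layer: the dtype does not matter here, and an
# empty tensor (unlike None) is passed by reference instead of being baked
# into the compiled graph.
dummy_kv_caches = [
    torch.tensor([], dtype=torch.float32, device=device)
    for _ in range(num_attn_layers)
]
print(len(dummy_kv_caches), dummy_kv_caches[0].numel())  # 32 0
```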
@@ -527,23 +546,17 @@ class GPUModelRunner:
             torch.tensor([], dtype=torch.float32, device=self.device)
             for _ in range(self.num_attn_layers)
         ]
-        with set_forward_context(None):  # noqa: SIM117
-            with set_compile_context(self.cudagraph_batch_sizes):
-                # Trigger compilation for general shape.
-                model(input_ids=None,
-                      positions=self.positions,
-                      kv_caches=dummy_kv_caches,
-                      attn_metadata=None,
-                      inputs_embeds=self.inputs_embeds)
-
-    @torch.inference_mode()
-    def profile_run(self) -> None:
-        # TODO(woosuk): Profile the max memory usage of the encoder and
-        # the encoder cache.
-        self._dummy_run(self.model, self.max_num_tokens)
+        with set_compile_context(self.cudagraph_batch_sizes):
+            # Trigger compilation for general shape.
+            hidden_states = self._dummy_run(self.model, self.max_num_tokens,
+                                            dummy_kv_caches)
+            logits = self.model.compute_logits(hidden_states, None)
+            logits = logits[:self.max_num_tokens]
+            # TODO(woosuk): Consider the memory usage of the sampler.
         torch.cuda.synchronize()
+        del hidden_states, logits
+        gc.collect()
 
-    @torch.inference_mode()
     def capture_model(self) -> None:
         if not self.use_cuda_graph:
             logger.warning(
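After this hunk, `profile_run` runs the general-shape dummy forward once, computes logits, synchronizes, and then drops the references so the recorded peak reflects the forward pass rather than lingering activations. A hedged sketch of that pattern is below; the `run_fn` callable is an assumption, a CUDA device is required, and this is not the exact vLLM code path.

```python
import gc

import torch


@torch.inference_mode()
def profile_peak_memory(run_fn, max_num_tokens: int) -> int:
    """Run one largest-shape forward pass and report the peak allocation."""
    torch.cuda.reset_peak_memory_stats()
    hidden_states = run_fn(max_num_tokens)  # e.g. a dummy forward pass
    torch.cuda.synchronize()
    # Drop the activations before reading stats so only the peak, not
    # still-live tensors, is charged to the model.
    del hidden_states
    gc.collect()
    torch.cuda.empty_cache()
    return torch.cuda.memory_stats()["allocated_bytes.all.peak"]
```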
@@ -554,18 +567,11 @@ class GPUModelRunner:
         start_time = time.perf_counter()
         start_free_gpu_memory = torch.cuda.mem_get_info()[0]
 
-        with set_forward_context(None):
-            # Trigger CUDA graph capture for specific shapes.
-            # Capture the large shapes first so that the smaller shapes
-            # can reuse the memory pool allocated for the large shapes.
-            for num_tokens in reversed(self.cudagraph_batch_sizes):
-                self.model(
-                    input_ids=None,
-                    positions=self.positions[:num_tokens],
-                    kv_caches=self.kv_caches,
-                    attn_metadata=None,
-                    inputs_embeds=self.inputs_embeds[:num_tokens],
-                )
+        # Trigger CUDA graph capture for specific shapes.
+        # Capture the large shapes first so that the smaller shapes
+        # can reuse the memory pool allocated for the large shapes.
+        for num_tokens in reversed(self.cudagraph_batch_sizes):
+            self._dummy_run(self.model, num_tokens, self.kv_caches)
 
         end_time = time.perf_counter()
         end_free_gpu_memory = torch.cuda.mem_get_info()[0]
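`capture_model` now routes every capture size through `_dummy_run`, walking the batch sizes from largest to smallest so the smaller graphs can reuse the memory pool already allocated for the largest one. The sketch below illustrates only that ordering idea with raw torch CUDA graph APIs; it is not how the piecewise compilation backend actually performs capture, and `run_fn` plus the explicit pool sharing are assumptions (a CUDA device is required).

```python
import torch


def capture_graphs(run_fn, batch_sizes):
    """Capture one CUDA graph per batch size, largest first, sharing one pool."""
    pool = torch.cuda.graph_pool_handle()  # shared memory pool for all captures
    graphs = {}
    for num_tokens in sorted(batch_sizes, reverse=True):
        run_fn(num_tokens)  # warm-up run outside of capture
        torch.cuda.synchronize()
        graph = torch.cuda.CUDAGraph()
        with torch.cuda.graph(graph, pool=pool):
            run_fn(num_tokens)
        graphs[num_tokens] = graph
    return graphs
```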
@@ -105,35 +105,48 @@ class Worker:
         # Profile the memory usage of the model and get the maximum number of
         # cache blocks that can be allocated with the remaining free memory.
         torch.cuda.empty_cache()
+        torch.cuda.reset_peak_memory_stats()
 
+        _, total_gpu_memory = torch.cuda.mem_get_info()
         # Execute a forward pass with dummy inputs to profile the memory usage
         # of the model.
         self.model_runner.profile_run()
 
-        # Calculate the number of blocks that can be allocated with the
-        # profiled peak memory.
         torch.cuda.synchronize()
-        free_gpu_memory, total_gpu_memory = torch.cuda.mem_get_info()
+        free_gpu_memory, _ = torch.cuda.mem_get_info()
         # NOTE(woosuk): Here we assume that the other processes using the same
         # GPU did not change their memory usage during the profiling.
-        peak_memory = self.init_gpu_memory - free_gpu_memory
-        assert peak_memory > 0, (
+        assert self.init_gpu_memory > free_gpu_memory, (
             "Error in memory profiling. "
             f"Initial free memory {self.init_gpu_memory}, current free memory"
             f" {free_gpu_memory}. This happens when the GPU memory was "
             "not properly cleaned up before initializing the vLLM instance.")
 
+        # Get the peak memory allocation recorded by torch
+        peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"]
+
+        # Check for any memory left around that may have been allocated on the
+        # gpu outside of `torch`. NCCL operations, for example, can use a few
+        # GB during a forward pass
+        torch.cuda.empty_cache()
+        torch_allocated_bytes = torch.cuda.memory_stats(
+        )["allocated_bytes.all.current"]
+        total_allocated_bytes = torch.cuda.mem_get_info(
+        )[1] - torch.cuda.mem_get_info()[0]
+        non_torch_allocations = total_allocated_bytes - torch_allocated_bytes
+        if non_torch_allocations > 0:
+            peak_memory += non_torch_allocations
+        available_kv_cache_memory = (
+            total_gpu_memory * self.cache_config.gpu_memory_utilization -
+            peak_memory)
+
+        # Calculate the number of blocks that can be allocated with the
+        # profiled peak memory.
         cache_block_size = _get_cache_block_size(self.cache_config,
                                                  self.model_config,
                                                  self.parallel_config)
-        num_gpu_blocks = int(
-            (total_gpu_memory * self.cache_config.gpu_memory_utilization -
-             peak_memory) // cache_block_size)
+        num_gpu_blocks = int(available_kv_cache_memory // cache_block_size)
         num_gpu_blocks = max(num_gpu_blocks, 0)
-        # if self.model_runner.lora_manager:
-        #     self.model_runner.remove_all_loras()
-        gc.collect()
-        torch.cuda.empty_cache()
         return num_gpu_blocks, 0
 
     def initialize_cache(self, num_gpu_blocks: int) -> None:
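The reworked profiling above charges the torch peak allocation, plus any device memory the torch caching allocator does not track (NCCL buffers, for example), against the `gpu_memory_utilization` budget and hands the remainder to the KV cache. The same arithmetic as a hedged standalone helper; the function name and parameters are illustrative, and a CUDA device is required.

```python
import torch


def kv_cache_budget(gpu_memory_utilization: float) -> int:
    """Return the bytes left for the KV cache after the profiled peak usage."""
    _, total_gpu_memory = torch.cuda.mem_get_info()
    peak_memory = torch.cuda.memory_stats()["allocated_bytes.all.peak"]

    # Memory resident on the device but unknown to the torch allocator
    # (e.g. NCCL buffers allocated during the profiling forward pass).
    torch.cuda.empty_cache()
    torch_allocated = torch.cuda.memory_stats()["allocated_bytes.all.current"]
    free, total = torch.cuda.mem_get_info()
    non_torch = (total - free) - torch_allocated
    if non_torch > 0:
        peak_memory += non_torch

    return int(total_gpu_memory * gpu_memory_utilization - peak_memory)


# Example: with a 0.9 utilization target, the number of KV cache blocks is
# kv_cache_budget(0.9) // cache_block_size, clamped at zero.
```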