From ce9b1d76cfbbbabbc72e12f99ce4ce9b8265fae8 Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Sun, 22 Mar 2026 11:47:21 -0700 Subject: [PATCH] [MRV2] Skip hidden states allocation for PW CUDA graphs (#37818) Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/cudagraph_utils.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/vllm/v1/worker/gpu/cudagraph_utils.py b/vllm/v1/worker/gpu/cudagraph_utils.py index 2b94362a8..f18d8f320 100644 --- a/vllm/v1/worker/gpu/cudagraph_utils.py +++ b/vllm/v1/worker/gpu/cudagraph_utils.py @@ -263,6 +263,7 @@ class ModelCudaGraphManager(CudaGraphManager): decode_query_len: int, ): super().__init__(vllm_config, device, cudagraph_mode, decode_query_len) + # Used for FULL CUDA graphs. PW CUDA graphs do not use these. self.hidden_states: torch.Tensor | None = None self.aux_hidden_states: list[torch.Tensor] = [] self.use_aux_hidden_state_outputs = False @@ -326,6 +327,12 @@ class ModelCudaGraphManager(CudaGraphManager): **model_state.prepare_dummy_inputs(num_reqs, num_tokens), } model_output = model(**model_inputs) + + if cg_mode == CUDAGraphMode.PIECEWISE: + # PW CUDA graph internally handles the model outputs. + # No need to keep track of the hidden states. + return None + if self.use_aux_hidden_state_outputs: hidden_states, aux_hidden_states = model_output else: