From f186cfe75e452aeb76f5233da7392d51ee34d3ef Mon Sep 17 00:00:00 2001 From: Woosuk Kwon Date: Mon, 6 Apr 2026 12:55:13 -0700 Subject: [PATCH] [MRV2] Fix hanging issue with DeepSeek V3.2 by setting `skip_attn=False` (#39098) Signed-off-by: WoosukKwon Signed-off-by: Woosuk Kwon --- vllm/v1/worker/gpu/model_runner.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/vllm/v1/worker/gpu/model_runner.py b/vllm/v1/worker/gpu/model_runner.py index a2f83c52e..56df70fc0 100644 --- a/vllm/v1/worker/gpu/model_runner.py +++ b/vllm/v1/worker/gpu/model_runner.py @@ -391,12 +391,17 @@ class GPUModelRunner(LoRAModelRunnerMixin): self, num_tokens: int, *args, - skip_attn: bool = True, + skip_attn: bool = False, uniform_decode: bool = False, skip_eplb: bool = False, is_profile: bool = False, **kwargs, ) -> tuple[torch.Tensor | None, torch.Tensor | None]: + if skip_attn and not is_profile: + raise ValueError( + "skip_attn must only be True for initial memory profiling." + ) + # Create a dummy scheduler output. num_reqs = min(num_tokens, self.max_num_reqs) if uniform_decode: @@ -988,6 +993,10 @@ class GPUModelRunner(LoRAModelRunnerMixin): if not skip_attn_for_dummy_run: block_tables, slot_mappings = self.prepare_dummy_attn(input_batch) else: + assert batch_desc.cg_mode != CUDAGraphMode.FULL, ( + "Attention metadata must be prepared for dummy runs when using " + "FULL cudagraph mode." + ) block_tables = None slot_mappings = None # FIXME(woosuk): Fix warmup for LoRA.