[MRV2] Fix hanging issue with DeepSeek V3.2 by setting skip_attn=False (#39098)

Signed-off-by: WoosukKwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
2026-04-06 12:55:13 -07:00
parent dfa5062a8f
commit f186cfe75e
1 changed file with 10 additions and 1 deletion
--- a/vllm/v1/worker/gpu/model_runner.py
+++ b/vllm/v1/worker/gpu/model_runner.py
@@ -391,12 +391,17 @@ class GPUModelRunner(LoRAModelRunnerMixin):
        self,
        num_tokens: int,
        *args,
-        skip_attn: bool = True,
+        skip_attn: bool = False,
        uniform_decode: bool = False,
        skip_eplb: bool = False,
        is_profile: bool = False,
        **kwargs,
    ) -> tuple[torch.Tensor | None, torch.Tensor | None]:
+        if skip_attn and not is_profile:
+            raise ValueError(
+                "skip_attn must only be True for initial memory profiling."
+            )
+
        # Create a dummy scheduler output.
        num_reqs = min(num_tokens, self.max_num_reqs)
        if uniform_decode:
@@ -988,6 +993,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
            if not skip_attn_for_dummy_run:
                block_tables, slot_mappings = self.prepare_dummy_attn(input_batch)
            else:
+                assert batch_desc.cg_mode != CUDAGraphMode.FULL, (
+                    "Attention metadata must be prepared for dummy runs when using "
+                    "FULL cudagraph mode."
+                )
                block_tables = None
                slot_mappings = None
            # FIXME(woosuk): Fix warmup for LoRA.