[MRV2] Fix hanging issue with DeepSeek V3.2 by setting skip_attn=False (#39098)

Signed-off-by: WoosukKwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
This commit is contained in:
Woosuk Kwon
2026-04-06 12:55:13 -07:00
committed by GitHub
parent dfa5062a8f
commit f186cfe75e

View File

@@ -391,12 +391,17 @@ class GPUModelRunner(LoRAModelRunnerMixin):
self,
num_tokens: int,
*args,
-skip_attn: bool = True,
+skip_attn: bool = False,
uniform_decode: bool = False,
skip_eplb: bool = False,
is_profile: bool = False,
**kwargs,
) -> tuple[torch.Tensor | None, torch.Tensor | None]:
+if skip_attn and not is_profile:
+raise ValueError(
+"skip_attn must only be True for initial memory profiling."
+)
# Create a dummy scheduler output.
num_reqs = min(num_tokens, self.max_num_reqs)
if uniform_decode:
@@ -988,6 +993,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
if not skip_attn_for_dummy_run:
block_tables, slot_mappings = self.prepare_dummy_attn(input_batch)
else:
+assert batch_desc.cg_mode != CUDAGraphMode.FULL, (
+"Attention metadata must be prepared for dummy runs when using "
+"FULL cudagraph mode."
+)
block_tables = None
slot_mappings = None
# FIXME(woosuk): Fix warmup for LoRA.