[MRV2] Fix hanging issue with DeepSeek V3.2 by setting skip_attn=False (#39098)
Signed-off-by: WoosukKwon <woosuk.kwon@berkeley.edu>
Signed-off-by: Woosuk Kwon <woosuk@inferact.ai>
This commit is contained in:
@@ -391,12 +391,17 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
self,
|
||||
num_tokens: int,
|
||||
*args,
|
||||
skip_attn: bool = True,
|
||||
skip_attn: bool = False,
|
||||
uniform_decode: bool = False,
|
||||
skip_eplb: bool = False,
|
||||
is_profile: bool = False,
|
||||
**kwargs,
|
||||
) -> tuple[torch.Tensor | None, torch.Tensor | None]:
|
||||
if skip_attn and not is_profile:
|
||||
raise ValueError(
|
||||
"skip_attn must only be True for initial memory profiling."
|
||||
)
|
||||
|
||||
# Create a dummy scheduler output.
|
||||
num_reqs = min(num_tokens, self.max_num_reqs)
|
||||
if uniform_decode:
|
||||
@@ -988,6 +993,10 @@ class GPUModelRunner(LoRAModelRunnerMixin):
|
||||
if not skip_attn_for_dummy_run:
|
||||
block_tables, slot_mappings = self.prepare_dummy_attn(input_batch)
|
||||
else:
|
||||
assert batch_desc.cg_mode != CUDAGraphMode.FULL, (
|
||||
"Attention metadata must be prepared for dummy runs when using "
|
||||
"FULL cudagraph mode."
|
||||
)
|
||||
block_tables = None
|
||||
slot_mappings = None
|
||||
# FIXME(woosuk): Fix warmup for LoRA.
|
||||
|
||||
Reference in New Issue
Block a user