diff --git a/vllm/forward_context.py b/vllm/forward_context.py index 9a831a2e2..a5c833b5e 100644 --- a/vllm/forward_context.py +++ b/vllm/forward_context.py @@ -203,7 +203,7 @@ class ForwardContext: attn_metadata: dict[str, AttentionMetadata] | list[dict[str, AttentionMetadata]] slot_mapping: dict[str, torch.Tensor] | list[dict[str, torch.Tensor]] """ - Type Dict[str, AttentionMetadata] for v1, map from layer_name of each + Type Dict[str, AttentionMetadata] for v1, map from layer_name of each attention layer to its attention metadata Type List[Dict[str, AttentionMetadata]] for DBO. List of size two, one for each microbatch. @@ -339,8 +339,10 @@ def set_forward_context( forward_start_time = time.perf_counter() dp_metadata: DPMetadata | None = None - if vllm_config.parallel_config.data_parallel_size > 1 and ( - attn_metadata is not None or num_tokens is not None + if ( + vllm_config.parallel_config.data_parallel_size > 1 + and vllm_config.parallel_config.is_moe_model is not False + and (attn_metadata is not None or num_tokens is not None) ): # If num_tokens_across_dp hasn't already been initialized, then # initialize it here. Both DP padding and Microbatching will be