diff --git a/vllm/forward_context.py b/vllm/forward_context.py
index 9a831a2e2..a5c833b5e 100644
--- a/vllm/forward_context.py
+++ b/vllm/forward_context.py
@@ -203,7 +203,7 @@ class ForwardContext:
     attn_metadata: dict[str, AttentionMetadata] | list[dict[str, AttentionMetadata]]
     slot_mapping: dict[str, torch.Tensor] | list[dict[str, torch.Tensor]]
     """
-    Type Dict[str, AttentionMetadata] for v1, map from layer_name of each 
+    Type Dict[str, AttentionMetadata] for v1, map from layer_name of each
     attention layer to its attention metadata
     Type List[Dict[str, AttentionMetadata]] for DBO. List of size two, one
     for each microbatch.
@@ -339,8 +339,10 @@ def set_forward_context(
         forward_start_time = time.perf_counter()
 
     dp_metadata: DPMetadata | None = None
-    if vllm_config.parallel_config.data_parallel_size > 1 and (
-        attn_metadata is not None or num_tokens is not None
+    if (
+        vllm_config.parallel_config.data_parallel_size > 1
+        and vllm_config.parallel_config.is_moe_model is not False
+        and (attn_metadata is not None or num_tokens is not None)
     ):
         # If num_tokens_across_dp hasn't already been initialized, then
         # initialize it here. Both DP padding and Microbatching will be