[Core] Add All-to-All communication backend for DCP (#34883)

Signed-off-by: Sungsoo Ha <sungsooh@nvidia.com>
Signed-off-by: sungsoo ha <hasungsoo@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
commit 6cb901093f (parent ead7bde1ab)
Author: sungsoo ha
Date: 2026-03-04 07:01:57 -08:00 (committed via GitHub)
8 changed files with 658 additions and 17 deletions

@@ -36,6 +36,7 @@ ExpertPlacementStrategy = Literal["linear", "round_robin"]
 DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]
 DataParallelBackend = Literal["ray", "mp"]
 EPLBPolicyOption = Literal["default"]
+DCPCommBackend = Literal["ag_rs", "a2a"]
 All2AllBackend = Literal[
     "naive",
     "pplx",
@@ -287,6 +288,14 @@ class ParallelConfig:
     and will be deprecated when PCP is fully supported.
     """
+    dcp_comm_backend: DCPCommBackend = "ag_rs"
+    """Communication backend for Decode Context Parallel (DCP).
+
+    - "ag_rs": AllGather + ReduceScatter (default, existing behavior)
+    - "a2a": All-to-All exchange of partial outputs + LSE, then
+      combined with a Triton kernel. Reduces NCCL calls from 3 to 2
+      per layer for MLA models.
+    """
     cp_kv_cache_interleave_size: int = 1
     """Interleave size of kv_cache storage while using DCP or PCP.
     For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`,
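
The docstring above compresses the a2a path: each DCP rank attends over its own KV shard, ranks exchange partial outputs plus their log-sum-exp (LSE) terms in a single all-to-all, and the partials are merged. A minimal PyTorch sketch of that merge follows; the function name, shapes, and PyTorch (rather than Triton) formulation are illustrative assumptions, not the PR's kernel:

```python
import torch

def combine_partials(outs: torch.Tensor, lses: torch.Tensor) -> torch.Tensor:
    """Merge per-rank partial attention outputs using their LSEs.

    outs: [dcp_world, tokens, heads, head_dim] partial outputs
    lses: [dcp_world, tokens, heads] log-sum-exp of each rank's local softmax
    """
    # Softmax over the union of KV shards equals a convex combination of the
    # per-shard outputs, weighted by exp(lse_i - lse_global); the global LSE
    # keeps the rescaling numerically stable.
    global_lse = torch.logsumexp(lses, dim=0)            # [tokens, heads]
    weights = torch.exp(lses - global_lse.unsqueeze(0))  # sums to 1 over ranks
    return (weights.unsqueeze(-1) * outs).sum(dim=0)     # [tokens, heads, head_dim]

# Example: merge two ranks' partials for 4 tokens, 8 heads, head_dim 64.
outs = torch.randn(2, 4, 8, 64)
lses = torch.randn(2, 4, 8)
merged = combine_partials(outs, lses)
```

The merge is order-invariant across ranks, so the all-to-all needs no particular arrival ordering before the combine runs.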
@@ -392,6 +401,11 @@ class ParallelConfig:
                 f"dcp_size={self.decode_context_parallel_size}."
             )
+        if self.dcp_comm_backend == "a2a" and self.decode_context_parallel_size <= 1:
+            raise ValueError(
+                "dcp_comm_backend='a2a' requires decode_context_parallel_size > 1."
+            )
         return self

     @property
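
End to end, the new option is set like any other `ParallelConfig` field. A usage sketch; the import path is as in vllm, but the exact constructor arguments and whether the validator fires at construction time are assumptions based only on the fields shown in this diff:

```python
from vllm.config import ParallelConfig

# Valid: a2a requires DCP to actually be enabled.
cfg = ParallelConfig(
    decode_context_parallel_size=2,
    dcp_comm_backend="a2a",
)

# Invalid: the validator above rejects a2a when dcp_size <= 1
# (assuming validation runs on init, as the model_validator pattern suggests).
try:
    ParallelConfig(decode_context_parallel_size=1, dcp_comm_backend="a2a")
except ValueError as err:
    print(err)
```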