[Core] Add All-to-All communication backend for DCP (#34883)
Signed-off-by: Sungsoo Ha <sungsooh@nvidia.com>
Signed-off-by: sungsoo ha <hasungsoo@gmail.com>
Co-authored-by: Claude Opus 4.6 <noreply@anthropic.com>
Co-authored-by: Lucas Wilkinson <LucasWilkinson@users.noreply.github.com>
Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
```diff
@@ -36,6 +36,7 @@ ExpertPlacementStrategy = Literal["linear", "round_robin"]
 DistributedExecutorBackend = Literal["ray", "mp", "uni", "external_launcher"]
 DataParallelBackend = Literal["ray", "mp"]
 EPLBPolicyOption = Literal["default"]
+DCPCommBackend = Literal["ag_rs", "a2a"]
 All2AllBackend = Literal[
     "naive",
     "pplx",
```
```diff
@@ -287,6 +288,14 @@ class ParallelConfig:
     and will be deprecated when PCP is fully supported.
 
     """
+    dcp_comm_backend: DCPCommBackend = "ag_rs"
+    """Communication backend for Decode Context Parallel (DCP).
+    - "ag_rs": AllGather + ReduceScatter (default, existing behavior)
+    - "a2a": All-to-All exchange of partial outputs + LSE, then
+      combine with Triton kernel. Reduces NCCL calls from 3 to 2
+      per layer for MLA models.
+    """
+
     cp_kv_cache_interleave_size: int = 1
     """Interleave size of kv_cache storage while using DCP or PCP.
     For `total_cp_rank = pcp_rank * dcp_world_size + dcp_rank`,
```
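For context on the LSE combine the new docstring refers to: each DCP rank attends over only its shard of the KV cache, so its softmax output is normalized by a shard-local log-sum-exp. Re-weighting each partial output by `exp(lse_i - logsumexp(lse))` recovers the exact full-context attention result. Below is a minimal PyTorch reference of that merge; the PR itself performs it in a Triton kernel, and the tensor names and shapes here are illustrative assumptions, not taken from the diff:

```python
import torch


def combine_partial_attn(
    partial_out: torch.Tensor,  # [dcp_world, num_tokens, num_heads, head_dim]
    lse: torch.Tensor,          # [dcp_world, num_tokens, num_heads]
) -> torch.Tensor:
    # Each rank's softmax was normalized only over its own KV shard, so
    # re-weight by exp(lse_i - logsumexp_i(lse_i)); the weights sum to 1
    # and the result equals attention over the full, unsharded context.
    global_lse = torch.logsumexp(lse, dim=0, keepdim=True)
    weights = torch.exp(lse - global_lse).unsqueeze(-1)
    return (weights * partial_out).sum(dim=0)


# Self-check against unsharded attention (dcp_world = 2 simulated shards).
q = torch.randn(4, 2, 8)  # [tokens, heads, head_dim]
k, v = torch.randn(16, 2, 8), torch.randn(16, 2, 8)
outs, lses = [], []
for ks, vs in zip(k.chunk(2), v.chunk(2)):
    scores = torch.einsum("thd,shd->ths", q, ks)
    lses.append(torch.logsumexp(scores, dim=-1))
    outs.append(torch.einsum("ths,shd->thd", scores.softmax(dim=-1), vs))
merged = combine_partial_attn(torch.stack(outs), torch.stack(lses))

ref_scores = torch.einsum("thd,shd->ths", q, k)
ref = torch.einsum("ths,shd->thd", ref_scores.softmax(dim=-1), v)
assert torch.allclose(merged, ref, atol=1e-5)
```

Because the merge needs only each rank's `(partial_out, lse)` pair and is then purely local, a single All-to-All exchange can deliver everything the combine requires, which is how the "a2a" path gets by with fewer NCCL calls per layer than the AllGather + ReduceScatter path, as the docstring notes.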
```diff
@@ -392,6 +401,11 @@ class ParallelConfig:
                 f"dcp_size={self.decode_context_parallel_size}."
             )
 
+        if self.dcp_comm_backend == "a2a" and self.decode_context_parallel_size <= 1:
+            raise ValueError(
+                "dcp_comm_backend='a2a' requires decode_context_parallel_size > 1."
+            )
+
         return self
 
     @property
```
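A usage sketch of the new knob together with the validation added above; the import path and keyword-argument construction are assumptions for illustration, while the field names, the `"ag_rs"` default, and the error text come from this diff:

```python
# Usage sketch: the import path and keyword-style construction are
# assumptions; only the field names, the "ag_rs" default, and the
# error text below come from this diff.
from vllm.config import ParallelConfig

cfg = ParallelConfig(
    decode_context_parallel_size=2,  # "a2a" requires a DCP group > 1
    dcp_comm_backend="a2a",
)

# With decode_context_parallel_size left at 1, the check added in this
# commit raises:
#   ValueError: dcp_comm_backend='a2a' requires decode_context_parallel_size > 1.
```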