[Distributed] Basic set of configuration for large EP deployment on GB200 (#27328)

Signed-off-by: Pengchao Wang <wpc@fb.com>
Co-authored-by: Lu Fang <30275821+houseroad@users.noreply.github.com>
2025-10-24 14:16:44 -07:00
parent 0402428200
commit d95d0f4b98
2 changed files with 25 additions and 1 deletion
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -277,7 +277,7 @@ class DeepEPHTAll2AllManager(DeepEPAll2AllManagerBase):
        num_rdma_bytes = None
        num_qps_per_rank = None

-        if self.internode:
+        if self.internode and not envs.VLLM_DEEPEP_HIGH_THROUGHPUT_FORCE_INTRA_NODE:
            num_rdma_bytes = envs.VLLM_DEEPEP_BUFFER_SIZE_MB * 1024 * 1024
            num_qps_per_rank = self.num_sms // 2
        else:
@@ -363,6 +363,8 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
            num_rdma_bytes=num_rdma_bytes,
            low_latency_mode=True,
            num_qps_per_rank=num_qps_per_rank,
+            allow_nvlink_for_low_latency_mode=envs.VLLM_DEEPEP_LOW_LATENCY_ALLOW_NVLINK,
+            allow_mnnvl=envs.VLLM_DEEPEP_LOW_LATENCY_USE_MNNVL,
        )

    def get_handle(self, kwargs):