[Misc] Fixes and Optimizations for DeepEP + DeepGEMM combination. (#19298)

Signed-off-by: Varun <vsundarr@redhat.com>
Co-authored-by: Varun <vsundarr@redhat.com>
2025-06-09 10:50:39 -04:00
parent b8089195b4
commit 5cf2daea9a
8 changed files with 98 additions and 36 deletions
--- a/vllm/distributed/device_communicators/all2all.py
+++ b/vllm/distributed/device_communicators/all2all.py
@@ -233,16 +233,11 @@ class DeepEPLLAll2AllManager(DeepEPAll2AllManagerBase):
        # Defaults for internode and intranode are taken from DeepEP tests.
        num_nvl_bytes = 1024 * 1024 * 1024
        num_qps_per_rank = num_local_experts
-        num_rdma_bytes = None
-
-        if self.internode:
-            num_rdma_bytes = 1024 * 1024 * 1024
-        else:
-            num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
-                num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank,
-                hidden=token_hidden_size,
-                num_ranks=num_ep_ranks,
-                num_experts=num_global_experts)
+        num_rdma_bytes = deep_ep.Buffer.get_low_latency_rdma_size_hint(
+            num_max_dispatch_tokens_per_rank=max_num_tokens_per_dp_rank,
+            hidden=token_hidden_size,
+            num_ranks=num_ep_ranks,
+            num_experts=num_global_experts)

        assert num_rdma_bytes is not None
        return dict(group=self.cpu_group,