Remove all2all backend envvar (#30363)

Signed-off-by: Elizabeth Thomas <email2eliza@gmail.com> Signed-off-by: Harry Mellor <19981378+hmellor@users.noreply.github.com> Co-authored-by: Harry Mellor <19981378+hmellor@users.noreply.github.com>
2025-12-18 13:46:28 -06:00
parent 97000a2be7
commit 41b6f9200f
12 changed files with 40 additions and 43 deletions
--- a/vllm/config/compilation.py
+++ b/vllm/config/compilation.py
@@ -899,7 +899,7 @@ class CompilationConfig:
        self.compute_bs_to_padded_graph_size()

    def set_splitting_ops_for_v1(
-        self, all2all_backend: str | None = None, data_parallel_size: int | None = None
+        self, all2all_backend: str, data_parallel_size: int = 1
    ):
        # To be compatible with OOT hardware plugin platforms (for example vllm-ascend),
        # which currently only support sequence parallelism in eager mode.
@@ -956,11 +956,9 @@ class CompilationConfig:
                self.splitting_ops = []

        # Disable CUDA graphs for DeepEP high-throughput since it's not CG compatible
-        backend = all2all_backend or envs.VLLM_ALL2ALL_BACKEND
-        dp_size = data_parallel_size if data_parallel_size is not None else 1
        if (
-            backend == "deepep_high_throughput"
-            and dp_size > 1
+            all2all_backend == "deepep_high_throughput"
+            and data_parallel_size > 1
            and self.cudagraph_mode != CUDAGraphMode.NONE
        ):
            # TODO: Piecewise Cuda graph might be enabled