[Misc] Remove deprecated VLLM_ALL2ALL_BACKEND environment variable (#33535)

Signed-off-by: carlory <baofa.fan@daocloud.io>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
2026-02-03 15:01:59 +08:00
parent 61397891ce
commit b95cc5014d
4 changed files with 2 additions and 47 deletions
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@@ -43,7 +43,6 @@ trap cleanup EXIT

 for BACK in "${BACKENDS[@]}"; do
  VLLM_DEEP_GEMM_WARMUP=skip \
-  VLLM_ALL2ALL_BACKEND=$BACK \
  vllm serve "$MODEL" \
    --enforce-eager \
    --tensor-parallel-size 2 \
@@ -52,6 +51,7 @@ for BACK in "${BACKENDS[@]}"; do
    --enable-eplb \
    --trust-remote-code \
    --max-model-len 2048 \
+    --all2all-backend $BACK \
    --port $PORT &
  SERVER_PID=$!
  wait_for_server $PORT
--- a/tests/kernels/moe/modular_kernel_tools/common.py
+++ b/tests/kernels/moe/modular_kernel_tools/common.py
@@ -150,10 +150,7 @@ class Config:
            "VLLM_USE_DEEP_GEMM": str(int(self.needs_deep_gemm())),
        }

-        backend = self.all2all_backend()
-        vllm_config.parallel_config.all2all_backend = backend
-        if backend is not None:
-            env_dict.update({"VLLM_ALL2ALL_BACKEND": backend})
+        vllm_config.parallel_config.all2all_backend = self.all2all_backend()

        if self.fused_moe_chunk_size is not None:
            env_dict.update(
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -549,15 +549,6 @@ class ParallelConfig:
        return hash_factors(factors)

    def __post_init__(self) -> None:
-        # Set all2all_backend from env var if not specified, with deprecation warning
-        if envs.is_set("VLLM_ALL2ALL_BACKEND"):
-            logger.warning_once(
-                "VLLM_ALL2ALL_BACKEND environment variable is deprecated and "
-                "will be removed in v0.15.0. Please use the "
-                "--all2all-backend command-line argument instead."
-            )
-            self.all2all_backend = envs.VLLM_ALL2ALL_BACKEND
-
        # Continue with the rest of the initialization
        self.world_size = (
            self.pipeline_parallel_size
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -171,15 +171,6 @@ if TYPE_CHECKING:
    VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost"
    VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5600
    VLLM_MOONCAKE_BOOTSTRAP_PORT: int = 8998
-    VLLM_ALL2ALL_BACKEND: Literal[
-        "naive",
-        "pplx",
-        "deepep_high_throughput",
-        "deepep_low_latency",
-        "mori",
-        "allgather_reducescatter",
-        "flashinfer_all2allv",
-    ] = "allgather_reducescatter"
    VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
    VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
    VLLM_SLEEP_WHEN_IDLE: bool = False
@@ -1292,30 +1283,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
    "VLLM_MOONCAKE_BOOTSTRAP_PORT": lambda: int(
        os.getenv("VLLM_MOONCAKE_BOOTSTRAP_PORT", "8998")
    ),
-    # [DEPRECATED - will be removed in v0.15.0] all2all backend for vllm's
-    # expert parallel communication. Use --all2all-backend CLI argument instead.
-    # Available options:
-    # - "naive": naive all2all implementation using broadcasts
-    # - "allgather_reducescatter": all2all implementation based on allgather and
-    #  reducescatter
-    # - "pplx": use pplx kernels
-    # - "deepep_high_throughput", use deepep high-throughput kernels
-    # - "deepep_low_latency", use deepep low-latency kernels
-    # - "mori", use MoRI kernels
-    # - "flashinfer_all2allv", use flashinfer alltoallv kernels for mnnvl
-    "VLLM_ALL2ALL_BACKEND": env_with_choices(
-        "VLLM_ALL2ALL_BACKEND",
-        None,
-        [
-            "naive",
-            "pplx",
-            "deepep_high_throughput",
-            "deepep_low_latency",
-            "mori",
-            "allgather_reducescatter",
-            "flashinfer_all2allv",
-        ],
-    ),
    # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support.
    # Both require compute capability 10.0 or above.
    # Available options: