diff --git a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
index 8106f50f1..463969cbc 100644
--- a/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
+++ b/.buildkite/scripts/scheduled_integration_test/deepseek_v2_lite_ep_eplb.sh
@@ -43,7 +43,6 @@ trap cleanup EXIT
 
 for BACK in "${BACKENDS[@]}"; do
     VLLM_DEEP_GEMM_WARMUP=skip \
-    VLLM_ALL2ALL_BACKEND=$BACK \
     vllm serve "$MODEL" \
         --enforce-eager \
         --tensor-parallel-size 2 \
@@ -52,6 +51,7 @@ for BACK in "${BACKENDS[@]}"; do
         --enable-eplb \
         --trust-remote-code \
         --max-model-len 2048 \
+        --all2all-backend $BACK \
         --port $PORT &
     SERVER_PID=$!
     wait_for_server $PORT
diff --git a/tests/kernels/moe/modular_kernel_tools/common.py b/tests/kernels/moe/modular_kernel_tools/common.py
index 4ee18e342..327cd44f6 100644
--- a/tests/kernels/moe/modular_kernel_tools/common.py
+++ b/tests/kernels/moe/modular_kernel_tools/common.py
@@ -150,10 +150,7 @@ class Config:
             "VLLM_USE_DEEP_GEMM": str(int(self.needs_deep_gemm())),
         }
 
-        backend = self.all2all_backend()
-        vllm_config.parallel_config.all2all_backend = backend
-        if backend is not None:
-            env_dict.update({"VLLM_ALL2ALL_BACKEND": backend})
+        vllm_config.parallel_config.all2all_backend = self.all2all_backend()
 
         if self.fused_moe_chunk_size is not None:
             env_dict.update(
diff --git a/vllm/config/parallel.py b/vllm/config/parallel.py
index 4bc12b986..fa1aa0312 100644
--- a/vllm/config/parallel.py
+++ b/vllm/config/parallel.py
@@ -549,15 +549,6 @@ class ParallelConfig:
         return hash_factors(factors)
 
     def __post_init__(self) -> None:
-        # Set all2all_backend from env var if not specified, with deprecation warning
-        if envs.is_set("VLLM_ALL2ALL_BACKEND"):
-            logger.warning_once(
-                "VLLM_ALL2ALL_BACKEND environment variable is deprecated and "
-                "will be removed in v0.15.0. Please use the "
-                "--all2all-backend command-line argument instead."
-            )
-            self.all2all_backend = envs.VLLM_ALL2ALL_BACKEND
-        # Continue with the rest of the initialization
 
         self.world_size = (
             self.pipeline_parallel_size
diff --git a/vllm/envs.py b/vllm/envs.py
index 5fe65fa75..f9aaa4f38 100755
--- a/vllm/envs.py
+++ b/vllm/envs.py
@@ -171,15 +171,6 @@ if TYPE_CHECKING:
     VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost"
     VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5600
     VLLM_MOONCAKE_BOOTSTRAP_PORT: int = 8998
-    VLLM_ALL2ALL_BACKEND: Literal[
-        "naive",
-        "pplx",
-        "deepep_high_throughput",
-        "deepep_low_latency",
-        "mori",
-        "allgather_reducescatter",
-        "flashinfer_all2allv",
-    ] = "allgather_reducescatter"
     VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
     VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
     VLLM_SLEEP_WHEN_IDLE: bool = False
@@ -1292,30 +1283,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
     "VLLM_MOONCAKE_BOOTSTRAP_PORT": lambda: int(
         os.getenv("VLLM_MOONCAKE_BOOTSTRAP_PORT", "8998")
     ),
-    # [DEPRECATED - will be removed in v0.15.0] all2all backend for vllm's
-    # expert parallel communication. Use --all2all-backend CLI argument instead.
-    # Available options:
-    # - "naive": naive all2all implementation using broadcasts
-    # - "allgather_reducescatter": all2all implementation based on allgather and
-    # reducescatter
-    # - "pplx": use pplx kernels
-    # - "deepep_high_throughput", use deepep high-throughput kernels
-    # - "deepep_low_latency", use deepep low-latency kernels
-    # - "mori", use MoRI kernels
-    # - "flashinfer_all2allv", use flashinfer alltoallv kernels for mnnvl
-    "VLLM_ALL2ALL_BACKEND": env_with_choices(
-        "VLLM_ALL2ALL_BACKEND",
-        None,
-        [
-            "naive",
-            "pplx",
-            "deepep_high_throughput",
-            "deepep_low_latency",
-            "mori",
-            "allgather_reducescatter",
-            "flashinfer_all2allv",
-        ],
-    ),
     # Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support.
     # Both require compute capability 10.0 or above.
     # Available options: