[Misc] Remove deprecated VLLM_ALL2ALL_BACKEND environment variable (#33535)

Signed-off-by: carlory <baofa.fan@daocloud.io>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
杨朱 · Kiki
2026-02-03 15:01:59 +08:00
committed by GitHub
parent 61397891ce
commit b95cc5014d
4 changed files with 2 additions and 47 deletions

View File

@@ -43,7 +43,6 @@ trap cleanup EXIT
for BACK in "${BACKENDS[@]}"; do
VLLM_DEEP_GEMM_WARMUP=skip \
VLLM_ALL2ALL_BACKEND=$BACK \
vllm serve "$MODEL" \
--enforce-eager \
--tensor-parallel-size 2 \
@@ -52,6 +51,7 @@ for BACK in "${BACKENDS[@]}"; do
--enable-eplb \
--trust-remote-code \
--max-model-len 2048 \
--all2all-backend $BACK \
--port $PORT &
SERVER_PID=$!
wait_for_server $PORT

View File

@@ -150,10 +150,7 @@ class Config:
"VLLM_USE_DEEP_GEMM": str(int(self.needs_deep_gemm())),
}
backend = self.all2all_backend()
vllm_config.parallel_config.all2all_backend = backend
if backend is not None:
env_dict.update({"VLLM_ALL2ALL_BACKEND": backend})
vllm_config.parallel_config.all2all_backend = self.all2all_backend()
if self.fused_moe_chunk_size is not None:
env_dict.update(

View File

@@ -549,15 +549,6 @@ class ParallelConfig:
return hash_factors(factors)
def __post_init__(self) -> None:
# Set all2all_backend from env var if not specified, with deprecation warning
if envs.is_set("VLLM_ALL2ALL_BACKEND"):
logger.warning_once(
"VLLM_ALL2ALL_BACKEND environment variable is deprecated and "
"will be removed in v0.15.0. Please use the "
"--all2all-backend command-line argument instead."
)
self.all2all_backend = envs.VLLM_ALL2ALL_BACKEND
# Continue with the rest of the initialization
self.world_size = (
self.pipeline_parallel_size

View File

@@ -171,15 +171,6 @@ if TYPE_CHECKING:
VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost"
VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5600
VLLM_MOONCAKE_BOOTSTRAP_PORT: int = 8998
VLLM_ALL2ALL_BACKEND: Literal[
"naive",
"pplx",
"deepep_high_throughput",
"deepep_low_latency",
"mori",
"allgather_reducescatter",
"flashinfer_all2allv",
] = "allgather_reducescatter"
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
VLLM_SLEEP_WHEN_IDLE: bool = False
@@ -1292,30 +1283,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
"VLLM_MOONCAKE_BOOTSTRAP_PORT": lambda: int(
os.getenv("VLLM_MOONCAKE_BOOTSTRAP_PORT", "8998")
),
# [DEPRECATED - will be removed in v0.15.0] all2all backend for vllm's
# expert parallel communication. Use --all2all-backend CLI argument instead.
# Available options:
# - "naive": naive all2all implementation using broadcasts
# - "allgather_reducescatter": all2all implementation based on allgather and
# reducescatter
# - "pplx": use pplx kernels
# - "deepep_high_throughput": use deepep high-throughput kernels
# - "deepep_low_latency": use deepep low-latency kernels
# - "mori": use MoRI kernels
# - "flashinfer_all2allv", use flashinfer alltoallv kernels for mnnvl
"VLLM_ALL2ALL_BACKEND": env_with_choices(
"VLLM_ALL2ALL_BACKEND",
None,
[
"naive",
"pplx",
"deepep_high_throughput",
"deepep_low_latency",
"mori",
"allgather_reducescatter",
"flashinfer_all2allv",
],
),
# Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support.
# Both require compute capability 10.0 or above.
# Available options: