[Misc] Remove deprecated VLLM_ALL2ALL_BACKEND environment variable (#33535)
Signed-off-by: carlory <baofa.fan@daocloud.io>
Co-authored-by: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
@@ -43,7 +43,6 @@ trap cleanup EXIT
|
||||
|
||||
for BACK in "${BACKENDS[@]}"; do
|
||||
VLLM_DEEP_GEMM_WARMUP=skip \
|
||||
VLLM_ALL2ALL_BACKEND=$BACK \
|
||||
vllm serve "$MODEL" \
|
||||
--enforce-eager \
|
||||
--tensor-parallel-size 2 \
|
||||
@@ -52,6 +51,7 @@ for BACK in "${BACKENDS[@]}"; do
|
||||
--enable-eplb \
|
||||
--trust-remote-code \
|
||||
--max-model-len 2048 \
|
||||
--all2all-backend $BACK \
|
||||
--port $PORT &
|
||||
SERVER_PID=$!
|
||||
wait_for_server $PORT
|
||||
|
||||
@@ -150,10 +150,7 @@ class Config:
|
||||
"VLLM_USE_DEEP_GEMM": str(int(self.needs_deep_gemm())),
|
||||
}
|
||||
|
||||
backend = self.all2all_backend()
|
||||
vllm_config.parallel_config.all2all_backend = backend
|
||||
if backend is not None:
|
||||
env_dict.update({"VLLM_ALL2ALL_BACKEND": backend})
|
||||
vllm_config.parallel_config.all2all_backend = self.all2all_backend()
|
||||
|
||||
if self.fused_moe_chunk_size is not None:
|
||||
env_dict.update(
|
||||
|
||||
@@ -549,15 +549,6 @@ class ParallelConfig:
|
||||
return hash_factors(factors)
|
||||
|
||||
def __post_init__(self) -> None:
|
||||
# Set all2all_backend from env var if not specified, with deprecation warning
|
||||
if envs.is_set("VLLM_ALL2ALL_BACKEND"):
|
||||
logger.warning_once(
|
||||
"VLLM_ALL2ALL_BACKEND environment variable is deprecated and "
|
||||
"will be removed in v0.15.0. Please use the "
|
||||
"--all2all-backend command-line argument instead."
|
||||
)
|
||||
self.all2all_backend = envs.VLLM_ALL2ALL_BACKEND
|
||||
|
||||
# Continue with the rest of the initialization
|
||||
self.world_size = (
|
||||
self.pipeline_parallel_size
|
||||
|
||||
vllm/envs.py (33 lines changed)
@@ -171,15 +171,6 @@ if TYPE_CHECKING:
|
||||
VLLM_NIXL_SIDE_CHANNEL_HOST: str = "localhost"
|
||||
VLLM_NIXL_SIDE_CHANNEL_PORT: int = 5600
|
||||
VLLM_MOONCAKE_BOOTSTRAP_PORT: int = 8998
|
||||
VLLM_ALL2ALL_BACKEND: Literal[
|
||||
"naive",
|
||||
"pplx",
|
||||
"deepep_high_throughput",
|
||||
"deepep_low_latency",
|
||||
"mori",
|
||||
"allgather_reducescatter",
|
||||
"flashinfer_all2allv",
|
||||
] = "allgather_reducescatter"
|
||||
VLLM_MAX_TOKENS_PER_EXPERT_FP4_MOE: int = 163840
|
||||
VLLM_TOOL_PARSE_REGEX_TIMEOUT_SECONDS: int = 1
|
||||
VLLM_SLEEP_WHEN_IDLE: bool = False
|
||||
@@ -1292,30 +1283,6 @@ environment_variables: dict[str, Callable[[], Any]] = {
|
||||
"VLLM_MOONCAKE_BOOTSTRAP_PORT": lambda: int(
|
||||
os.getenv("VLLM_MOONCAKE_BOOTSTRAP_PORT", "8998")
|
||||
),
|
||||
# [DEPRECATED - will be removed in v0.15.0] all2all backend for vllm's
|
||||
# expert parallel communication. Use --all2all-backend CLI argument instead.
|
||||
# Available options:
|
||||
# - "naive": naive all2all implementation using broadcasts
|
||||
# - "allgather_reducescatter": all2all implementation based on allgather and
|
||||
# reducescatter
|
||||
# - "pplx": use pplx kernels
|
||||
# - "deepep_high_throughput", use deepep high-throughput kernels
|
||||
# - "deepep_low_latency", use deepep low-latency kernels
|
||||
# - "mori", use MoRI kernels
|
||||
# - "flashinfer_all2allv", use flashinfer alltoallv kernels for mnnvl
|
||||
"VLLM_ALL2ALL_BACKEND": env_with_choices(
|
||||
"VLLM_ALL2ALL_BACKEND",
|
||||
None,
|
||||
[
|
||||
"naive",
|
||||
"pplx",
|
||||
"deepep_high_throughput",
|
||||
"deepep_low_latency",
|
||||
"mori",
|
||||
"allgather_reducescatter",
|
||||
"flashinfer_all2allv",
|
||||
],
|
||||
),
|
||||
# Flashinfer MoE backend for vLLM's fused Mixture-of-Experts support.
|
||||
# Both require compute capability 10.0 or above.
|
||||
# Available options:
|
||||
|
||||
Reference in New Issue
Block a user