[UX] Replace VLLM_ALL2ALL_BACKEND with --all2all-backend (#26732)

Signed-off-by: mgoin <mgoin64@gmail.com>
This commit is contained in:
Michael Goin
2025-10-13 21:12:52 -04:00
committed by GitHub
parent 8317f72354
commit 3e051bda82
12 changed files with 90 additions and 51 deletions

View File

@@ -192,7 +192,7 @@ class CudaPlatformBase(Platform):
compilation_config = vllm_config.compilation_config
if (
-envs.VLLM_ALL2ALL_BACKEND == "deepep_high_throughput"
+parallel_config.all2all_backend == "deepep_high_throughput"
and parallel_config.data_parallel_size > 1
and compilation_config.cudagraph_mode != CUDAGraphMode.NONE
):
@@ -204,7 +204,7 @@ class CudaPlatformBase(Platform):
"kernels are optimized for prefill and are incompatible with "
"CUDA Graphs. "
"In order to use CUDA Graphs for decode-optimized workloads, "
-"set VLLM_ALL2ALL_BACKEND to another option, such as "
+"use --all2all-backend with another option, such as "
"deepep_low_latency, pplx, or allgather_reducescatter."
)
compilation_config.cudagraph_mode = CUDAGraphMode.NONE