diff --git a/vllm/distributed/eplb/eplb_utils.py b/vllm/distributed/eplb/eplb_utils.py new file mode 100644 index 000000000..455848341 --- /dev/null +++ b/vllm/distributed/eplb/eplb_utils.py @@ -0,0 +1,54 @@ +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +"""Utility functions for EPLB (Expert Parallel Load Balancing).""" + +import os + +from vllm.config import ParallelConfig +from vllm.logger import init_logger + +logger = init_logger(__name__) + + +def override_envs_for_eplb(parallel_config: ParallelConfig) -> None: + """ + Override environment variables for EPLB when specific conditions are met. + + Args: + parallel_config: The parallel configuration object. + """ + is_data_parallel = parallel_config.data_parallel_size > 1 + is_eplb_enabled = parallel_config.enable_eplb + async_eplb = parallel_config.eplb_config.use_async + is_deepep_ll = parallel_config.all2all_backend == "deepep_low_latency" + + # Override NCCL_MAX_CTAS to avoid hangs when using async EPLB with the + # DeepEP low-latency backend. + # + # The hang happens when two ranks interleave kernel launches differently + # between NCCL collectives (used by async EPLB weight exchange) and DeepEP + # low-latency (LL) kernels. DeepEP LL uses a cooperative launch and tries + # to reserve a large fraction of the GPU's SMs; if those SMs are currently + # occupied by NCCL, the DeepEP LL launch blocks until enough SMs are + # freed. + # + # If rank A enters DeepEP LL in main thread while rank B is still executing + # NCCL in async thread, rank A can block waiting for SMs, while rank B can + # block inside NCCL waiting for rank A to participate in the collective. + # This circular wait causes a deadlock. + # Limiting NCCL occupancy via NCCL_MAX_CTAS leaves space for the DeepEP + # cooperative kernel to launch and complete, breaking the deadlock. + # See: https://github.com/deepseek-ai/DeepEP/issues/496 + if is_data_parallel and is_eplb_enabled and is_deepep_ll and async_eplb: + current_value_str = os.getenv("NCCL_MAX_CTAS") + + if current_value_str and current_value_str.isdigit(): + return + + override_value = 8 + os.environ["NCCL_MAX_CTAS"] = str(override_value) + logger.info_once( + f"EPLB: Setting NCCL_MAX_CTAS={override_value} " + "for expert parallel with EPLB and deepep_low_latency backend", + scope="global", + ) diff --git a/vllm/v1/worker/gpu_worker.py b/vllm/v1/worker/gpu_worker.py index 1c526bab9..2b7d9ff29 100644 --- a/vllm/v1/worker/gpu_worker.py +++ b/vllm/v1/worker/gpu_worker.py @@ -22,6 +22,7 @@ from vllm.distributed import ( set_custom_all_reduce, ) from vllm.distributed.ec_transfer import ensure_ec_transfer_initialized +from vllm.distributed.eplb.eplb_utils import override_envs_for_eplb from vllm.distributed.kv_transfer import ( ensure_kv_transfer_initialized, ensure_kv_transfer_shutdown, @@ -1035,6 +1036,7 @@ def init_worker_distributed_environment( from vllm.model_executor.layers.batch_invariant import init_batch_invariance init_batch_invariance(attention_config.backend) + override_envs_for_eplb(parallel_config) set_custom_all_reduce(not parallel_config.disable_custom_all_reduce) init_method = distributed_init_method or "env://"