[BugFix] Fix async EPLB hang with DeepEP LL all2all backend (#32860)

Signed-off-by: ilmarkov <markovilya197@gmail.com>
This commit is contained in:
Ilya Markov
2026-02-10 23:34:47 +01:00
committed by GitHub
parent 67132945bb
commit bb2fc8b5e7
2 changed files with 56 additions and 0 deletions

View File

@@ -0,0 +1,54 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""Utility functions for EPLB (Expert Parallel Load Balancing)."""
import os
from vllm.config import ParallelConfig
from vllm.logger import init_logger
logger = init_logger(__name__)
def override_envs_for_eplb(parallel_config: ParallelConfig) -> None:
"""
Override environment variables for EPLB when specific conditions are met.
Args:
parallel_config: The parallel configuration object.
"""
is_data_parallel = parallel_config.data_parallel_size > 1
is_eplb_enabled = parallel_config.enable_eplb
async_eplb = parallel_config.eplb_config.use_async
is_deepep_ll = parallel_config.all2all_backend == "deepep_low_latency"
# Override NCCL_MAX_CTAS to avoid hangs when using async EPLB with the
# DeepEP low-latency backend.
#
# The hang happens when two ranks interleave kernel launches differently
# between NCCL collectives (used by async EPLB weight exchange) and DeepEP
# low-latency (LL) kernels. DeepEP LL uses a cooperative launch and tries
# to reserve a large fraction of the GPU's SMs; if those SMs are currently
# occupied by NCCL, the DeepEP LL launch blocks until enough SMs are
# freed.
#
# If rank A enters DeepEP LL in main thread while rank B is still executing
# NCCL in async thread, rank A can block waiting for SMs, while rank B can
# block inside NCCL waiting for rank A to participate in the collective.
# This circular wait causes a deadlock.
# Limiting NCCL occupancy via NCCL_MAX_CTAS leaves space for the DeepEP
# cooperative kernel to launch and complete, breaking the deadlock.
# See: https://github.com/deepseek-ai/DeepEP/issues/496
if is_data_parallel and is_eplb_enabled and is_deepep_ll and async_eplb:
current_value_str = os.getenv("NCCL_MAX_CTAS")
if current_value_str and current_value_str.isdigit():
return
override_value = 8
os.environ["NCCL_MAX_CTAS"] = str(override_value)
logger.info_once(
f"EPLB: Setting NCCL_MAX_CTAS={override_value} "
"for expert parallel with EPLB and deepep_low_latency backend",
scope="global",
)

View File

@@ -22,6 +22,7 @@ from vllm.distributed import (
set_custom_all_reduce,
)
from vllm.distributed.ec_transfer import ensure_ec_transfer_initialized
from vllm.distributed.eplb.eplb_utils import override_envs_for_eplb
from vllm.distributed.kv_transfer import (
ensure_kv_transfer_initialized,
ensure_kv_transfer_shutdown,
@@ -1035,6 +1036,7 @@ def init_worker_distributed_environment(
from vllm.model_executor.layers.batch_invariant import init_batch_invariance
init_batch_invariance(attention_config.backend)
override_envs_for_eplb(parallel_config)
set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)
init_method = distributed_init_method or "env://"