[BugFix] Fix async EPLB hang with DeepEP LL all2all backend (#32860)
Signed-off-by: ilmarkov <markovilya197@gmail.com>
This commit is contained in:
54
vllm/distributed/eplb/eplb_utils.py
Normal file
54
vllm/distributed/eplb/eplb_utils.py
Normal file
@@ -0,0 +1,54 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
"""Utility functions for EPLB (Expert Parallel Load Balancing)."""
|
||||
|
||||
import os
|
||||
|
||||
from vllm.config import ParallelConfig
|
||||
from vllm.logger import init_logger
|
||||
|
||||
logger = init_logger(__name__)
|
||||
|
||||
|
||||
def override_envs_for_eplb(parallel_config: ParallelConfig) -> None:
    """
    Override environment variables for EPLB when specific conditions are met.

    Args:
        parallel_config: The parallel configuration object.
    """
    # The workaround below is only needed when all four conditions hold:
    # expert weights are exchanged asynchronously (async EPLB) across data
    # parallel ranks while the DeepEP low-latency all2all backend is active.
    workaround_required = (
        parallel_config.data_parallel_size > 1
        and parallel_config.enable_eplb
        and parallel_config.all2all_backend == "deepep_low_latency"
        and parallel_config.eplb_config.use_async
    )
    if not workaround_required:
        return

    # Override NCCL_MAX_CTAS to avoid hangs when using async EPLB with the
    # DeepEP low-latency backend.
    #
    # The hang happens when two ranks interleave kernel launches differently
    # between NCCL collectives (used by async EPLB weight exchange) and DeepEP
    # low-latency (LL) kernels. DeepEP LL uses a cooperative launch and tries
    # to reserve a large fraction of the GPU's SMs; if those SMs are currently
    # occupied by NCCL, the DeepEP LL launch blocks until enough SMs are
    # freed.
    #
    # If rank A enters DeepEP LL in main thread while rank B is still executing
    # NCCL in async thread, rank A can block waiting for SMs, while rank B can
    # block inside NCCL waiting for rank A to participate in the collective.
    # This circular wait causes a deadlock.
    # Limiting NCCL occupancy via NCCL_MAX_CTAS leaves space for the DeepEP
    # cooperative kernel to launch and complete, breaking the deadlock.
    # See: https://github.com/deepseek-ai/DeepEP/issues/496
    existing_value = os.getenv("NCCL_MAX_CTAS")
    if existing_value and existing_value.isdigit():
        # A valid numeric value was set explicitly by the user; honor it.
        return

    override_value = 8
    os.environ["NCCL_MAX_CTAS"] = str(override_value)
    logger.info_once(
        f"EPLB: Setting NCCL_MAX_CTAS={override_value} "
        "for expert parallel with EPLB and deepep_low_latency backend",
        scope="global",
    )
|
||||
@@ -22,6 +22,7 @@ from vllm.distributed import (
|
||||
set_custom_all_reduce,
|
||||
)
|
||||
from vllm.distributed.ec_transfer import ensure_ec_transfer_initialized
|
||||
from vllm.distributed.eplb.eplb_utils import override_envs_for_eplb
|
||||
from vllm.distributed.kv_transfer import (
|
||||
ensure_kv_transfer_initialized,
|
||||
ensure_kv_transfer_shutdown,
|
||||
@@ -1035,6 +1036,7 @@ def init_worker_distributed_environment(
|
||||
from vllm.model_executor.layers.batch_invariant import init_batch_invariance
|
||||
|
||||
init_batch_invariance(attention_config.backend)
|
||||
override_envs_for_eplb(parallel_config)
|
||||
set_custom_all_reduce(not parallel_config.disable_custom_all_reduce)
|
||||
|
||||
init_method = distributed_init_method or "env://"
|
||||
|
||||
Reference in New Issue
Block a user