[EPLB] Enforce synchronous EPLB for NCCL-based all2all backends (#35212)
Signed-off-by: ilmarkov <markovilya197@gmail.com>
This commit is contained in:
@@ -774,6 +774,17 @@ class ParallelConfig:
|
||||
"backend is mp, uni or external_launcher."
|
||||
)
|
||||
|
||||
if (
|
||||
self.all2all_backend in ("allgather_reducescatter", "naive")
|
||||
and self.eplb_config.use_async
|
||||
):
|
||||
logger.warning(
|
||||
"Async EPLB causes hangs with the '%s' all2all backend. "
|
||||
"Forcing synchronous EPLB.",
|
||||
self.all2all_backend,
|
||||
)
|
||||
self.eplb_config.use_async = False
|
||||
|
||||
@property
|
||||
def use_ray(self) -> bool:
|
||||
return self.distributed_executor_backend == "ray" or (
|
||||
|
||||
Reference in New Issue
Block a user