[EPLB] Enforce synchronous EPLB for NCCL-based all2all backends (#35212)
Signed-off-by: ilmarkov <markovilya197@gmail.com>
This commit is contained in:
@@ -774,6 +774,17 @@ class ParallelConfig:
|
|||||||
"backend is mp, uni or external_launcher."
|
"backend is mp, uni or external_launcher."
|
||||||
)
|
)
|
||||||
|
|
||||||
|
if (
|
||||||
|
self.all2all_backend in ("allgather_reducescatter", "naive")
|
||||||
|
and self.eplb_config.use_async
|
||||||
|
):
|
||||||
|
logger.warning(
|
||||||
|
"Async EPLB causes hangs with the '%s' all2all backend. "
|
||||||
|
"Forcing synchronous EPLB.",
|
||||||
|
self.all2all_backend,
|
||||||
|
)
|
||||||
|
self.eplb_config.use_async = False
|
||||||
|
|
||||||
@property
|
@property
|
||||||
def use_ray(self) -> bool:
|
def use_ray(self) -> bool:
|
||||||
return self.distributed_executor_backend == "ray" or (
|
return self.distributed_executor_backend == "ray" or (
|
||||||
|
|||||||
Reference in New Issue
Block a user