[EPLB][BugFix]Possible deadlock fix (#32418)
Signed-off-by: ilmarkov <markovilya197@gmail.com> Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com> Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
This commit is contained in:
@@ -970,8 +970,23 @@ class EplbState:
|
||||
ep_group: ProcessGroup,
|
||||
is_profile: bool = False,
|
||||
):
|
||||
if not model_state.buffer_lock.acquire(blocking=False):
|
||||
return
|
||||
# We call move_to_workspace only when ep_buffer_ready is 1.
|
||||
# It means we only need to wait for the lock for a short time.
|
||||
max_retries = 6 # 1 minute max
|
||||
retries = 0
|
||||
while not model_state.buffer_lock.acquire(blocking=True, timeout=10.0):
|
||||
retries += 1
|
||||
if retries >= max_retries:
|
||||
raise RuntimeError(
|
||||
f"Rank {ep_group.rank()}: buffer_lock timeout after "
|
||||
"{max_retries * 10}s"
|
||||
)
|
||||
logger.warning(
|
||||
"Rank %d: EPLB buffer_lock acquire failed, retrying (%d/%d)",
|
||||
ep_group.rank(),
|
||||
retries,
|
||||
max_retries,
|
||||
)
|
||||
try:
|
||||
assert model_state.new_physical_to_logical_map is not None
|
||||
device_index = model_state.cuda_device_index or self.cuda_device_index
|
||||
|
||||
Reference in New Issue
Block a user