[EPLB][BugFix] Possible deadlock fix (#32418)

Signed-off-by: ilmarkov <markovilya197@gmail.com>
Signed-off-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
Co-authored-by: Tyler Michael Smith <tlrmchlsmth@gmail.com>
This commit is contained in:
Ilya Markov
2026-01-16 15:11:01 +01:00
committed by GitHub
parent 6ca4f400d8
commit c9a533079c

View File

@@ -970,8 +970,23 @@ class EplbState:
ep_group: ProcessGroup,
is_profile: bool = False,
):
if not model_state.buffer_lock.acquire(blocking=False):
return
# We call move_to_workspace only when ep_buffer_ready is 1.
# It means we only need to wait for the lock for a short time.
max_retries = 6 # 1 minute max
retries = 0
while not model_state.buffer_lock.acquire(blocking=True, timeout=10.0):
retries += 1
if retries >= max_retries:
raise RuntimeError(
f"Rank {ep_group.rank()}: buffer_lock timeout after "
"{max_retries * 10}s"
)
logger.warning(
"Rank %d: EPLB buffer_lock acquire failed, retrying (%d/%d)",
ep_group.rank(),
retries,
max_retries,
)
try:
assert model_state.new_physical_to_logical_map is not None
device_index = model_state.cuda_device_index or self.cuda_device_index