From c9a533079cc8c991c527eab029dbc990b4dc9d5d Mon Sep 17 00:00:00 2001
From: Ilya Markov
Date: Fri, 16 Jan 2026 15:11:01 +0100
Subject: [PATCH] [EPLB][BugFix]Possible deadlock fix (#32418)

Signed-off-by: ilmarkov
Signed-off-by: Tyler Michael Smith
Co-authored-by: Tyler Michael Smith
---
Review note: the second fragment of the RuntimeError message now carries its
own f-prefix; without it "{max_retries * 10}s" is emitted literally instead of
the computed timeout. The post-image blob id on the index line predates this
one-character correction.

 vllm/distributed/eplb/eplb_state.py | 19 +++++++++++++++++--
 1 file changed, 17 insertions(+), 2 deletions(-)

diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py
index 26571cd80..424c2235c 100644
--- a/vllm/distributed/eplb/eplb_state.py
+++ b/vllm/distributed/eplb/eplb_state.py
@@ -970,8 +970,23 @@ class EplbState:
         ep_group: ProcessGroup,
         is_profile: bool = False,
     ):
-        if not model_state.buffer_lock.acquire(blocking=False):
-            return
+        # We call move_to_workspace only when ep_buffer_ready is 1.
+        # It means we only need to wait for the lock for a short time.
+        max_retries = 6  # 1 minute max
+        retries = 0
+        while not model_state.buffer_lock.acquire(blocking=True, timeout=10.0):
+            retries += 1
+            if retries >= max_retries:
+                raise RuntimeError(
+                    f"Rank {ep_group.rank()}: buffer_lock timeout after "
+                    f"{max_retries * 10}s"
+                )
+            logger.warning(
+                "Rank %d: EPLB buffer_lock acquire failed, retrying (%d/%d)",
+                ep_group.rank(),
+                retries,
+                max_retries,
+            )
         try:
             assert model_state.new_physical_to_logical_map is not None
             device_index = model_state.cuda_device_index or self.cuda_device_index