From 14acf429ac08b6d538ca6feb3e06b6d13895804d Mon Sep 17 00:00:00 2001 From: Ilya Markov Date: Tue, 24 Mar 2026 12:50:44 +0100 Subject: [PATCH] [EPLB] Remove main waits in case of slow EPLB (#36271) Signed-off-by: ilmarkov --- vllm/distributed/eplb/async_worker.py | 6 +++--- vllm/distributed/eplb/eplb_state.py | 11 ----------- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/vllm/distributed/eplb/async_worker.py b/vllm/distributed/eplb/async_worker.py index 781465869..7cb8805f4 100644 --- a/vllm/distributed/eplb/async_worker.py +++ b/vllm/distributed/eplb/async_worker.py @@ -160,9 +160,9 @@ async def transfer_run_periodically( is_profile=is_profile, cuda_stream=cuda_stream, ) - event = torch.cuda.Event(blocking=False) - cuda_stream.record_event(event) - model_state.buffer_ready_event = event + # block the async thread until the transfer to + # the intermediate buffer is complete. + cuda_stream.synchronize() model_state.ep_buffer_ready = 1 finally: model_state.buffer_lock.release() diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py index 6081ccca4..7c54f28b4 100644 --- a/vllm/distributed/eplb/eplb_state.py +++ b/vllm/distributed/eplb/eplb_state.py @@ -176,11 +176,6 @@ class EplbModelState: """ The lock to protect the expert buffer. """ - buffer_ready_event: torch.cuda.Event | None - """ - CUDA event recorded when the async worker finishes filling the buffer. - The main thread waits on this before consuming the buffer. - """ buffer_consumed_event: torch.cuda.Event | None """ CUDA event recorded after the main thread finishes consuming the buffer. @@ -480,7 +475,6 @@ class EplbState: model=model, expert_buffer=expert_buffer, buffer_lock=threading.Lock(), - buffer_ready_event=None, buffer_consumed_event=None, window_ready_event=None, ep_buffer_ready=0, @@ -919,11 +913,6 @@ class EplbState: ) try: assert model_state.new_physical_to_logical_map is not None - device_index = model_state.cuda_device_index or self.cuda_device_index - if model_state.buffer_ready_event is not None and device_index is not None: - stream = torch.cuda.current_stream(device=device_index) - stream.wait_event(model_state.buffer_ready_event) - model_state.buffer_ready_event = None expert_weights = model_state.model.expert_weights[ model_state.layer_to_transfer ]