From 14acf429ac08b6d538ca6feb3e06b6d13895804d Mon Sep 17 00:00:00 2001
From: Ilya Markov <markovilya197@gmail.com>
Date: Tue, 24 Mar 2026 12:50:44 +0100
Subject: [PATCH] [EPLB] Remove main waits in case of slow EPLB (#36271)

Signed-off-by: ilmarkov <markovilya197@gmail.com>
---
 vllm/distributed/eplb/async_worker.py |  6 +++---
 vllm/distributed/eplb/eplb_state.py   | 11 -----------
 2 files changed, 3 insertions(+), 14 deletions(-)

diff --git a/vllm/distributed/eplb/async_worker.py b/vllm/distributed/eplb/async_worker.py
index 781465869..7cb8805f4 100644
--- a/vllm/distributed/eplb/async_worker.py
+++ b/vllm/distributed/eplb/async_worker.py
@@ -160,9 +160,9 @@ async def transfer_run_periodically(
                             is_profile=is_profile,
                             cuda_stream=cuda_stream,
                         )
-                        event = torch.cuda.Event(blocking=False)
-                        cuda_stream.record_event(event)
-                        model_state.buffer_ready_event = event
+                        # Block the async worker thread until the transfer into
+                        # the intermediate buffer has completed on cuda_stream.
+                        cuda_stream.synchronize()
                         model_state.ep_buffer_ready = 1
                     finally:
                         model_state.buffer_lock.release()
diff --git a/vllm/distributed/eplb/eplb_state.py b/vllm/distributed/eplb/eplb_state.py
index 6081ccca4..7c54f28b4 100644
--- a/vllm/distributed/eplb/eplb_state.py
+++ b/vllm/distributed/eplb/eplb_state.py
@@ -176,11 +176,6 @@ class EplbModelState:
     """
     The lock to protect the expert buffer.
     """
-    buffer_ready_event: torch.cuda.Event | None
-    """
-    CUDA event recorded when the async worker finishes filling the buffer.
-    The main thread waits on this before consuming the buffer.
-    """
     buffer_consumed_event: torch.cuda.Event | None
     """
     CUDA event recorded after the main thread finishes consuming the buffer.
@@ -480,7 +475,6 @@ class EplbState:
             model=model,
             expert_buffer=expert_buffer,
             buffer_lock=threading.Lock(),
-            buffer_ready_event=None,
             buffer_consumed_event=None,
             window_ready_event=None,
             ep_buffer_ready=0,
@@ -919,11 +913,6 @@ class EplbState:
             )
         try:
             assert model_state.new_physical_to_logical_map is not None
-            device_index = model_state.cuda_device_index or self.cuda_device_index
-            if model_state.buffer_ready_event is not None and device_index is not None:
-                stream = torch.cuda.current_stream(device=device_index)
-                stream.wait_event(model_state.buffer_ready_event)
-                model_state.buffer_ready_event = None
             expert_weights = model_state.model.expert_weights[
                 model_state.layer_to_transfer
             ]