[EPLB] Remove main-thread waits when EPLB is slow (#36271)
Signed-off-by: ilmarkov <markovilya197@gmail.com>
This commit is contained in:
@@ -160,9 +160,9 @@ async def transfer_run_periodically(
|
||||
is_profile=is_profile,
|
||||
cuda_stream=cuda_stream,
|
||||
)
|
||||
event = torch.cuda.Event(blocking=False)
|
||||
cuda_stream.record_event(event)
|
||||
model_state.buffer_ready_event = event
|
||||
# Block the async thread until the transfer to
|
||||
# the intermediate buffer is complete.
|
||||
cuda_stream.synchronize()
|
||||
model_state.ep_buffer_ready = 1
|
||||
finally:
|
||||
model_state.buffer_lock.release()
|
||||
|
||||
@@ -176,11 +176,6 @@ class EplbModelState:
|
||||
"""
|
||||
The lock to protect the expert buffer.
|
||||
"""
|
||||
buffer_ready_event: torch.cuda.Event | None
|
||||
"""
|
||||
CUDA event recorded when the async worker finishes filling the buffer.
|
||||
The main thread waits on this before consuming the buffer.
|
||||
"""
|
||||
buffer_consumed_event: torch.cuda.Event | None
|
||||
"""
|
||||
CUDA event recorded after the main thread finishes consuming the buffer.
|
||||
@@ -480,7 +475,6 @@ class EplbState:
|
||||
model=model,
|
||||
expert_buffer=expert_buffer,
|
||||
buffer_lock=threading.Lock(),
|
||||
buffer_ready_event=None,
|
||||
buffer_consumed_event=None,
|
||||
window_ready_event=None,
|
||||
ep_buffer_ready=0,
|
||||
@@ -919,11 +913,6 @@ class EplbState:
|
||||
)
|
||||
try:
|
||||
assert model_state.new_physical_to_logical_map is not None
|
||||
device_index = model_state.cuda_device_index or self.cuda_device_index
|
||||
if model_state.buffer_ready_event is not None and device_index is not None:
|
||||
stream = torch.cuda.current_stream(device=device_index)
|
||||
stream.wait_event(model_state.buffer_ready_event)
|
||||
model_state.buffer_ready_event = None
|
||||
expert_weights = model_state.model.expert_weights[
|
||||
model_state.layer_to_transfer
|
||||
]
|
||||
|
||||
Reference in New Issue
Block a user