[V1][DP] More robust DP/EP dummy request coordination (#16277)
Signed-off-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
@@ -325,7 +325,7 @@ class EngineCoreProc(EngineCore):
|
||||
|
||||
self.step_fn = (self.step if self.batch_queue is None else
|
||||
self.step_with_batch_queue)
|
||||
self.global_unfinished_reqs = False
|
||||
self.engines_running = False
|
||||
|
||||
# Background Threads and Queues for IO. These enable us to
|
||||
# overlap ZMQ socket IO with GPU since they release the GIL,
|
||||
@@ -410,8 +410,7 @@ class EngineCoreProc(EngineCore):
|
||||
"""Exits when an engine step needs to be performed."""
|
||||
|
||||
waited = False
|
||||
while not self.global_unfinished_reqs and not (
|
||||
self.scheduler.has_requests()):
|
||||
while not self.engines_running and not (self.scheduler.has_requests()):
|
||||
if logger.isEnabledFor(DEBUG) and self.input_queue.empty():
|
||||
logger.debug("EngineCore waiting for work.")
|
||||
waited = True
|
||||
@@ -419,10 +418,7 @@ class EngineCoreProc(EngineCore):
|
||||
self._handle_client_request(*req)
|
||||
|
||||
if waited:
|
||||
logger.debug(
|
||||
"EngineCore loop active - local unfinished: %s, finished: %s.",
|
||||
self.scheduler.has_unfinished_requests(),
|
||||
self.scheduler.has_finished_requests())
|
||||
logger.debug("EngineCore loop active.")
|
||||
|
||||
# Handle any more client requests.
|
||||
while not self.input_queue.empty():
|
||||
@@ -446,10 +442,6 @@ class EngineCoreProc(EngineCore):
|
||||
self.add_request(request)
|
||||
elif request_type == EngineCoreRequestType.ABORT:
|
||||
self.abort_requests(request)
|
||||
elif request_type == EngineCoreRequestType.START_DP:
|
||||
if not self.global_unfinished_reqs:
|
||||
logger.debug("EngineCore starting idle loop.")
|
||||
self.global_unfinished_reqs = True
|
||||
elif request_type == EngineCoreRequestType.UTILITY:
|
||||
call_id, method_name, args = request
|
||||
output = UtilityOutput(call_id)
|
||||
@@ -548,9 +540,6 @@ class EngineCoreProc(EngineCore):
|
||||
socket.send_multipart(buffers, copy=False)
|
||||
|
||||
|
||||
ENGINE_PAUSED_OUTPUTS = EngineCoreOutputs(engine_paused=True)
|
||||
|
||||
|
||||
class DPEngineCoreProc(EngineCoreProc):
|
||||
"""ZMQ-wrapper for running EngineCore in background process
|
||||
in a data parallel context."""
|
||||
@@ -587,7 +576,9 @@ class DPEngineCoreProc(EngineCoreProc):
|
||||
for i in range(local_dp_rank * tp_size, (local_dp_rank + 1) *
|
||||
tp_size))
|
||||
|
||||
self.local_dp_rank = local_dp_rank
|
||||
self.dp_group = vllm_config.parallel_config.stateless_init_dp_group()
|
||||
self.current_wave = 0
|
||||
|
||||
# Initialize the engine after setting up environment.
|
||||
super().__init__(input_path, output_path, vllm_config, executor_class,
|
||||
@@ -602,6 +593,31 @@ class DPEngineCoreProc(EngineCoreProc):
|
||||
if dp_group := getattr(self, "dp_group", None):
|
||||
stateless_destroy_torch_distributed_process_group(dp_group)
|
||||
|
||||
def add_request(self, request: EngineCoreRequest):
|
||||
if request.current_wave != self.current_wave:
|
||||
if request.current_wave > self.current_wave:
|
||||
self.current_wave = request.current_wave
|
||||
elif not self.engines_running:
|
||||
# Request received for an already-completed wave, notify
|
||||
# front-end that we need to start the next one.
|
||||
self.output_queue.put_nowait(
|
||||
EngineCoreOutputs(start_wave=self.current_wave))
|
||||
|
||||
super().add_request(request)
|
||||
|
||||
def _handle_client_request(self, request_type: EngineCoreRequestType,
|
||||
request: Any) -> None:
|
||||
if request_type == EngineCoreRequestType.START_DP_WAVE:
|
||||
new_wave: int = request
|
||||
if new_wave >= self.current_wave:
|
||||
self.current_wave = new_wave
|
||||
if not self.engines_running:
|
||||
logger.debug("EngineCore starting idle loop for wave %d.",
|
||||
new_wave)
|
||||
self.engines_running = True
|
||||
else:
|
||||
super()._handle_client_request(request_type, request)
|
||||
|
||||
def run_busy_loop(self):
|
||||
"""Core busy loop of the EngineCore for data parallel case."""
|
||||
|
||||
@@ -628,7 +644,7 @@ class DPEngineCoreProc(EngineCoreProc):
|
||||
# up-to-date state is returned in the engine outputs.
|
||||
self._process_engine_step()
|
||||
|
||||
if not self.global_unfinished_reqs:
|
||||
if not self.engines_running:
|
||||
# All engines are idle.
|
||||
continue
|
||||
|
||||
@@ -637,18 +653,23 @@ class DPEngineCoreProc(EngineCoreProc):
|
||||
self.execute_dummy_batch()
|
||||
|
||||
# 3) All-reduce operation to determine global unfinished reqs.
|
||||
self.global_unfinished_reqs = self._has_global_unfinished_reqs(
|
||||
self.engines_running = self._has_global_unfinished_reqs(
|
||||
local_unfinished_reqs)
|
||||
|
||||
if not self.global_unfinished_reqs:
|
||||
# Notify client that we are pausing the loop.
|
||||
self.output_queue.put_nowait(ENGINE_PAUSED_OUTPUTS)
|
||||
if not self.engines_running:
|
||||
if self.local_dp_rank == 0:
|
||||
# Notify client that we are pausing the loop.
|
||||
logger.debug("Wave %d finished, pausing engine loop.",
|
||||
self.current_wave)
|
||||
self.output_queue.put_nowait(
|
||||
EngineCoreOutputs(wave_complete=self.current_wave))
|
||||
self.current_wave += 1
|
||||
|
||||
def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool:
|
||||
|
||||
# Optimization - only perform finish-sync all-reduce every 16 steps.
|
||||
# Optimization - only perform finish-sync all-reduce every 24 steps.
|
||||
self.counter += 1
|
||||
if self.counter != 16:
|
||||
if self.counter != 24:
|
||||
return True
|
||||
self.counter = 0
|
||||
|
||||
|
||||
Reference in New Issue
Block a user