[V1][DP] More robust DP/EP dummy request coordination (#16277)

Signed-off-by: Nick Hill <nhill@redhat.com>
This commit is contained in:
Nick Hill
2025-04-22 19:12:15 -07:00
committed by GitHub
parent bc7c4d206b
commit 1e013fa388
4 changed files with 94 additions and 57 deletions

View File

@@ -325,7 +325,7 @@ class EngineCoreProc(EngineCore):
self.step_fn = (self.step if self.batch_queue is None else
self.step_with_batch_queue)
self.global_unfinished_reqs = False
self.engines_running = False
# Background Threads and Queues for IO. These enable us to
# overlap ZMQ socket IO with GPU since they release the GIL,
@@ -410,8 +410,7 @@ class EngineCoreProc(EngineCore):
"""Exits when an engine step needs to be performed."""
waited = False
while not self.global_unfinished_reqs and not (
self.scheduler.has_requests()):
while not self.engines_running and not (self.scheduler.has_requests()):
if logger.isEnabledFor(DEBUG) and self.input_queue.empty():
logger.debug("EngineCore waiting for work.")
waited = True
@@ -419,10 +418,7 @@ class EngineCoreProc(EngineCore):
self._handle_client_request(*req)
if waited:
logger.debug(
"EngineCore loop active - local unfinished: %s, finished: %s.",
self.scheduler.has_unfinished_requests(),
self.scheduler.has_finished_requests())
logger.debug("EngineCore loop active.")
# Handle any more client requests.
while not self.input_queue.empty():
@@ -446,10 +442,6 @@ class EngineCoreProc(EngineCore):
self.add_request(request)
elif request_type == EngineCoreRequestType.ABORT:
self.abort_requests(request)
elif request_type == EngineCoreRequestType.START_DP:
if not self.global_unfinished_reqs:
logger.debug("EngineCore starting idle loop.")
self.global_unfinished_reqs = True
elif request_type == EngineCoreRequestType.UTILITY:
call_id, method_name, args = request
output = UtilityOutput(call_id)
@@ -548,9 +540,6 @@ class EngineCoreProc(EngineCore):
socket.send_multipart(buffers, copy=False)
ENGINE_PAUSED_OUTPUTS = EngineCoreOutputs(engine_paused=True)
class DPEngineCoreProc(EngineCoreProc):
"""ZMQ-wrapper for running EngineCore in background process
in a data parallel context."""
@@ -587,7 +576,9 @@ class DPEngineCoreProc(EngineCoreProc):
for i in range(local_dp_rank * tp_size, (local_dp_rank + 1) *
tp_size))
self.local_dp_rank = local_dp_rank
self.dp_group = vllm_config.parallel_config.stateless_init_dp_group()
self.current_wave = 0
# Initialize the engine after setting up environment.
super().__init__(input_path, output_path, vllm_config, executor_class,
@@ -602,6 +593,31 @@ class DPEngineCoreProc(EngineCoreProc):
if dp_group := getattr(self, "dp_group", None):
stateless_destroy_torch_distributed_process_group(dp_group)
def add_request(self, request: EngineCoreRequest):
    """Reconcile the request's wave number with ours, then queue it.

    Three cases, keyed on ``request.current_wave`` versus
    ``self.current_wave``:

    * request is ahead: adopt the request's wave number;
    * request is behind and the engines are not running: put an
      ``EngineCoreOutputs(start_wave=...)`` on ``self.output_queue`` so
      the front-end knows the next wave has to be started;
    * otherwise: no wave bookkeeping is needed.

    In every case the request is then handed to the parent class.
    """
    request_wave = request.current_wave
    if request_wave > self.current_wave:
        # The sender already knows about a newer wave; catch up to it.
        self.current_wave = request_wave
    elif request_wave < self.current_wave and not self.engines_running:
        # Request received for an already-completed wave, notify
        # front-end that we need to start the next one.
        start_notice = EngineCoreOutputs(start_wave=self.current_wave)
        self.output_queue.put_nowait(start_notice)

    super().add_request(request)
def _handle_client_request(self, request_type: EngineCoreRequestType,
                           request: Any) -> None:
    """Handle ``START_DP_WAVE`` locally; delegate every other request type.

    For ``START_DP_WAVE`` the request payload is the wave number. A wave
    number lower than ``self.current_wave`` is ignored. Otherwise it
    becomes the current wave and, if the engines were not already running,
    ``self.engines_running`` is set to True.
    """
    if request_type != EngineCoreRequestType.START_DP_WAVE:
        # Not a wave-start message: let the parent class deal with it.
        super()._handle_client_request(request_type, request)
        return

    new_wave: int = request
    if new_wave < self.current_wave:
        # Refers to a wave we have already moved past; nothing to do.
        return

    self.current_wave = new_wave
    if self.engines_running:
        # Already running; only the wave number needed updating.
        return

    logger.debug("EngineCore starting idle loop for wave %d.", new_wave)
    self.engines_running = True
def run_busy_loop(self):
"""Core busy loop of the EngineCore for data parallel case."""
@@ -628,7 +644,7 @@ class DPEngineCoreProc(EngineCoreProc):
# up-to-date state is returned in the engine outputs.
self._process_engine_step()
if not self.global_unfinished_reqs:
if not self.engines_running:
# All engines are idle.
continue
@@ -637,18 +653,23 @@ class DPEngineCoreProc(EngineCoreProc):
self.execute_dummy_batch()
# 3) All-reduce operation to determine global unfinished reqs.
self.global_unfinished_reqs = self._has_global_unfinished_reqs(
self.engines_running = self._has_global_unfinished_reqs(
local_unfinished_reqs)
if not self.global_unfinished_reqs:
# Notify client that we are pausing the loop.
self.output_queue.put_nowait(ENGINE_PAUSED_OUTPUTS)
if not self.engines_running:
if self.local_dp_rank == 0:
# Notify client that we are pausing the loop.
logger.debug("Wave %d finished, pausing engine loop.",
self.current_wave)
self.output_queue.put_nowait(
EngineCoreOutputs(wave_complete=self.current_wave))
self.current_wave += 1
def _has_global_unfinished_reqs(self, local_unfinished: bool) -> bool:
# Optimization - only perform finish-sync all-reduce every 16 steps.
# Optimization - only perform finish-sync all-reduce every 24 steps.
self.counter += 1
if self.counter != 16:
if self.counter != 24:
return True
self.counter = 0