[Core] Remove busy loop from idle buffer readers (#28053)

Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
This commit is contained in:
Joe Runde
2026-03-04 00:44:20 -07:00
committed by GitHub
parent 5d199ac8f2
commit 6f0dd93801
5 changed files with 579 additions and 135 deletions

View File

@@ -104,7 +104,6 @@ class MultiprocExecutor(Executor):
# and ensure workers will be terminated.
self._finalizer = weakref.finalize(self, self.shutdown)
self.is_failed = False
self.shutdown_event = threading.Event()
self.failure_callback: FailureCallback | None = None
tp_size, pp_size, pcp_size = self._get_parallel_sizes()
@@ -158,20 +157,31 @@ class MultiprocExecutor(Executor):
global_start_rank = (
self.local_world_size * self.parallel_config.node_rank_within_dp
)
# Keep track of socket file descriptors that are inherited by the
# worker when using fork, so that we can close them in subsequent
# workers
inherited_fds: list[int] = []
for local_rank in range(self.local_world_size):
global_rank = global_start_rank + local_rank
is_driver_worker = self._is_driver_worker(global_rank)
unready_workers.append(
WorkerProc.make_worker_process(
vllm_config=self.vllm_config,
local_rank=local_rank,
rank=global_rank,
distributed_init_method=distributed_init_method,
input_shm_handle=scheduler_output_handle,
shared_worker_lock=shared_worker_lock,
is_driver_worker=is_driver_worker,
)
unready_worker_handle = WorkerProc.make_worker_process(
vllm_config=self.vllm_config,
local_rank=local_rank,
rank=global_rank,
distributed_init_method=distributed_init_method,
input_shm_handle=scheduler_output_handle,
shared_worker_lock=shared_worker_lock,
is_driver_worker=is_driver_worker,
inherited_fds=inherited_fds,
)
unready_workers.append(unready_worker_handle)
if context.get_start_method() == "fork":
inherited_fds.extend(
[
unready_worker_handle.death_writer.fileno(),
unready_worker_handle.ready_pipe.fileno(),
]
)
# Workers must be created before wait_for_ready to avoid
# deadlock, since worker.init_device() does a device sync.
@@ -220,6 +230,7 @@ class MultiprocExecutor(Executor):
for uw in unready_workers:
if uw.death_writer is not None:
uw.death_writer.close()
uw.death_writer = None
self._ensure_worker_termination([uw.proc for uw in unready_workers])
self.output_rank = self._get_output_rank()
@@ -255,6 +266,7 @@ class MultiprocExecutor(Executor):
died = multiprocessing.connection.wait(sentinels)
_self = self_ref()
if not _self or getattr(_self, "shutting_down", False):
logger.debug("MultiprocWorkerMonitor: shutdown already initiated")
return
_self.is_failed = True
proc_name = next(h.proc.name for h in workers if h.proc.sentinel == died[0])
@@ -354,8 +366,6 @@ class MultiprocExecutor(Executor):
if output_rank is not None:
response_mqs = (response_mqs[output_rank],)
shutdown_event = self.shutdown_event
def get_response():
responses = []
for mq in response_mqs:
@@ -363,9 +373,7 @@ class MultiprocExecutor(Executor):
None if deadline is None else (deadline - time.monotonic())
)
try:
status, result = mq.dequeue(
timeout=dequeue_timeout, cancel=shutdown_event
)
status, result = mq.dequeue(timeout=dequeue_timeout)
except TimeoutError as e:
raise TimeoutError(f"RPC call to {method} timed out.") from e
if status != WorkerProc.ResponseStatus.SUCCESS:
@@ -408,20 +416,26 @@ class MultiprocExecutor(Executor):
active_procs = lambda: [proc for proc in worker_procs if proc.is_alive()]
# Give processes time to clean themselves up properly first
logger.debug("Worker Termination: allow workers to gracefully shutdown")
if wait_for_termination(active_procs(), 4):
return
# Send SIGTERM if still running
logger.debug("Worker Termination: workers still running sending SIGTERM")
for p in active_procs():
p.terminate()
if not wait_for_termination(active_procs(), 4):
# Send SIGKILL if still running
logger.debug(
"Worker Termination: resorting to SIGKILL to take down workers"
)
for p in active_procs():
p.kill()
def shutdown(self):
"""Properly shut down the executor and its workers"""
if not getattr(self, "shutting_down", False):
logger.debug("Triggering shutdown of workers")
self.shutting_down = True
# Make sure all the worker processes are terminated first.
@@ -431,12 +445,20 @@ class MultiprocExecutor(Executor):
if w.death_writer is not None:
w.death_writer.close()
w.death_writer = None
w.worker_response_mq = None
self._ensure_worker_termination([w.proc for w in workers])
self.shutdown_event.set()
for w in workers:
# Shutdown response queues
if w.worker_response_mq is not None:
w.worker_response_mq.shutdown()
w.worker_response_mq = None
self.rpc_broadcast_mq = None
if self.rpc_broadcast_mq is not None:
self.rpc_broadcast_mq.shutdown()
self.rpc_broadcast_mq = None
for mq in self.response_mqs:
mq.shutdown()
self.response_mqs = []
def check_health(self) -> None:
self.collective_rpc("check_health", timeout=10)
@@ -609,24 +631,26 @@ class WorkerProc:
input_shm_handle, # Receive SchedulerOutput
shared_worker_lock: LockType,
is_driver_worker: bool,
inherited_fds: list[int],
) -> UnreadyWorkerProcHandle:
context = get_mp_context()
# (reader, writer)
reader, writer = context.Pipe(duplex=False)
# Create death pipe to detect parent process exit
# Ready pipe to communicate readiness from child to parent
ready_reader, ready_writer = context.Pipe(duplex=False)
# Death pipe to let child detect parent process exit
death_reader, death_writer = context.Pipe(duplex=False)
process_kwargs = {
"vllm_config": vllm_config,
"local_rank": local_rank,
"rank": rank,
"distributed_init_method": distributed_init_method,
"input_shm_handle": input_shm_handle,
"ready_pipe": (reader, writer),
"ready_pipe": ready_writer,
"death_pipe": death_reader,
"shared_worker_lock": shared_worker_lock,
"is_driver_worker": is_driver_worker,
# Have the worker close parent end of this worker's pipes too
"inherited_fds": inherited_fds
+ [ready_reader.fileno(), death_writer.fileno()],
}
# Run EngineCore busy loop in background process.
proc = context.Process(
@@ -637,10 +661,12 @@ class WorkerProc:
)
proc.start()
writer.close()
# Close child ends of pipes here in the parent
ready_writer.close()
death_reader.close()
# Keep death_writer open in parent - when parent exits,
# death_reader in child will get EOFError
return UnreadyWorkerProcHandle(proc, rank, reader, death_writer)
return UnreadyWorkerProcHandle(proc, rank, ready_reader, death_writer)
@staticmethod
def wait_for_response_handle_ready(
@@ -703,12 +729,41 @@ class WorkerProc:
return cast(list[WorkerProcHandle], ready_proc_handles)
def shutdown(self):
if self.rpc_broadcast_mq is not None:
self.rpc_broadcast_mq.shutdown()
if self.worker_response_mq is not None:
self.worker_response_mq.shutdown()
self.worker.shutdown()
self.rpc_broadcast_mq = None
self.worker_response_mq = None
destroy_model_parallel()
destroy_distributed_environment()
def monitor_death_pipe(self, death_pipe, shutdown_requested: threading.Event):
if death_pipe is None:
return
def death_pipe_monitor(queues_to_shutdown: list[MessageQueue]):
try:
# This will block until parent process exits (pipe closes)
death_pipe.recv()
except EOFError:
logger.info_once("Parent process exited, terminating worker queues")
shutdown_requested.set()
for mq in queues_to_shutdown:
if mq is not None:
mq.shutdown()
except Exception as e:
logger.warning("Death monitoring error: %s", e)
# Pass queue references directly to avoid gc issues if passing self
Thread(
target=death_pipe_monitor,
args=([self.rpc_broadcast_mq, self.worker_response_mq],),
daemon=True,
name="DeathPipeMonitor",
).start()
@staticmethod
def worker_main(*args, **kwargs):
"""Worker initialization and execution loops.
@@ -717,12 +772,12 @@ class WorkerProc:
# Signal handler used for graceful termination.
# SystemExit exception is only raised once to allow this and worker
# processes to terminate without error
shutdown_requested = False
shutdown_requested = threading.Event()
def signal_handler(signum, frame):
nonlocal shutdown_requested
if not shutdown_requested:
shutdown_requested = True
if not shutdown_requested.is_set():
shutdown_requested.set()
logger.debug(
"WorkerProc handling signal %d, raising SystemExit", signum
)
@@ -733,33 +788,20 @@ class WorkerProc:
signal.signal(signal.SIGINT, signal_handler)
worker = None
# tuple[Connection, Connection]
reader, ready_writer = kwargs.pop("ready_pipe")
death_pipe: Connection | None = kwargs.pop("death_pipe", None)
shutdown_event = threading.Event()
# Start death monitoring thread if death_pipe is provided
if death_pipe is not None:
ready_writer = kwargs.pop("ready_pipe")
death_pipe = kwargs.pop("death_pipe", None)
def monitor_parent_death():
try:
# This will block until parent process exits (pipe closes)
death_pipe.recv()
except EOFError:
# Parent process has exited, terminate this worker
logger.info_once("Parent process exited, terminating worker")
# Send signal to self to trigger clean shutdown
shutdown_event.set()
except Exception as e:
logger.warning("Death monitoring error: %s", e)
death_monitor = Thread(
target=monitor_parent_death, daemon=True, name="WorkerDeathMonitor"
)
death_monitor.start()
# Close inherited pipes from parent (incl. other worker pipes)
# Explicitly passing in existing pipes and closing them makes the pipe
# behave when using fork. Otherwise, a hidden reference to the pipes
# exist in the child process and prevents EOF closure.
for fd in kwargs.pop("inherited_fds", []):
try:
os.close(fd)
except Exception as e:
logger.warning("Exception closing inherited connection: %s", e)
try:
reader.close()
# Initialize tracer
rank = kwargs.get("rank", 0)
maybe_init_worker_tracer(
@@ -771,6 +813,8 @@ class WorkerProc:
worker = WorkerProc(*args, **kwargs)
assert worker.worker_response_mq is not None
worker.monitor_death_pipe(death_pipe, shutdown_requested)
# Send READY once we know everything is loaded
ready_writer.send(
{
@@ -788,7 +832,7 @@ class WorkerProc:
ready_writer.close()
ready_writer = None
worker.worker_busy_loop(cancel=shutdown_event)
worker.worker_busy_loop()
except Exception:
# NOTE: if an Exception arises in busy_loop, we send
@@ -798,7 +842,7 @@ class WorkerProc:
if ready_writer is not None:
logger.exception("WorkerProc failed to start.")
elif shutdown_event.is_set():
elif shutdown_requested.is_set():
logger.info("WorkerProc shutting down.")
else:
logger.exception("WorkerProc failed.")
@@ -806,7 +850,7 @@ class WorkerProc:
# The parent sends a SIGTERM to all worker processes if
# any worker dies. Set this value so we don't re-throw
# SystemExit() to avoid zmq exceptions in __del__.
shutdown_requested = True
shutdown_requested.set()
except SystemExit as e:
# SystemExit is raised on SIGTERM or SIGKILL, which usually indicates that
@@ -859,12 +903,12 @@ class WorkerProc:
output = self.async_output_queue.get()
self.enqueue_output(output)
def worker_busy_loop(self, cancel: threading.Event | None = None):
def worker_busy_loop(self):
"""Main busy loop for Multiprocessing Workers"""
assert self.rpc_broadcast_mq is not None
while True:
method, args, kwargs, output_rank = self.rpc_broadcast_mq.dequeue(
cancel=cancel, indefinite=True
indefinite=True
)
try:
if isinstance(method, str):