[Core] Remove busy loop from idle buffer readers (#28053)
Signed-off-by: Joe Runde <Joseph.Runde@ibm.com>
Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com>
Signed-off-by: Nick Hill <nickhill123@gmail.com>
Co-authored-by: Travis Johnson <tsjohnso@us.ibm.com>
Co-authored-by: Nick Hill <nickhill123@gmail.com>
@@ -104,7 +104,6 @@ class MultiprocExecutor(Executor):
# and ensure workers will be terminated.
self._finalizer = weakref.finalize(self, self.shutdown)
self.is_failed = False
self.shutdown_event = threading.Event()
self.failure_callback: FailureCallback | None = None

tp_size, pp_size, pcp_size = self._get_parallel_sizes()
@@ -158,20 +157,31 @@ class MultiprocExecutor(Executor):
global_start_rank = (
self.local_world_size * self.parallel_config.node_rank_within_dp
)
# Keep track of socket file descriptors that are inherited by the
# worker when using fork, so that we can close them in subsequent
# workers
inherited_fds: list[int] = []
for local_rank in range(self.local_world_size):
global_rank = global_start_rank + local_rank
is_driver_worker = self._is_driver_worker(global_rank)
unready_workers.append(
WorkerProc.make_worker_process(
vllm_config=self.vllm_config,
local_rank=local_rank,
rank=global_rank,
distributed_init_method=distributed_init_method,
input_shm_handle=scheduler_output_handle,
shared_worker_lock=shared_worker_lock,
is_driver_worker=is_driver_worker,
)
unready_worker_handle = WorkerProc.make_worker_process(
vllm_config=self.vllm_config,
local_rank=local_rank,
rank=global_rank,
distributed_init_method=distributed_init_method,
input_shm_handle=scheduler_output_handle,
shared_worker_lock=shared_worker_lock,
is_driver_worker=is_driver_worker,
inherited_fds=inherited_fds,
)
unready_workers.append(unready_worker_handle)
if context.get_start_method() == "fork":
inherited_fds.extend(
[
unready_worker_handle.death_writer.fileno(),
unready_worker_handle.ready_pipe.fileno(),
]
)

# Workers must be created before wait_for_ready to avoid
# deadlock, since worker.init_device() does a device sync.
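Note on the fork-specific bookkeeping above: with the `fork` start method, every pipe already created for earlier workers is duplicated into each new child, so a stray copy of a write end can keep a pipe from ever reporting EOF. The `inherited_fds` list lets each child close those descriptors itself. Below is a minimal standalone sketch of that pattern, not vLLM code; the two-worker setup and names are made up for illustration.

```python
import multiprocessing as mp
import os


def child_main(reader, fds_to_close):
    # Close every write end this child inherited via fork: the parent's copy of
    # this child's writer and the writers of previously created children. If any
    # copy of a write end stays open in this process, recv() would never see EOF.
    for fd in fds_to_close:
        try:
            os.close(fd)
        except OSError:
            pass
    try:
        reader.recv()  # blocks until the last open write end is closed
    except EOFError:
        print(f"[{os.getpid()}] all writers closed, exiting")


if __name__ == "__main__":
    ctx = mp.get_context("fork")
    earlier_writer_fds: list[int] = []
    children = []
    for _ in range(2):
        reader, writer = ctx.Pipe(duplex=False)
        fds_to_close = earlier_writer_fds + [writer.fileno()]
        proc = ctx.Process(target=child_main, args=(reader, fds_to_close))
        proc.start()
        reader.close()  # parent keeps only the write end
        earlier_writer_fds.append(writer.fileno())
        children.append((proc, writer))
    for _, writer in children:
        writer.close()  # each child now observes EOF and exits
    for proc, _ in children:
        proc.join()
```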
@@ -220,6 +230,7 @@ class MultiprocExecutor(Executor):
for uw in unready_workers:
if uw.death_writer is not None:
uw.death_writer.close()
uw.death_writer = None
self._ensure_worker_termination([uw.proc for uw in unready_workers])

self.output_rank = self._get_output_rank()
@@ -255,6 +266,7 @@ class MultiprocExecutor(Executor):
died = multiprocessing.connection.wait(sentinels)
_self = self_ref()
if not _self or getattr(_self, "shutting_down", False):
logger.debug("MultiprocWorkerMonitor: shutdown already initiated")
return
_self.is_failed = True
proc_name = next(h.proc.name for h in workers if h.proc.sentinel == died[0])
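The worker monitor above blocks on the worker processes' sentinels via `multiprocessing.connection.wait` rather than polling. A small standalone sketch of that pattern (illustrative only, not the vLLM monitor):

```python
import multiprocessing as mp
import multiprocessing.connection
import threading
import time


def worker(idx):
    time.sleep(idx + 1)  # pretend to do some work, then exit


if __name__ == "__main__":
    procs = [mp.Process(target=worker, args=(i,)) for i in range(3)]
    for p in procs:
        p.start()

    def monitor():
        sentinels = [p.sentinel for p in procs]
        # Blocks without polling until at least one process has exited.
        died = mp.connection.wait(sentinels)
        dead = [p.name for p in procs if p.sentinel in died]
        print("first worker(s) to exit:", dead)

    threading.Thread(target=monitor, daemon=True, name="WorkerMonitor").start()
    for p in procs:
        p.join()
```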
@@ -354,8 +366,6 @@ class MultiprocExecutor(Executor):
if output_rank is not None:
response_mqs = (response_mqs[output_rank],)

shutdown_event = self.shutdown_event

def get_response():
responses = []
for mq in response_mqs:
@@ -363,9 +373,7 @@ class MultiprocExecutor(Executor):
None if deadline is None else (deadline - time.monotonic())
)
try:
status, result = mq.dequeue(
timeout=dequeue_timeout, cancel=shutdown_event
)
status, result = mq.dequeue(timeout=dequeue_timeout)
except TimeoutError as e:
raise TimeoutError(f"RPC call to {method} timed out.") from e
if status != WorkerProc.ResponseStatus.SUCCESS:
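Each `dequeue` above derives its per-queue timeout from one absolute deadline, so the overall RPC timeout is shared across all response queues rather than applied per queue. A hedged sketch of that bookkeeping using plain `queue.Queue` (vLLM's `MessageQueue` API differs, and `collect_responses` is a made-up name):

```python
import queue
import time


def collect_responses(queues, timeout=None):
    """Dequeue one item from each queue under a single overall deadline."""
    deadline = None if timeout is None else time.monotonic() + timeout
    responses = []
    for q in queues:
        remaining = None if deadline is None else deadline - time.monotonic()
        if remaining is not None and remaining <= 0:
            raise TimeoutError("RPC deadline exceeded")
        try:
            responses.append(q.get(timeout=remaining))
        except queue.Empty as e:
            raise TimeoutError("RPC deadline exceeded") from e
    return responses
```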
@@ -408,20 +416,26 @@ class MultiprocExecutor(Executor):

active_procs = lambda: [proc for proc in worker_procs if proc.is_alive()]
# Give processes time to clean themselves up properly first
logger.debug("Worker Termination: allow workers to gracefully shutdown")
if wait_for_termination(active_procs(), 4):
return

# Send SIGTERM if still running
logger.debug("Worker Termination: workers still running sending SIGTERM")
for p in active_procs():
p.terminate()
if not wait_for_termination(active_procs(), 4):
# Send SIGKILL if still running
logger.debug(
"Worker Termination: resorting to SIGKILL to take down workers"
)
for p in active_procs():
p.kill()

def shutdown(self):
"""Properly shut down the executor and its workers"""
if not getattr(self, "shutting_down", False):
logger.debug("Triggering shutdown of workers")
self.shutting_down = True

# Make sure all the worker processes are terminated first.
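The termination hunk above keeps an escalation ladder: wait for a graceful exit, then SIGTERM, then SIGKILL. A standalone sketch of that ladder, assuming a simple `wait_for_termination` helper (both functions below are illustrative stand-ins, not the vLLM implementation):

```python
import time
from multiprocessing import Process


def wait_for_termination(procs, timeout):
    """Return True if every process in `procs` exits within `timeout` seconds."""
    deadline = time.monotonic() + timeout
    for p in procs:
        p.join(max(deadline - time.monotonic(), 0))
    return all(not p.is_alive() for p in procs)


def ensure_termination(procs: list[Process]):
    alive = lambda: [p for p in procs if p.is_alive()]
    # 1) Give workers a grace period to exit on their own.
    if wait_for_termination(alive(), 4):
        return
    # 2) Ask politely with SIGTERM.
    for p in alive():
        p.terminate()
    if not wait_for_termination(alive(), 4):
        # 3) Last resort: SIGKILL.
        for p in alive():
            p.kill()
```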
@@ -431,12 +445,20 @@ class MultiprocExecutor(Executor):
if w.death_writer is not None:
w.death_writer.close()
w.death_writer = None
w.worker_response_mq = None
self._ensure_worker_termination([w.proc for w in workers])

self.shutdown_event.set()
for w in workers:
# Shutdown response queues
if w.worker_response_mq is not None:
w.worker_response_mq.shutdown()
w.worker_response_mq = None

self.rpc_broadcast_mq = None
if self.rpc_broadcast_mq is not None:
self.rpc_broadcast_mq.shutdown()
self.rpc_broadcast_mq = None
for mq in self.response_mqs:
mq.shutdown()
self.response_mqs = []

def check_health(self) -> None:
self.collective_rpc("check_health", timeout=10)
@@ -609,24 +631,26 @@ class WorkerProc:
input_shm_handle, # Receive SchedulerOutput
shared_worker_lock: LockType,
is_driver_worker: bool,
inherited_fds: list[int],
) -> UnreadyWorkerProcHandle:
context = get_mp_context()
# (reader, writer)
reader, writer = context.Pipe(duplex=False)

# Create death pipe to detect parent process exit
# Ready pipe to communicate readiness from child to parent
ready_reader, ready_writer = context.Pipe(duplex=False)
# Death pipe to let child detect parent process exit
death_reader, death_writer = context.Pipe(duplex=False)

process_kwargs = {
"vllm_config": vllm_config,
"local_rank": local_rank,
"rank": rank,
"distributed_init_method": distributed_init_method,
"input_shm_handle": input_shm_handle,
"ready_pipe": (reader, writer),
"ready_pipe": ready_writer,
"death_pipe": death_reader,
"shared_worker_lock": shared_worker_lock,
"is_driver_worker": is_driver_worker,
# Have the worker close parent end of this worker's pipes too
"inherited_fds": inherited_fds
+ [ready_reader.fileno(), death_writer.fileno()],
}
# Run EngineCore busy loop in background process.
proc = context.Process(
@@ -637,10 +661,12 @@ class WorkerProc:
)

proc.start()
writer.close()
# Close child ends of pipes here in the parent
ready_writer.close()
death_reader.close()
# Keep death_writer open in parent - when parent exits,
# death_reader in child will get EOFError
return UnreadyWorkerProcHandle(proc, rank, reader, death_writer)
return UnreadyWorkerProcHandle(proc, rank, ready_reader, death_writer)

@staticmethod
def wait_for_response_handle_ready(
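After this change each worker gets two dedicated one-way pipes: a ready pipe the child writes to, and a death pipe whose write end stays open in the parent so the child's reader sees EOF when the parent exits. The parent drops the child-side ends right after `proc.start()`. A minimal sketch of that ownership split, with made-up names and a `spawn` context for simplicity:

```python
import multiprocessing as mp


def child_main(ready_writer, death_reader):
    ready_writer.send({"status": "READY"})  # tell the parent we are up
    ready_writer.close()
    try:
        death_reader.recv()  # blocks for the worker's lifetime
    except EOFError:
        pass  # parent exited or closed its writer; shut down


if __name__ == "__main__":
    ctx = mp.get_context("spawn")
    ready_reader, ready_writer = ctx.Pipe(duplex=False)
    death_reader, death_writer = ctx.Pipe(duplex=False)
    proc = ctx.Process(target=child_main, args=(ready_writer, death_reader))
    proc.start()
    # Parent keeps ready_reader (to wait for READY) and death_writer (closing
    # it, or dying, unblocks the child); it must drop the other two ends.
    ready_writer.close()
    death_reader.close()
    print(ready_reader.recv())  # {'status': 'READY'}
    death_writer.close()        # signal the child to exit
    proc.join()
```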
@@ -703,12 +729,41 @@ class WorkerProc:
return cast(list[WorkerProcHandle], ready_proc_handles)

def shutdown(self):
if self.rpc_broadcast_mq is not None:
self.rpc_broadcast_mq.shutdown()
if self.worker_response_mq is not None:
self.worker_response_mq.shutdown()
self.worker.shutdown()
self.rpc_broadcast_mq = None
self.worker_response_mq = None
destroy_model_parallel()
destroy_distributed_environment()

def monitor_death_pipe(self, death_pipe, shutdown_requested: threading.Event):
if death_pipe is None:
return

def death_pipe_monitor(queues_to_shutdown: list[MessageQueue]):
try:
# This will block until parent process exits (pipe closes)
death_pipe.recv()
except EOFError:
logger.info_once("Parent process exited, terminating worker queues")
shutdown_requested.set()
for mq in queues_to_shutdown:
if mq is not None:
mq.shutdown()
except Exception as e:
logger.warning("Death monitoring error: %s", e)

# Pass queue references directly to avoid gc issues if passing self
Thread(
target=death_pipe_monitor,
args=([self.rpc_broadcast_mq, self.worker_response_mq],),
daemon=True,
name="DeathPipeMonitor",
).start()

@staticmethod
def worker_main(*args, **kwargs):
"""Worker initialization and execution loops.
@@ -717,12 +772,12 @@ class WorkerProc:
# Signal handler used for graceful termination.
# SystemExit exception is only raised once to allow this and worker
# processes to terminate without error
shutdown_requested = False
shutdown_requested = threading.Event()

def signal_handler(signum, frame):
nonlocal shutdown_requested
if not shutdown_requested:
shutdown_requested = True
if not shutdown_requested.is_set():
shutdown_requested.set()
logger.debug(
"WorkerProc handling signal %d, raising SystemExit", signum
)
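Turning `shutdown_requested` into a `threading.Event` lets the signal handler and the death-pipe monitor share one flag, and lets later code test it with `is_set()` instead of rebinding a nonlocal bool. A small sketch of that pattern, with a placeholder work loop:

```python
import signal
import threading
import time

shutdown_requested = threading.Event()


def signal_handler(signum, frame):
    # Raise SystemExit only for the first signal; repeated signals are no-ops.
    if not shutdown_requested.is_set():
        shutdown_requested.set()
        raise SystemExit()


signal.signal(signal.SIGTERM, signal_handler)
signal.signal(signal.SIGINT, signal_handler)

try:
    while not shutdown_requested.is_set():
        time.sleep(0.1)  # placeholder for real work
except SystemExit:
    print("shutting down cleanly")
```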
@@ -733,33 +788,20 @@ class WorkerProc:
signal.signal(signal.SIGINT, signal_handler)

worker = None
# tuple[Connection, Connection]
reader, ready_writer = kwargs.pop("ready_pipe")
death_pipe: Connection | None = kwargs.pop("death_pipe", None)
shutdown_event = threading.Event()
# Start death monitoring thread if death_pipe is provided
if death_pipe is not None:
ready_writer = kwargs.pop("ready_pipe")
death_pipe = kwargs.pop("death_pipe", None)

def monitor_parent_death():
try:
# This will block until parent process exits (pipe closes)
death_pipe.recv()
except EOFError:
# Parent process has exited, terminate this worker
logger.info_once("Parent process exited, terminating worker")
# Send signal to self to trigger clean shutdown
shutdown_event.set()
except Exception as e:
logger.warning("Death monitoring error: %s", e)

death_monitor = Thread(
target=monitor_parent_death, daemon=True, name="WorkerDeathMonitor"
)
death_monitor.start()
# Close inherited pipes from parent (incl. other worker pipes)
# Explicitly passing in existing pipes and closing them makes the pipe
# behave when using fork. Otherwise, a hidden reference to the pipes
# exist in the child process and prevents EOF closure.
for fd in kwargs.pop("inherited_fds", []):
try:
os.close(fd)
except Exception as e:
logger.warning("Exception closing inherited connection: %s", e)

try:
reader.close()

# Initialize tracer
rank = kwargs.get("rank", 0)
maybe_init_worker_tracer(
@@ -771,6 +813,8 @@ class WorkerProc:
worker = WorkerProc(*args, **kwargs)
assert worker.worker_response_mq is not None

worker.monitor_death_pipe(death_pipe, shutdown_requested)

# Send READY once we know everything is loaded
ready_writer.send(
{
@@ -788,7 +832,7 @@ class WorkerProc:
ready_writer.close()
ready_writer = None

worker.worker_busy_loop(cancel=shutdown_event)
worker.worker_busy_loop()

except Exception:
# NOTE: if an Exception arises in busy_loop, we send
@@ -798,7 +842,7 @@ class WorkerProc:

if ready_writer is not None:
logger.exception("WorkerProc failed to start.")
elif shutdown_event.is_set():
elif shutdown_requested.is_set():
logger.info("WorkerProc shutting down.")
else:
logger.exception("WorkerProc failed.")
@@ -806,7 +850,7 @@ class WorkerProc:
# The parent sends a SIGTERM to all worker processes if
# any worker dies. Set this value so we don't re-throw
# SystemExit() to avoid zmq exceptions in __del__.
shutdown_requested = True
shutdown_requested.set()

except SystemExit as e:
# SystemExit is raised on SIGTERM or SIGKILL, which usually indicates that
@@ -859,12 +903,12 @@ class WorkerProc:
output = self.async_output_queue.get()
self.enqueue_output(output)

def worker_busy_loop(self, cancel: threading.Event | None = None):
def worker_busy_loop(self):
"""Main busy loop for Multiprocessing Workers"""
assert self.rpc_broadcast_mq is not None
while True:
method, args, kwargs, output_rank = self.rpc_broadcast_mq.dequeue(
cancel=cancel, indefinite=True
indefinite=True
)
try:
if isinstance(method, str):