[Core] Remove busy loop from idle buffer readers (#28053)

Signed-off-by: Joe Runde <Joseph.Runde@ibm.com> Signed-off-by: Travis Johnson <tsjohnso@us.ibm.com> Signed-off-by: Nick Hill <nickhill123@gmail.com> Co-authored-by: Travis Johnson <tsjohnso@us.ibm.com> Co-authored-by: Nick Hill <nickhill123@gmail.com>
2026-03-04 00:44:20 -07:00
parent 5d199ac8f2
commit 6f0dd93801
5 changed files with 579 additions and 135 deletions
--- a/vllm/v1/executor/multiproc_executor.py
+++ b/vllm/v1/executor/multiproc_executor.py
@@ -104,7 +104,6 @@ class MultiprocExecutor(Executor):
        # and ensure workers will be terminated.
        self._finalizer = weakref.finalize(self, self.shutdown)
        self.is_failed = False
-        self.shutdown_event = threading.Event()
        self.failure_callback: FailureCallback | None = None

        tp_size, pp_size, pcp_size = self._get_parallel_sizes()
@@ -158,20 +157,31 @@ class MultiprocExecutor(Executor):
            global_start_rank = (
                self.local_world_size * self.parallel_config.node_rank_within_dp
            )
+            # Keep track of socket file descriptors that are inherited by the
+            # worker when using fork, so that we can close them in subsequent
+            # workers
+            inherited_fds: list[int] = []
            for local_rank in range(self.local_world_size):
                global_rank = global_start_rank + local_rank
                is_driver_worker = self._is_driver_worker(global_rank)
-                unready_workers.append(
-                    WorkerProc.make_worker_process(
-                        vllm_config=self.vllm_config,
-                        local_rank=local_rank,
-                        rank=global_rank,
-                        distributed_init_method=distributed_init_method,
-                        input_shm_handle=scheduler_output_handle,
-                        shared_worker_lock=shared_worker_lock,
-                        is_driver_worker=is_driver_worker,
-                    )
+                unready_worker_handle = WorkerProc.make_worker_process(
+                    vllm_config=self.vllm_config,
+                    local_rank=local_rank,
+                    rank=global_rank,
+                    distributed_init_method=distributed_init_method,
+                    input_shm_handle=scheduler_output_handle,
+                    shared_worker_lock=shared_worker_lock,
+                    is_driver_worker=is_driver_worker,
+                    inherited_fds=inherited_fds,
                )
+                unready_workers.append(unready_worker_handle)
+                if context.get_start_method() == "fork":
+                    inherited_fds.extend(
+                        [
+                            unready_worker_handle.death_writer.fileno(),
+                            unready_worker_handle.ready_pipe.fileno(),
+                        ]
+                    )

            # Workers must be created before wait_for_ready to avoid
            # deadlock, since worker.init_device() does a device sync.
@@ -220,6 +230,7 @@ class MultiprocExecutor(Executor):
                for uw in unready_workers:
                    if uw.death_writer is not None:
                        uw.death_writer.close()
+                        uw.death_writer = None
                self._ensure_worker_termination([uw.proc for uw in unready_workers])

        self.output_rank = self._get_output_rank()
@@ -255,6 +266,7 @@ class MultiprocExecutor(Executor):
            died = multiprocessing.connection.wait(sentinels)
            _self = self_ref()
            if not _self or getattr(_self, "shutting_down", False):
+                logger.debug("MultiprocWorkerMonitor: shutdown already initiated")
                return
            _self.is_failed = True
            proc_name = next(h.proc.name for h in workers if h.proc.sentinel == died[0])
@@ -354,8 +366,6 @@ class MultiprocExecutor(Executor):
        if output_rank is not None:
            response_mqs = (response_mqs[output_rank],)

-        shutdown_event = self.shutdown_event
-
        def get_response():
            responses = []
            for mq in response_mqs:
@@ -363,9 +373,7 @@ class MultiprocExecutor(Executor):
                    None if deadline is None else (deadline - time.monotonic())
                )
                try:
-                    status, result = mq.dequeue(
-                        timeout=dequeue_timeout, cancel=shutdown_event
-                    )
+                    status, result = mq.dequeue(timeout=dequeue_timeout)
                except TimeoutError as e:
                    raise TimeoutError(f"RPC call to {method} timed out.") from e
                if status != WorkerProc.ResponseStatus.SUCCESS:
@@ -408,20 +416,26 @@ class MultiprocExecutor(Executor):

        active_procs = lambda: [proc for proc in worker_procs if proc.is_alive()]
        # Give processes time to clean themselves up properly first
+        logger.debug("Worker Termination: allow workers to gracefully shutdown")
        if wait_for_termination(active_procs(), 4):
            return

        # Send SIGTERM if still running
+        logger.debug("Worker Termination: workers still running sending SIGTERM")
        for p in active_procs():
            p.terminate()
        if not wait_for_termination(active_procs(), 4):
            # Send SIGKILL if still running
+            logger.debug(
+                "Worker Termination: resorting to SIGKILL to take down workers"
+            )
            for p in active_procs():
                p.kill()

    def shutdown(self):
        """Properly shut down the executor and its workers"""
        if not getattr(self, "shutting_down", False):
+            logger.debug("Triggering shutdown of workers")
            self.shutting_down = True

            # Make sure all the worker processes are terminated first.
@@ -431,12 +445,20 @@ class MultiprocExecutor(Executor):
                    if w.death_writer is not None:
                        w.death_writer.close()
                        w.death_writer = None
-                    w.worker_response_mq = None
                self._ensure_worker_termination([w.proc for w in workers])

-            self.shutdown_event.set()
+                for w in workers:
+                    # Shutdown response queues
+                    if w.worker_response_mq is not None:
+                        w.worker_response_mq.shutdown()
+                        w.worker_response_mq = None

-        self.rpc_broadcast_mq = None
+        if self.rpc_broadcast_mq is not None:
+            self.rpc_broadcast_mq.shutdown()
+            self.rpc_broadcast_mq = None
+        for mq in self.response_mqs:
+            mq.shutdown()
+        self.response_mqs = []

    def check_health(self) -> None:
        self.collective_rpc("check_health", timeout=10)
@@ -609,24 +631,26 @@ class WorkerProc:
        input_shm_handle,  # Receive SchedulerOutput
        shared_worker_lock: LockType,
        is_driver_worker: bool,
+        inherited_fds: list[int],
    ) -> UnreadyWorkerProcHandle:
        context = get_mp_context()
-        # (reader, writer)
-        reader, writer = context.Pipe(duplex=False)
-
-        # Create death pipe to detect parent process exit
+        # Ready pipe to communicate readiness from child to parent
+        ready_reader, ready_writer = context.Pipe(duplex=False)
+        # Death pipe to let child detect parent process exit
        death_reader, death_writer = context.Pipe(duplex=False)
-
        process_kwargs = {
            "vllm_config": vllm_config,
            "local_rank": local_rank,
            "rank": rank,
            "distributed_init_method": distributed_init_method,
            "input_shm_handle": input_shm_handle,
-            "ready_pipe": (reader, writer),
+            "ready_pipe": ready_writer,
            "death_pipe": death_reader,
            "shared_worker_lock": shared_worker_lock,
            "is_driver_worker": is_driver_worker,
+            # Have the worker close parent end of this worker's pipes too
+            "inherited_fds": inherited_fds
+            + [ready_reader.fileno(), death_writer.fileno()],
        }
        # Run EngineCore busy loop in background process.
        proc = context.Process(
@@ -637,10 +661,12 @@ class WorkerProc:
        )

        proc.start()
-        writer.close()
+        # Close child ends of pipes here in the parent
+        ready_writer.close()
+        death_reader.close()
        # Keep death_writer open in parent - when parent exits,
        # death_reader in child will get EOFError
-        return UnreadyWorkerProcHandle(proc, rank, reader, death_writer)
+        return UnreadyWorkerProcHandle(proc, rank, ready_reader, death_writer)

    @staticmethod
    def wait_for_response_handle_ready(
@@ -703,12 +729,41 @@ class WorkerProc:
        return cast(list[WorkerProcHandle], ready_proc_handles)

    def shutdown(self):
+        if self.rpc_broadcast_mq is not None:
+            self.rpc_broadcast_mq.shutdown()
+        if self.worker_response_mq is not None:
+            self.worker_response_mq.shutdown()
        self.worker.shutdown()
        self.rpc_broadcast_mq = None
        self.worker_response_mq = None
        destroy_model_parallel()
        destroy_distributed_environment()

+    def monitor_death_pipe(self, death_pipe, shutdown_requested: threading.Event):
+        if death_pipe is None:
+            return
+
+        def death_pipe_monitor(queues_to_shutdown: list[MessageQueue]):
+            try:
+                # This will block until parent process exits (pipe closes)
+                death_pipe.recv()
+            except EOFError:
+                logger.info_once("Parent process exited, terminating worker queues")
+                shutdown_requested.set()
+                for mq in queues_to_shutdown:
+                    if mq is not None:
+                        mq.shutdown()
+            except Exception as e:
+                logger.warning("Death monitoring error: %s", e)
+
+        # Pass queue references directly to avoid gc issues if passing self
+        Thread(
+            target=death_pipe_monitor,
+            args=([self.rpc_broadcast_mq, self.worker_response_mq],),
+            daemon=True,
+            name="DeathPipeMonitor",
+        ).start()
+
    @staticmethod
    def worker_main(*args, **kwargs):
        """Worker initialization and execution loops.
@@ -717,12 +772,12 @@ class WorkerProc:
        # Signal handler used for graceful termination.
        # SystemExit exception is only raised once to allow this and worker
        # processes to terminate without error
-        shutdown_requested = False
+        shutdown_requested = threading.Event()

        def signal_handler(signum, frame):
            nonlocal shutdown_requested
-            if not shutdown_requested:
-                shutdown_requested = True
+            if not shutdown_requested.is_set():
+                shutdown_requested.set()
                logger.debug(
                    "WorkerProc handling signal %d, raising SystemExit", signum
                )
@@ -733,33 +788,20 @@ class WorkerProc:
        signal.signal(signal.SIGINT, signal_handler)

        worker = None
-        # tuple[Connection, Connection]
-        reader, ready_writer = kwargs.pop("ready_pipe")
-        death_pipe: Connection | None = kwargs.pop("death_pipe", None)
-        shutdown_event = threading.Event()
-        # Start death monitoring thread if death_pipe is provided
-        if death_pipe is not None:
+        ready_writer = kwargs.pop("ready_pipe")
+        death_pipe = kwargs.pop("death_pipe", None)

-            def monitor_parent_death():
-                try:
-                    # This will block until parent process exits (pipe closes)
-                    death_pipe.recv()
-                except EOFError:
-                    # Parent process has exited, terminate this worker
-                    logger.info_once("Parent process exited, terminating worker")
-                    # Send signal to self to trigger clean shutdown
-                    shutdown_event.set()
-                except Exception as e:
-                    logger.warning("Death monitoring error: %s", e)
-
-            death_monitor = Thread(
-                target=monitor_parent_death, daemon=True, name="WorkerDeathMonitor"
-            )
-            death_monitor.start()
+        # Close inherited pipes from parent (incl. other worker pipes)
+        # Explicitly passing in existing pipes and closing them makes the pipe
+        # behave when using fork. Otherwise, a hidden reference to the pipes
+        # exist in the child process and prevents EOF closure.
+        for fd in kwargs.pop("inherited_fds", []):
+            try:
+                os.close(fd)
+            except Exception as e:
+                logger.warning("Exception closing inherited connection: %s", e)

        try:
-            reader.close()
-
            # Initialize tracer
            rank = kwargs.get("rank", 0)
            maybe_init_worker_tracer(
@@ -771,6 +813,8 @@ class WorkerProc:
            worker = WorkerProc(*args, **kwargs)
            assert worker.worker_response_mq is not None

+            worker.monitor_death_pipe(death_pipe, shutdown_requested)
+
            # Send READY once we know everything is loaded
            ready_writer.send(
                {
@@ -788,7 +832,7 @@ class WorkerProc:
            ready_writer.close()
            ready_writer = None

-            worker.worker_busy_loop(cancel=shutdown_event)
+            worker.worker_busy_loop()

        except Exception:
            # NOTE: if an Exception arises in busy_loop, we send
@@ -798,7 +842,7 @@ class WorkerProc:

            if ready_writer is not None:
                logger.exception("WorkerProc failed to start.")
-            elif shutdown_event.is_set():
+            elif shutdown_requested.is_set():
                logger.info("WorkerProc shutting down.")
            else:
                logger.exception("WorkerProc failed.")
@@ -806,7 +850,7 @@ class WorkerProc:
            # The parent sends a SIGTERM to all worker processes if
            # any worker dies. Set this value so we don't re-throw
            # SystemExit() to avoid zmq exceptions in __del__.
-            shutdown_requested = True
+            shutdown_requested.set()

        except SystemExit as e:
            # SystemExit is raised on SIGTERM or SIGKILL, which usually indicates that
@@ -859,12 +903,12 @@ class WorkerProc:
            output = self.async_output_queue.get()
            self.enqueue_output(output)

-    def worker_busy_loop(self, cancel: threading.Event | None = None):
+    def worker_busy_loop(self):
        """Main busy loop for Multiprocessing Workers"""
        assert self.rpc_broadcast_mq is not None
        while True:
            method, args, kwargs, output_rank = self.rpc_broadcast_mq.dequeue(
-                cancel=cancel, indefinite=True
+                indefinite=True
            )
            try:
                if isinstance(method, str):