[Core] Shut down aDAG workers with clean async llm engine exit (#7224)

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
2024-08-12 17:57:16 -07:00
parent 774cd1d3bf
commit 198d6a2898
5 changed files with 40 additions and 25 deletions
--- a/vllm/executor/ray_gpu_executor.py
+++ b/vllm/executor/ray_gpu_executor.py
@@ -60,6 +60,14 @@ class RayGPUExecutor(DistributedGPUExecutor):
        # Create the parallel GPU workers.
        self._init_workers_ray(placement_group)

+    def shutdown(self) -> None:
+        if hasattr(self, "forward_dag") and self.forward_dag is not None:
+            self.forward_dag.teardown()
+            import ray
+            for worker in self.workers:
+                ray.kill(worker)
+            self.forward_dag = None
+
    def _configure_ray_workers_use_nsight(self,
                                          ray_remote_kwargs) -> Dict[str, Any]:
        # If nsight profiling is enabled, we need to set the profiling
@@ -117,7 +125,6 @@ class RayGPUExecutor(DistributedGPUExecutor):
        logger.info("use_ray_spmd_worker: %s", self.use_ray_spmd_worker)
        # Create the workers.
        driver_ip = get_ip()
-        logger.info("driver_ip: %s", driver_ip)
        worker_wrapper_kwargs = self._get_worker_wrapper_args()
        for bundle_id, bundle in enumerate(placement_group.bundle_specs):
            if not bundle.get("GPU", 0):
@@ -446,11 +453,7 @@ class RayGPUExecutor(DistributedGPUExecutor):
        return forward_dag.experimental_compile(enable_asyncio=enable_asyncio)

    def __del__(self):
-        if self.forward_dag is not None:
-            self.forward_dag.teardown()
-            import ray
-            for worker in self.workers:
-                ray.kill(worker)
+        self.shutdown()


 class RayGPUExecutorAsync(RayGPUExecutor, DistributedGPUExecutorAsync):
@@ -523,8 +526,4 @@ class RayGPUExecutorAsync(RayGPUExecutor, DistributedGPUExecutorAsync):
        return await asyncio.gather(*coros)

    def __del__(self):
-        if self.forward_dag is not None:
-            self.forward_dag.teardown()
-            import ray
-            for worker in self.workers:
-                ray.kill(worker)
+        self.shutdown()