[V1] Support DP with Ray (#18779)

Author: Rui Qiao
Date: 2025-06-02 21:15:13 -07:00
Committed by: GitHub
Parent: 9e6f61e8c3
Commit: bdce64f236
10 changed files with 551 additions and 120 deletions

vllm/v1/engine/core.py

@@ -6,8 +6,9 @@ import sys
 import threading
 import time
 from collections import deque
+from collections.abc import Generator
 from concurrent.futures import Future
-from contextlib import ExitStack
+from contextlib import ExitStack, contextmanager
 from inspect import isclass, signature
 from logging import DEBUG
 from typing import Any, Callable, Optional, TypeVar, Union
@@ -367,42 +368,66 @@ class EngineCoreProc(EngineCore):
         log_stats: bool,
         engine_index: int = 0,
     ):
-        input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]()
-        executor_fail_callback = lambda: input_queue.put_nowait(
+        self.input_queue = queue.Queue[tuple[EngineCoreRequestType, Any]]()
+        self.output_queue = queue.Queue[Union[tuple[int, EngineCoreOutputs],
+                                              bytes]]()
+        executor_fail_callback = lambda: self.input_queue.put_nowait(
             (EngineCoreRequestType.EXECUTOR_FAILED, b''))
 
-        # Create input socket.
+        self.engine_index = engine_index
+        identity = self.engine_index.to_bytes(length=2, byteorder="little")
+        self.engines_running = False
+
+        with self._perform_handshake(handshake_address, identity, on_head_node,
+                                     vllm_config) as addresses:
+            self.client_count = len(addresses.outputs)
+
+            # Set up data parallel environment.
+            self.has_coordinator = addresses.coordinator_output is not None
+            self._init_data_parallel(vllm_config)
+
+            super().__init__(vllm_config, executor_class, log_stats,
+                             executor_fail_callback)
+
+        self.step_fn = (self.step if self.batch_queue is None else
+                        self.step_with_batch_queue)
+
+        # Background Threads and Queues for IO. These enable us to
+        # overlap ZMQ socket IO with GPU since they release the GIL,
+        # and to overlap some serialization/deserialization with the
+        # model forward pass.
+        # Threads handle Socket <-> Queues and core_busy_loop uses Queue.
+        threading.Thread(target=self.process_input_sockets,
+                         args=(addresses.inputs, addresses.coordinator_input,
+                               identity),
+                         daemon=True).start()
+        self.output_thread = threading.Thread(
+            target=self.process_output_sockets,
+            args=(addresses.outputs, addresses.coordinator_output,
+                  self.engine_index),
+            daemon=True)
+        self.output_thread.start()
+
+    @contextmanager
+    def _perform_handshake(
+            self, handshake_address: str, identity: bytes, on_head_node: bool,
+            vllm_config: VllmConfig
+    ) -> Generator[EngineZmqAddresses, None, None]:
         input_ctx = zmq.Context()
-        identity = engine_index.to_bytes(length=2, byteorder="little")
         with make_zmq_socket(input_ctx,
                              handshake_address,
                              zmq.DEALER,
                              identity=identity,
                              linger=5000,
                              bind=False) as handshake_socket:
 
             # Register engine with front-end.
             addresses = self.startup_handshake(handshake_socket, on_head_node,
                                                vllm_config.parallel_config)
-            self.client_count = len(addresses.outputs)
 
-            # Update config which may have changed from the handshake.
+            # Update config which may have changed from the handshake
             vllm_config.__post_init__()
 
-            # Set up data parallel environment.
-            self.has_coordinator = addresses.coordinator_output is not None
-            self._init_data_parallel(vllm_config)
-
-            # Initialize engine core and model.
-            super().__init__(vllm_config, executor_class, log_stats,
-                             executor_fail_callback)
-
-            self.engine_index = engine_index
-            self.step_fn = (self.step if self.batch_queue is None else
-                            self.step_with_batch_queue)
-            self.engines_running = False
-            self.last_counts = (0, 0)
+            yield addresses
 
             # Send ready message.
             num_gpu_blocks = vllm_config.cache_config.num_gpu_blocks
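Worth spelling out about the refactor above: with `@contextmanager`, everything before the `yield` runs when the `with` block in `__init__` is entered, and everything after it runs only once the block body (the engine construction) has completed, so the READY message below goes out only after a successful init. A minimal sketch of that setup-then-signal shape, with stand-in names and payloads rather than the real vLLM types:

```python
from collections.abc import Generator
from contextlib import contextmanager


@contextmanager
def perform_handshake_sketch() -> Generator[dict, None, None]:
    # Entry: register with the front-end (stand-in for startup_handshake).
    addresses = {"inputs": ["tcp://a"], "outputs": ["tcp://b"]}
    yield addresses  # the `with` body (engine/model init) runs here
    # Exit: reached only if the body did not raise, so READY is accurate.
    print("READY")


with perform_handshake_sketch() as addrs:
    print("initializing engine with", addrs)
```

Because there is no try/finally around the `yield`, an exception during engine construction skips the post-yield code, which mirrors how the real handshake never reports READY for a failed engine.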
@@ -413,25 +438,6 @@ class EngineCoreProc(EngineCore):
"num_gpu_blocks": num_gpu_blocks,
}))
# Background Threads and Queues for IO. These enable us to
# overlap ZMQ socket IO with GPU since they release the GIL,
# and to overlap some serialization/deserialization with the
# model forward pass.
# Threads handle Socket <-> Queues and core_busy_loop uses Queue.
self.input_queue = input_queue
self.output_queue = queue.Queue[Union[tuple[int, EngineCoreOutputs],
bytes]]()
threading.Thread(target=self.process_input_sockets,
args=(addresses.inputs, addresses.coordinator_input,
identity),
daemon=True).start()
self.output_thread = threading.Thread(
target=self.process_output_sockets,
args=(addresses.outputs, addresses.coordinator_output,
engine_index),
daemon=True)
self.output_thread.start()
@staticmethod
def startup_handshake(
handshake_socket: zmq.Socket, on_head_node: bool,
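The handshake plumbing in these hunks is plain pyzmq: a DEALER socket whose 2-byte little-endian identity lets the front-end ROUTER tell engines apart, followed by a msgpack-encoded READY payload. A rough sketch of that wiring; the address and field values here are placeholders, not vLLM's exact wire format:

```python
import msgspec
import zmq

HANDSHAKE_ADDRESS = "tcp://127.0.0.1:5570"  # placeholder front-end address
ENGINE_INDEX = 0

ctx = zmq.Context()
sock = ctx.socket(zmq.DEALER)
# Same identity scheme as __init__ above: 2 bytes, little-endian.
sock.setsockopt(zmq.IDENTITY,
                ENGINE_INDEX.to_bytes(length=2, byteorder="little"))
sock.setsockopt(zmq.LINGER, 5000)
sock.connect(HANDSHAKE_ADDRESS)

# After model init, report readiness; the payload mirrors the READY
# message in the diff (block count is a placeholder).
sock.send(msgspec.msgpack.encode({
    "status": "READY",
    "local": True,
    "num_gpu_blocks": 8192,
}))
```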
@@ -743,6 +749,21 @@ class DPEngineCoreProc(EngineCoreProc):
         executor_class: type[Executor],
         log_stats: bool,
     ):
+        self._decorate_logs()
+
+        # Counts forward-passes of the model so that we can synchronize
+        # finished with DP peers every N steps.
+        self.counter = 0
+        self.current_wave = 0
+        self.last_counts = (0, 0)
+
+        # Initialize the engine.
+        dp_rank = vllm_config.parallel_config.data_parallel_rank
+        super().__init__(vllm_config, on_head_node, handshake_address,
+                         executor_class, log_stats, dp_rank)
+
+    def _decorate_logs(self):
         # Add process-specific prefix to stdout and stderr before
         # we initialize the engine.
         from multiprocessing import current_process
@@ -751,16 +772,6 @@ class DPEngineCoreProc(EngineCoreProc):
         _add_prefix(sys.stdout, process_name, pid)
         _add_prefix(sys.stderr, process_name, pid)
 
-        # Counts forward-passes of the model so that we can synchronize
-        # finished with DP peers every N steps.
-        self.counter = 0
-        self.current_wave = 0
-
-        # Initialize the engine.
-        dp_rank = vllm_config.parallel_config.data_parallel_rank
-        super().__init__(vllm_config, on_head_node, handshake_address,
-                         executor_class, log_stats, dp_rank)
-
     def _init_data_parallel(self, vllm_config: VllmConfig):
         # Configure GPUs and stateless process group for data parallel.
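The comment above is all this hunk shows of `_init_data_parallel`, which configures GPUs and a stateless process group for each DP rank. The device-bookkeeping idea can be sketched with a hypothetical helper: each local rank narrows `CUDA_VISIBLE_DEVICES` to a disjoint slice before any CUDA context is created (the real logic lives in vLLM's config/platform code):

```python
import os


def assign_gpus_for_dp_rank(local_dp_rank: int, gpus_per_rank: int) -> None:
    """Hypothetical helper: give each local DP rank a disjoint GPU slice
    by narrowing CUDA_VISIBLE_DEVICES before any CUDA context exists."""
    first = local_dp_rank * gpus_per_rank
    os.environ["CUDA_VISIBLE_DEVICES"] = ",".join(
        str(i) for i in range(first, first + gpus_per_rank))


# e.g. local DP rank 1 with 2 GPUs per rank sees devices "2,3"
assign_gpus_for_dp_rank(local_dp_rank=1, gpus_per_rank=2)
print(os.environ["CUDA_VISIBLE_DEVICES"])
```

This is also why `DPEngineCoreActor.__init__` below deletes the empty `CUDA_VISIBLE_DEVICES` that Ray sets: an empty string would mask every device before this per-rank assignment can take effect.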
@@ -880,3 +891,70 @@ class DPEngineCoreProc(EngineCoreProc):
         return ParallelConfig.has_unfinished_dp(self.dp_group,
                                                 local_unfinished)
+
+
+class DPEngineCoreActor(DPEngineCoreProc):
+    """
+    Ray actor for running EngineCore in a data parallel context
+    """
+
+    def __init__(
+        self,
+        vllm_config: VllmConfig,
+        on_head_node: bool,
+        addresses: EngineZmqAddresses,
+        executor_class: type[Executor],
+        log_stats: bool,
+        dp_rank: int = 0,
+        local_dp_rank: int = 0,
+    ):
+        self.addresses = addresses
+
+        vllm_config.parallel_config.data_parallel_rank = dp_rank
+        vllm_config.parallel_config.data_parallel_rank_local = \
+            local_dp_rank
+
+        # Ray sets CUDA_VISIBLE_DEVICES to an empty string; we remove
+        # the variable so that data parallel groups can be initialized
+        # properly.
+        del os.environ['CUDA_VISIBLE_DEVICES']
+
+        super().__init__(vllm_config, on_head_node, "", executor_class,
+                         log_stats)
+
+    def _decorate_logs(self):
+        pass
+
+    @contextmanager
+    def _perform_handshake(self, handshake_address: str, identity: bytes,
+                           on_head_node: bool, vllm_config: VllmConfig):
+        """
+        For Ray, we don't need to actually perform a handshake.
+        All address information is known before actor creation,
+        so we simply yield the stored addresses.
+        """
+        yield self.addresses
+
+    def wait_for_init(self):
+        """
+        Wait until the engine core is initialized.
+
+        This is just an empty method. When ray.get() on this method
+        (or any other method of the actor) returns, it is guaranteed
+        that actor creation (i.e., __init__) is complete.
+        """
+        pass
+
+    def run(self):
+        """
+        Run the engine core busy loop.
+        """
+        try:
+            self.run_busy_loop()
+        except SystemExit:
+            logger.debug("EngineCore exiting.")
+            raise
+        except Exception:
+            logger.exception("EngineCore encountered a fatal error.")
+            raise
+        finally:
+            self.shutdown()
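A launcher's view of these actors follows directly from `wait_for_init` and `run`: create one actor per DP rank, `ray.get` any method call to barrier on `__init__`, then start the busy loops without blocking the driver. A toy, self-contained sketch of that pattern (a stand-in actor class, not the real `DPEngineCoreActor` constructor):

```python
import ray


@ray.remote
class ToyEngineActor:
    """Stand-in illustrating the init-barrier + busy-loop pattern above."""

    def __init__(self, dp_rank: int):
        self.dp_rank = dp_rank  # heavy model/engine init would happen here

    def wait_for_init(self) -> None:
        # Returning from any actor method proves __init__ has completed.
        pass

    def run(self) -> str:
        # Stand-in for run_busy_loop(); in vLLM this blocks for the
        # lifetime of the engine.
        return f"rank {self.dp_rank}: busy loop running"


ray.init()
actors = [ToyEngineActor.remote(dp_rank=i) for i in range(2)]
# Barrier: every actor has finished __init__ before any work is routed.
ray.get([a.wait_for_init.remote() for a in actors])
# Kick off the busy loops; the driver does not block on them.
run_refs = [a.run.remote() for a in actors]
print(ray.get(run_refs))
```

Placing the barrier on an empty method keeps startup cheap: the expensive work already happened in `__init__`, and Ray's actor semantics guarantee method calls are ordered after construction.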