[DP] Internal Load Balancing Per Node [one-pod-per-node] (#21238)

Signed-off-by: Robert Shaw <robshaw@redhat.com>
Signed-off-by: Nick Hill <nhill@redhat.com>
Signed-off-by: Tyler Michael Smith <tyler@neuralmagic.com>
Co-authored-by: Robert Shaw <robshaw@redhat.com>
Co-authored-by: Nick Hill <nhill@redhat.com>
Co-authored-by: Tyler Michael Smith <tyler@neuralmagic.com>
2025-07-23 23:57:32 -04:00
parent eec6942014
commit d5b981f8b1
12 changed files with 486 additions and 45 deletions
--- a/vllm/v1/engine/core.py
+++ b/vllm/v1/engine/core.py
@@ -467,13 +467,14 @@ class EngineCoreProc(EngineCore):
        For DP>1 with internal loadbalancing this is with the shared front-end
        process which may reside on a different node.

-        For DP>1 with external loadbalancing, two handshakes are performed:
+        For DP>1 with external or hybrid loadbalancing, two handshakes are
+        performed:
            - With the rank 0 front-end process which retrieves the
              DP Coordinator ZMQ addresses and DP process group address.
            - With the colocated front-end process which retrieves the
              client input/output socket addresses.
-        with the exception of the rank 0 engine itself which doesn't require
-        the second handshake.
+        with the exception of the rank 0 and colocated engines themselves which
+        don't require the second handshake.

        Here, "front-end" process can mean the process containing the engine
        core client (which is the API server process in the case the API
@@ -482,15 +483,18 @@ class EngineCoreProc(EngineCore):
        """
        input_ctx = zmq.Context()
        is_local = local_client and client_handshake_address is None
+        headless = not local_client
        handshake = self._perform_handshake(input_ctx, handshake_address,
-                                            identity, is_local, vllm_config,
+                                            identity, is_local, headless,
+                                            vllm_config,
                                            vllm_config.parallel_config)
        if client_handshake_address is None:
            with handshake as addresses:
                yield addresses
        else:
+            assert local_client
            local_handshake = self._perform_handshake(
-                input_ctx, client_handshake_address, identity, local_client,
+                input_ctx, client_handshake_address, identity, True, False,
                vllm_config)
            with handshake as addresses, local_handshake as client_addresses:
                addresses.inputs = client_addresses.inputs
@@ -507,6 +511,7 @@ class EngineCoreProc(EngineCore):
        handshake_address: str,
        identity: bytes,
        local_client: bool,
+        headless: bool,
        vllm_config: VllmConfig,
        parallel_config_to_update: Optional[ParallelConfig] = None,
    ) -> Generator[EngineZmqAddresses, None, None]:
@@ -518,6 +523,7 @@ class EngineCoreProc(EngineCore):
                             bind=False) as handshake_socket:
            # Register engine with front-end.
            addresses = self.startup_handshake(handshake_socket, local_client,
+                                               headless,
                                               parallel_config_to_update)
            yield addresses

@@ -531,6 +537,7 @@ class EngineCoreProc(EngineCore):
                msgspec.msgpack.encode({
                    "status": "READY",
                    "local": local_client,
+                    "headless": headless,
                    "num_gpu_blocks": num_gpu_blocks,
                    "dp_stats_address": dp_stats_address,
                }))
@@ -539,6 +546,7 @@ class EngineCoreProc(EngineCore):
    def startup_handshake(
        handshake_socket: zmq.Socket,
        local_client: bool,
+        headless: bool,
        parallel_config: Optional[ParallelConfig] = None,
    ) -> EngineZmqAddresses:

@@ -547,6 +555,7 @@ class EngineCoreProc(EngineCore):
            msgspec.msgpack.encode({
                "status": "HELLO",
                "local": local_client,
+                "headless": headless,
            }))

        # Receive initialization message.