Ray placement group support (#397)

2023-07-19 22:49:31 -07:00
parent 8c4b2592fb
commit 9925c17940
5 changed files with 187 additions and 116 deletions
--- a/vllm/engine/ray_utils.py
+++ b/vllm/engine/ray_utils.py
@@ -1,15 +1,35 @@
 import socket
-from typing import List, Optional, Tuple
-
-try:
-    import ray
-except ImportError:
-    ray = None
+from typing import Optional, Tuple, TYPE_CHECKING

 from vllm.config import ParallelConfig

-# rank, node resource (node IP), device id
-DeviceID = Tuple[int, Optional[str], int]
+try:
+    import ray
+    from ray.air.util.torch_dist import TorchDistributedWorker
+
+    class RayWorker(TorchDistributedWorker):
+        """Ray wrapper for vllm.worker.Worker, allowing Worker to be
+        lazily initialized after Ray sets CUDA_VISIBLE_DEVICES."""
+
+        def __init__(self) -> None:
+            self.worker = None  # Populated later by init_worker().
+
+        def init_worker(self, worker_init_fn):
+            self.worker = worker_init_fn()  # Zero-argument factory call.
+
+        def __getattr__(self, name):
+            return getattr(self.worker, name)  # Fallback: ask the worker.
+
+        def execute_method(self, method, *args, **kwargs):
+            executor = getattr(self, method)  # May resolve via __getattr__.
+            return executor(*args, **kwargs)
+
+except ImportError:
+    ray = None
+    TorchDistributedWorker = None
+
+if TYPE_CHECKING:
+    from ray.util.placement_group import PlacementGroup


 def get_open_port():
@@ -22,7 +42,7 @@ def initialize_cluster(
    parallel_config: ParallelConfig,
    engine_use_ray: bool = False,
    ray_address: Optional[str] = None,
-) -> Tuple[str, List[List[DeviceID]]]:
+) -> Tuple[str, Optional["PlacementGroup"]]:
    """Initialize the distributed cluster probably with Ray.

    Args:
@@ -52,63 +72,36 @@ def initialize_cluster(
        # We need to setup the distributed init method to make sure
        # the distributed megatron code (e.g., get world size) works correctly.
        distributed_init_method = f"tcp://localhost:{port}"
-        all_stage_devices = [[(0, None, 0)]]
-        return distributed_init_method, all_stage_devices
+        return distributed_init_method, None

-    # Assume we have a uniform cluster that each node has the same number of
-    # GPUs for now.
-    valid_node_resources = []
-    num_devices_per_node = None
-    for node in ray.nodes():
-        if (not node["Alive"]) or node["Resources"]["GPU"] <= 0:
-            continue
-        if num_devices_per_node is None:
-            num_devices_per_node = node["Resources"]["GPU"]
-        else:
-            assert num_devices_per_node == node["Resources"]["GPU"], (
-                "The number of GPUs per node is not uniform.")
-        for key in node["Resources"]:
-            if key.startswith("node:"):
-                valid_node_resources.append(key)
-
-    # Verify the parallel config.
-    num_nodes = len(valid_node_resources)
-    if parallel_config.world_size > num_nodes * num_devices_per_node:
-        raise ValueError(
-            "The number of required GPUs exceeds the total number of "
-            "available GPUs.")
-    if parallel_config.tensor_parallel_size >= num_devices_per_node:
-        if parallel_config.tensor_parallel_size % num_devices_per_node != 0:
+    current_placement_group = ray.util.get_current_placement_group()
+    if current_placement_group:
+        # We are in a placement group
+        bundles = current_placement_group.bundle_specs
+        # Verify the group: at most 1 GPU per bundle; count the GPU bundles.
+        gpu_bundles = 0
+        for bundle in bundles:
+            assert bundle.get("GPU", 0) <= 1, (
+                "Placement group bundles cannot have more than 1 GPU")
+            if bundle.get("GPU", 0):
+                gpu_bundles += 1
+        if parallel_config.world_size > gpu_bundles:
            raise ValueError(
-                "The number of tensor parallelism is not divisible by the "
-                "number of GPUs per node.")
+                "The number of required GPUs exceeds the total number of "
+                "available GPUs in the placement group.")
    else:
-        if num_devices_per_node % parallel_config.tensor_parallel_size != 0:
+        num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0)
+        if parallel_config.world_size > num_gpus_in_cluster:
            raise ValueError(
-                "The number of GPUs per node is not divisible by the number "
-                "of tensor parallelism.")
+                "The number of required GPUs exceeds the total number of "
+                "available GPUs in the cluster.")
+        # Create a new placement group
+        current_placement_group = ray.util.placement_group([{
+            "GPU": 1
+        }] * parallel_config.world_size)
+        # Wait until PG is ready - this will block until all
+        # requested resources are available, and will timeout
+        # if they cannot be provisioned.
+        ray.get(current_placement_group.ready(), timeout=1800)

-    # Assign GPUs to pipeline stages.
-    rank = 0
-    current_node_id = 0
-    current_device_id = 0
-    distributed_init_method = None
-    all_stage_devices = []
-
-    for _ in range(parallel_config.pipeline_parallel_size):
-        stage_devices = []
-        for _ in range(parallel_config.tensor_parallel_size):
-            node_resource = valid_node_resources[current_node_id]
-            stage_devices.append((rank, node_resource, current_device_id))
-            if distributed_init_method is None:
-                ip = node_resource.split("node:")[-1]
-                port = get_open_port()
-                distributed_init_method = f"tcp://{ip}:{port}"
-            rank += 1
-            current_device_id += 1
-            if current_device_id >= num_devices_per_node:
-                current_node_id += 1
-                current_device_id = 0
-        all_stage_devices.append(stage_devices)
-
-    return distributed_init_method, all_stage_devices
+    return None, current_placement_group