Ray placement group support (#397)
This commit is contained in:
@@ -1,15 +1,35 @@
|
||||
import socket
|
||||
from typing import List, Optional, Tuple
|
||||
|
||||
try:
|
||||
import ray
|
||||
except ImportError:
|
||||
ray = None
|
||||
from typing import Optional, Tuple, TYPE_CHECKING
|
||||
|
||||
from vllm.config import ParallelConfig
|
||||
|
||||
# rank, node resource (node IP), device id
|
||||
DeviceID = Tuple[int, Optional[str], int]
|
||||
try:
|
||||
import ray
|
||||
from ray.air.util.torch_dist import TorchDistributedWorker
|
||||
|
||||
class RayWorker(TorchDistributedWorker):
|
||||
"""Ray wrapper for vllm.worker.Worker, allowing Worker to be
|
||||
lazliy initialized after Ray sets CUDA_VISIBLE_DEVICES."""
|
||||
|
||||
def __init__(self) -> None:
|
||||
self.worker = None
|
||||
|
||||
def init_worker(self, worker_init_fn):
|
||||
self.worker = worker_init_fn()
|
||||
|
||||
def __getattr__(self, name):
|
||||
return getattr(self.worker, name)
|
||||
|
||||
def execute_method(self, method, *args, **kwargs):
|
||||
executor = getattr(self, method)
|
||||
return executor(*args, **kwargs)
|
||||
|
||||
except ImportError:
|
||||
ray = None
|
||||
TorchDistributedWorker = None
|
||||
|
||||
if TYPE_CHECKING:
|
||||
from ray.util.placement_group import PlacementGroup
|
||||
|
||||
|
||||
def get_open_port():
|
||||
@@ -22,7 +42,7 @@ def initialize_cluster(
|
||||
parallel_config: ParallelConfig,
|
||||
engine_use_ray: bool = False,
|
||||
ray_address: Optional[str] = None,
|
||||
) -> Tuple[str, List[List[DeviceID]]]:
|
||||
) -> Tuple[str, Optional["PlacementGroup"]]:
|
||||
"""Initialize the distributed cluster probably with Ray.
|
||||
|
||||
Args:
|
||||
@@ -52,63 +72,36 @@ def initialize_cluster(
|
||||
# We need to setup the distributed init method to make sure
|
||||
# the distributed megatron code (e.g., get world size) works correctly.
|
||||
distributed_init_method = f"tcp://localhost:{port}"
|
||||
all_stage_devices = [[(0, None, 0)]]
|
||||
return distributed_init_method, all_stage_devices
|
||||
return distributed_init_method, None
|
||||
|
||||
# Assume we have a uniform cluster that each node has the same number of
|
||||
# GPUs for now.
|
||||
valid_node_resources = []
|
||||
num_devices_per_node = None
|
||||
for node in ray.nodes():
|
||||
if (not node["Alive"]) or node["Resources"]["GPU"] <= 0:
|
||||
continue
|
||||
if num_devices_per_node is None:
|
||||
num_devices_per_node = node["Resources"]["GPU"]
|
||||
else:
|
||||
assert num_devices_per_node == node["Resources"]["GPU"], (
|
||||
"The number of GPUs per node is not uniform.")
|
||||
for key in node["Resources"]:
|
||||
if key.startswith("node:"):
|
||||
valid_node_resources.append(key)
|
||||
|
||||
# Verify the parallel config.
|
||||
num_nodes = len(valid_node_resources)
|
||||
if parallel_config.world_size > num_nodes * num_devices_per_node:
|
||||
raise ValueError(
|
||||
"The number of required GPUs exceeds the total number of "
|
||||
"available GPUs.")
|
||||
if parallel_config.tensor_parallel_size >= num_devices_per_node:
|
||||
if parallel_config.tensor_parallel_size % num_devices_per_node != 0:
|
||||
current_placement_group = ray.util.get_current_placement_group()
|
||||
if current_placement_group:
|
||||
# We are in a placement group
|
||||
bundles = current_placement_group.bundle_specs
|
||||
# Verify that we can use the placement group.
|
||||
gpu_bundles = 0
|
||||
for bundle in bundles:
|
||||
assert bundle.get("GPU", 0) > 1, (
|
||||
"Placement group bundles cannot have more than 1 GPU")
|
||||
if bundle.get("GPU", 0):
|
||||
gpu_bundles += 1
|
||||
if parallel_config.world_size > gpu_bundles:
|
||||
raise ValueError(
|
||||
"The number of tensor parallelism is not divisible by the "
|
||||
"number of GPUs per node.")
|
||||
"The number of required GPUs exceeds the total number of "
|
||||
"available GPUs in the placement group.")
|
||||
else:
|
||||
if num_devices_per_node % parallel_config.tensor_parallel_size != 0:
|
||||
num_gpus_in_cluster = ray.cluster_resources().get("GPU", 0)
|
||||
if parallel_config.world_size > num_gpus_in_cluster:
|
||||
raise ValueError(
|
||||
"The number of GPUs per node is not divisible by the number "
|
||||
"of tensor parallelism.")
|
||||
"The number of required GPUs exceeds the total number of "
|
||||
"available GPUs in the cluster.")
|
||||
# Create a new placement group
|
||||
current_placement_group = ray.util.placement_group([{
|
||||
"GPU": 1
|
||||
}] * parallel_config.world_size)
|
||||
# Wait until PG is ready - this will block until all
|
||||
# requested resources are available, and will timeout
|
||||
# if they cannot be provisioned.
|
||||
ray.get(current_placement_group.ready(), timeout=1800)
|
||||
|
||||
# Assign GPUs to pipeline stages.
|
||||
rank = 0
|
||||
current_node_id = 0
|
||||
current_device_id = 0
|
||||
distributed_init_method = None
|
||||
all_stage_devices = []
|
||||
|
||||
for _ in range(parallel_config.pipeline_parallel_size):
|
||||
stage_devices = []
|
||||
for _ in range(parallel_config.tensor_parallel_size):
|
||||
node_resource = valid_node_resources[current_node_id]
|
||||
stage_devices.append((rank, node_resource, current_device_id))
|
||||
if distributed_init_method is None:
|
||||
ip = node_resource.split("node:")[-1]
|
||||
port = get_open_port()
|
||||
distributed_init_method = f"tcp://{ip}:{port}"
|
||||
rank += 1
|
||||
current_device_id += 1
|
||||
if current_device_id >= num_devices_per_node:
|
||||
current_node_id += 1
|
||||
current_device_id = 0
|
||||
all_stage_devices.append(stage_devices)
|
||||
|
||||
return distributed_init_method, all_stage_devices
|
||||
return None, current_placement_group
|
||||
|
||||
Reference in New Issue
Block a user