[distributed][misc] error on same VLLM_HOST_IP setting (#7756)

This commit is contained in:
youkaichao
2024-08-21 16:25:34 -07:00
committed by GitHub
parent 8678a69ab5
commit 7eebe8ccaa
2 changed files with 17 additions and 1 deletion

View File

@@ -218,6 +218,19 @@ class RayGPUExecutor(DistributedGPUExecutor):
for node_id, gpu_ids in node_gpus.items():
node_gpus[node_id] = sorted(gpu_ids)
all_ips = set(worker_ips + [driver_ip])
n_ips = len(all_ips)
n_nodes = len(node_workers)
if n_nodes != n_ips:
raise RuntimeError(
f"Every node should have a unique IP address. Got {n_nodes}"
f" nodes with node ids {list(node_workers.keys())} and "
f"{n_ips} unique IP addresses {all_ips}. Please check your"
" network configuration. If you set `VLLM_HOST_IP` or "
"`HOST_IP` environment variable, make sure it is unique for"
" each node.")
VLLM_INSTANCE_ID = get_vllm_instance_id()
# Set environment variables for the driver and workers.