[misc] Mention the `ray list nodes` command to troubleshoot Ray issues (#14318)
Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
This commit is contained in:
@@ -184,8 +184,9 @@ def _verify_bundles(placement_group: "PlacementGroup",
|
||||
f"group {placement_group.id}. Node id -> bundles "
|
||||
f"{node_id_to_bundle}. "
|
||||
"You don't have enough GPUs available in a current node. Check "
|
||||
"`ray status` to see if you have available GPUs in a node "
|
||||
f"{driver_node_id} before starting an vLLM engine.")
|
||||
"`ray status` and `ray list nodes` to see if you have available "
|
||||
"GPUs in a node `{driver_node_id}` before starting an vLLM engine."
|
||||
)
|
||||
|
||||
for node_id, bundles in node_id_to_bundle.items():
|
||||
if len(bundles) < parallel_config.tensor_parallel_size:
|
||||
@@ -225,8 +226,8 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
|
||||
wait_interval *= 2
|
||||
logger.info(
|
||||
"Waiting for creating a placement group of specs for "
|
||||
"%d seconds. specs=%s. Check "
|
||||
"`ray status` to see if you have enough resources,"
|
||||
"%d seconds. specs=%s. Check `ray status` and "
|
||||
"`ray list nodes` to see if you have enough resources,"
|
||||
" and make sure the IP addresses used by ray cluster"
|
||||
" are the same as VLLM_HOST_IP environment variable"
|
||||
" specified in each node if you are running on a multi-node.",
|
||||
@@ -238,8 +239,8 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
|
||||
raise ValueError(
|
||||
"Cannot provide a placement group of "
|
||||
f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See "
|
||||
"`ray status` to make sure the cluster has enough resources."
|
||||
) from None
|
||||
"`ray status` and `ray list nodes` to make sure the cluster has "
|
||||
"enough resources.") from None
|
||||
|
||||
|
||||
def _wait_until_pg_removed(current_placement_group: "PlacementGroup"):
|
||||
|
||||
Reference in New Issue
Block a user