[misc] Mention ray list nodes command to troubleshoot ray issues (#14318)

Signed-off-by: Rui Qiao <ruisearch42@gmail.com>
This commit is contained in:
Rui Qiao
2025-03-05 18:00:36 -08:00
committed by GitHub
parent f6bb18fd9a
commit abcc61e0af
2 changed files with 9 additions and 8 deletions

View File

@@ -184,8 +184,9 @@ def _verify_bundles(placement_group: "PlacementGroup",
f"group {placement_group.id}. Node id -> bundles "
f"{node_id_to_bundle}. "
"You don't have enough GPUs available in a current node. Check "
"`ray status` to see if you have available GPUs in a node "
f"{driver_node_id} before starting an vLLM engine.")
"`ray status` and `ray list nodes` to see if you have available "
"GPUs in a node `{driver_node_id}` before starting an vLLM engine."
)
for node_id, bundles in node_id_to_bundle.items():
if len(bundles) < parallel_config.tensor_parallel_size:
@@ -225,8 +226,8 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
wait_interval *= 2
logger.info(
"Waiting for creating a placement group of specs for "
"%d seconds. specs=%s. Check "
"`ray status` to see if you have enough resources,"
"%d seconds. specs=%s. Check `ray status` and "
"`ray list nodes` to see if you have enough resources,"
" and make sure the IP addresses used by ray cluster"
" are the same as VLLM_HOST_IP environment variable"
" specified in each node if you are running on a multi-node.",
@@ -238,8 +239,8 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
raise ValueError(
"Cannot provide a placement group of "
f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See "
"`ray status` to make sure the cluster has enough resources."
) from None
"`ray status` and `ray list nodes` to make sure the cluster has "
"enough resources.") from None
def _wait_until_pg_removed(current_placement_group: "PlacementGroup"):