[Bugfix] Improve GPU validation logging in Ray fallback scenarios (#25775)

Signed-off-by: Sairam Pillai <sairam.pillai61@gmail.com>
2025-10-30 17:27:59 +05:30
parent c01f6e525f
commit 74374386e2
2 changed files with 49 additions and 15 deletions
--- a/vllm/v1/executor/ray_utils.py
+++ b/vllm/v1/executor/ray_utils.py
@@ -255,12 +255,33 @@ def _wait_until_pg_ready(current_placement_group: "PlacementGroup"):
    try:
        ray.get(pg_ready_ref, timeout=0)
    except ray.exceptions.GetTimeoutError:
-        raise ValueError(
-            "Cannot provide a placement group of "
-            f"{placement_group_specs=} within {PG_WAIT_TIMEOUT} seconds. See "
-            "`ray status` and `ray list nodes` to make sure the cluster has "
-            "enough resources."
-        ) from None
+        # Provide more helpful error message when GPU count is exceeded
+        total_gpu_required = sum(spec.get("GPU", 0) for spec in placement_group_specs)
+        # If more than one GPU is required for the placement group, provide a
+        # more specific error message.
+        # We use >1 here because multi-GPU (tensor parallel) jobs are more
+        # likely to fail due to insufficient cluster resources, and users may
+        # need to adjust tensor_parallel_size to fit available GPUs.
+        if total_gpu_required > 1:
+            raise ValueError(
+                f"Cannot provide a placement group requiring "
+                f"{total_gpu_required} GPUs "
+                f"(placement_group_specs={placement_group_specs}) within "
+                f"{PG_WAIT_TIMEOUT} seconds.\n"
+                f"Tensor parallel size may exceed available GPUs in your "
+                f"cluster. Check resources with `ray status` and "
+                f"`ray list nodes`.\n"
+                f"If running on K8s with limited GPUs, consider reducing "
+                f"--tensor-parallel-size to match available GPU resources."
+            ) from None
+        else:
+            raise ValueError(
+                "Cannot provide a placement group of "
+                f"{placement_group_specs=} within "
+                f"{PG_WAIT_TIMEOUT} seconds. See "
+                "`ray status` and `ray list nodes` to make sure the cluster "
+                "has enough resources."
+            ) from None


 def _wait_until_pg_removed(current_placement_group: "PlacementGroup"):
@@ -299,6 +320,23 @@ def initialize_ray_cluster(
    assert_ray_available()
    from vllm.platforms import current_platform

+    # Prevalidate GPU requirements before Ray processing
+    if current_platform.is_cuda() and parallel_config.world_size > 1:
+        from vllm.utils import cuda_device_count_stateless
+
+        available_gpus = cuda_device_count_stateless()
+        if parallel_config.world_size > available_gpus:
+            logger.warning(
+                "Tensor parallel size (%d) exceeds available GPUs (%d). "
+                "This may result in Ray placement group allocation failures. "
+                "Consider reducing tensor_parallel_size to %d or less, "
+                "or ensure your Ray cluster has %d GPUs available.",
+                parallel_config.world_size,
+                available_gpus,
+                available_gpus,
+                parallel_config.world_size,
+            )
+
    if ray.is_initialized():
        logger.info("Ray is already initialized. Skipping Ray initialization.")
    elif current_platform.is_rocm() or current_platform.is_xpu():