[Distributed] Allow the placement group more time to wait for resources to be ready (#11138)
Signed-off-by: Jiaxin Shan <seedjeffwan@gmail.com>
This commit is contained in:
@@ -277,10 +277,14 @@ def initialize_ray_cluster(
|
|||||||
f"Total number of devices: {device_bundles}.")
|
f"Total number of devices: {device_bundles}.")
|
||||||
else:
|
else:
|
||||||
num_devices_in_cluster = ray.cluster_resources().get(device_str, 0)
|
num_devices_in_cluster = ray.cluster_resources().get(device_str, 0)
|
||||||
|
# Log a warning message and delay resource allocation failure response.
|
||||||
|
# Avoid immediate rejection to allow user-initiated placement group
|
||||||
|
# to be created and to wait for the cluster to be ready.
|
||||||
if parallel_config.world_size > num_devices_in_cluster:
|
if parallel_config.world_size > num_devices_in_cluster:
|
||||||
raise ValueError(
|
logger.warning(
|
||||||
f"The number of required {device_str}s exceeds the total "
|
"The number of required %ss exceeds the total "
|
||||||
f"number of available {device_str}s in the placement group.")
|
"number of available %ss in the placement group.", device_str,
|
||||||
|
device_str)
|
||||||
# Create a new placement group
|
# Create a new placement group
|
||||||
placement_group_specs: List[Dict[str, float]] = ([{
|
placement_group_specs: List[Dict[str, float]] = ([{
|
||||||
device_str: 1.0
|
device_str: 1.0
|
||||||
|
|||||||
Reference in New Issue
Block a user