[doc][distributed] doc for setting up multi-node environment (#6529)
[doc][distributed] add more doc for setting up multi-node environment (#6529)
This commit is contained in:
49
examples/run_cluster.sh
Normal file
49
examples/run_cluster.sh
Normal file
@@ -0,0 +1,49 @@
|
||||
#!/bin/bash
|
||||
|
||||
# Check for minimum number of required arguments
|
||||
if [ $# -lt 4 ]; then
|
||||
echo "Usage: $0 docker_image head_node_address --head|--worker path_to_hf_home [additional_args...]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Assign the first three arguments and shift them away
|
||||
DOCKER_IMAGE="$1"
|
||||
HEAD_NODE_ADDRESS="$2"
|
||||
NODE_TYPE="$3" # Should be --head or --worker
|
||||
PATH_TO_HF_HOME="$4"
|
||||
shift 4
|
||||
|
||||
# Additional arguments are passed directly to the Docker command
|
||||
ADDITIONAL_ARGS="$@"
|
||||
|
||||
# Validate node type
|
||||
if [ "${NODE_TYPE}" != "--head" ] && [ "${NODE_TYPE}" != "--worker" ]; then
|
||||
echo "Error: Node type must be --head or --worker"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Define a function to cleanup on EXIT signal
|
||||
cleanup() {
|
||||
docker stop node
|
||||
docker rm node
|
||||
}
|
||||
trap cleanup EXIT
|
||||
|
||||
# Command setup for head or worker node
|
||||
RAY_START_CMD="ray start --block"
|
||||
if [ "${NODE_TYPE}" == "--head" ]; then
|
||||
RAY_START_CMD+=" --head --port=6379"
|
||||
else
|
||||
RAY_START_CMD+=" --address=${HEAD_NODE_ADDRESS}:6379"
|
||||
fi
|
||||
|
||||
# Run the docker command with the user specified parameters and additional arguments
|
||||
docker run \
|
||||
--entrypoint /bin/bash \
|
||||
--network host \
|
||||
--name node \
|
||||
--shm-size 10.24g \
|
||||
--gpus all \
|
||||
-v "${PATH_TO_HF_HOME}:/root/.cache/huggingface" \
|
||||
${ADDITIONAL_ARGS} \
|
||||
"${DOCKER_IMAGE}" -c "${RAY_START_CMD}"
|
||||
Reference in New Issue
Block a user