diff --git a/.buildkite/.pipeline_gen_v2 b/.buildkite/.pipeline_gen_v2 new file mode 100644 index 000000000..e69de29bb diff --git a/.buildkite/image_build/image_build.sh b/.buildkite/image_build/image_build.sh index 9483ff2f2..f0bbaab77 100755 --- a/.buildkite/image_build/image_build.sh +++ b/.buildkite/image_build/image_build.sh @@ -143,7 +143,7 @@ resolve_parent_commit() { print_bake_config() { echo "--- :page_facing_up: Resolved bake configuration" BAKE_CONFIG_FILE="bake-config-build-${BUILDKITE_BUILD_NUMBER:-local}.json" - docker buildx bake -f "${VLLM_BAKE_FILE}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true + docker buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --print "${TARGET}" | tee "${BAKE_CONFIG_FILE}" || true echo "Saved bake config to ${BAKE_CONFIG_FILE}" echo "--- :arrow_down: Uploading bake config to Buildkite" buildkite-agent artifact upload "${BAKE_CONFIG_FILE}" @@ -170,9 +170,9 @@ IMAGE_TAG_LATEST=${8:-} # only used for main branch, optional # build config TARGET="test-ci" -CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}" -VLLM_BAKE_FILE="${VLLM_BAKE_FILE:-docker/docker-bake.hcl}" +VLLM_BAKE_FILE_PATH="${VLLM_BAKE_FILE_PATH:-docker/docker-bake.hcl}" BUILDER_NAME="${BUILDER_NAME:-vllm-builder}" +CI_HCL_URL="${CI_HCL_URL:-https://raw.githubusercontent.com/vllm-project/ci-infra/main/docker/ci.hcl}" CI_HCL_PATH="/tmp/ci.hcl" BUILDKIT_SOCKET="/run/buildkit/buildkitd.sock" @@ -180,9 +180,8 @@ prepare_cache_tags ecr_login # Environment info (for docs and human readers) -# CI_HCL_URL - URL to ci.hcl (default: from ci-infra main branch) # VLLM_CI_BRANCH - ci-infra branch to use (default: main) -# VLLM_BAKE_FILE - Path to vLLM's bake file (default: docker/docker-bake.hcl) +# VLLM_BAKE_FILE_PATH - Path to vLLM's bake file (default: docker/docker-bake.hcl) # BUILDER_NAME - Name for buildx builder (default: vllm-builder) # # Build configuration (exported as environment variables for bake): @@ -211,10 +210,9 @@ echo "IMAGE_TAG_LATEST: ${IMAGE_TAG_LATEST}" # print build configuration echo "--- :mag: Build configuration" echo "TARGET: ${TARGET}" -echo "CI HCL URL: ${CI_HCL_URL}" -echo "vLLM bake file: ${VLLM_BAKE_FILE}" +echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}" echo "BUILDER_NAME: ${BUILDER_NAME}" -echo "CI_HCL_PATH: ${CI_HCL_PATH}" +echo "CI_HCL_URL: ${CI_HCL_URL}" echo "BUILDKIT_SOCKET: ${BUILDKIT_SOCKET}" echo "--- :mag: Cache tags" @@ -227,11 +225,11 @@ check_and_skip_if_image_exists echo "--- :docker: Setting up Docker buildx bake" echo "Target: ${TARGET}" -echo "CI HCL URL: ${CI_HCL_URL}" -echo "vLLM bake file: ${VLLM_BAKE_FILE}" +echo "vLLM bake file: ${VLLM_BAKE_FILE_PATH}" +echo "CI HCL path: ${CI_HCL_PATH}" -if [[ ! -f "${VLLM_BAKE_FILE}" ]]; then - echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE}" +if [[ ! -f "${VLLM_BAKE_FILE_PATH}" ]]; then + echo "Error: vLLM bake file not found at ${VLLM_BAKE_FILE_PATH}" echo "Make sure you're running from the vLLM repository root" exit 1 fi @@ -240,15 +238,19 @@ echo "--- :arrow_down: Downloading ci.hcl" curl -sSfL -o "${CI_HCL_PATH}" "${CI_HCL_URL}" echo "Downloaded to ${CI_HCL_PATH}" +if [[ ! -f "${CI_HCL_PATH}" ]]; then + echo "Error: ci.hcl not found at ${CI_HCL_PATH}" + exit 1 +fi + setup_buildx_builder -# Compute parent commit for cache fallback (if not already set) resolve_parent_commit export PARENT_COMMIT print_bake_config echo "--- :docker: Building ${TARGET}" -docker --debug buildx bake -f "${VLLM_BAKE_FILE}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}" +docker --debug buildx bake -f "${VLLM_BAKE_FILE_PATH}" -f "${CI_HCL_PATH}" --progress plain "${TARGET}" echo "--- :white_check_mark: Build complete" diff --git a/.buildkite/image_build/image_build.yaml b/.buildkite/image_build/image_build.yaml index 6f601d384..163dd68c8 100644 --- a/.buildkite/image_build/image_build.yaml +++ b/.buildkite/image_build/image_build.yaml @@ -5,7 +5,7 @@ steps: depends_on: [] commands: - if [[ "$BUILDKITE_BRANCH" != "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG; fi - - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG_LATEST; fi + - if [[ "$BUILDKITE_BRANCH" == "main" ]]; then .buildkite/image_build/image_build.sh $REGISTRY $REPO $BUILDKITE_COMMIT $BRANCH $VLLM_USE_PRECOMPILED $VLLM_MERGE_BASE_COMMIT $IMAGE_TAG $IMAGE_TAG_LATEST; fi retry: automatic: - exit_status: -1 # Agent was lost diff --git a/.buildkite/test_areas/distributed.yaml b/.buildkite/test_areas/distributed.yaml index 57eabb6e4..51e1de3f0 100644 --- a/.buildkite/test_areas/distributed.yaml +++ b/.buildkite/test_areas/distributed.yaml @@ -180,7 +180,7 @@ steps: - tests/distributed/ - tests/examples/offline_inference/data_parallel.py commands: - - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 public.ecr.aws/q9t5s3a7/vllm-ci-postmerge-repo:0bec63fa317e1fbd62e19b0fc31c43c81bf89077 "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code" + - ./.buildkite/scripts/run-multi-node-test.sh /vllm-workspace/tests 2 2 $IMAGE_TAG "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=0 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_multi_node_assignment.py && VLLM_MULTI_NODE=1 pytest -v -s distributed/test_pipeline_parallel.py" "VLLM_TEST_SAME_HOST=0 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_same_node.py | grep 'Same node test passed' && NUM_NODES=2 torchrun --nnodes 2 --nproc-per-node=2 --rdzv_backend=c10d --rdzv_endpoint=192.168.10.10 distributed/test_node_count.py | grep 'Node count test passed' && python3 ../examples/offline_inference/data_parallel.py -dp=2 -tp=1 --dp-num-nodes=2 --dp-node-rank=1 --dp-master-addr=192.168.10.10 --dp-master-port=12345 --enforce-eager --trust-remote-code" - label: Distributed NixlConnector PD accuracy (4 GPUs) timeout_in_minutes: 30