diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index a543c5966..5ccc55403 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -342,184 +342,143 @@ steps: # To build a specific version, trigger the build from that branch/tag. # # Environment variables for ROCm builds (set via Buildkite UI or schedule): - # ROCM_PYTHON_VERSION: Python version (default: 3.12) - # PYTORCH_ROCM_ARCH: GPU architectures (default: gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151) - # ROCM_UPLOAD_WHEELS: Upload to S3 (default: false for nightly, true for releases) - # ROCM_FORCE_REBUILD: Force rebuild base wheels, ignore S3 cache (default: false) # # Note: ROCm version is determined by BASE_IMAGE in docker/Dockerfile.rocm_base - # (currently rocm/dev-ubuntu-22.04:7.1-complete) # # ============================================================================= - # ROCm Input Step - Collect build configuration (manual trigger only) - - input: "ROCm Wheel Release Build Configuration" - key: input-rocm-config - depends_on: ~ - if: build.source == "ui" - fields: - - text: "Python Version" - key: "rocm-python-version" - default: "3.12" - hint: "Python version (e.g., 3.12)" - - text: "GPU Architectures" - key: "rocm-pytorch-rocm-arch" - default: "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151" - hint: "Semicolon-separated GPU architectures" - - select: "Upload Wheels to S3" - key: "rocm-upload-wheels" - default: "true" - options: - - label: "No - Build only (nightly/dev)" - value: "false" - - label: "Yes - Upload to S3 (release)" - value: "true" - - select: "Force Rebuild Base Wheels" - key: "rocm-force-rebuild" - default: "false" - hint: "Ignore S3 cache and rebuild base wheels from scratch" - options: - - label: "No - Use cached wheels if available" - value: "false" - - label: "Yes - Rebuild even if cache exists" - value: "true" - # ROCm Job 1: Build ROCm Base Wheels (with S3 caching) - 
- label: ":rocm: Build ROCm Base Wheels" + - label: ":rocm: Build ROCm Base Image & Wheels" id: build-rocm-base-wheels - depends_on: - - step: input-rocm-config - allow_failure: true # Allow failure so non-UI builds can proceed (input step is skipped) + depends_on: ~ agents: queue: cpu_queue_release commands: - # Set configuration and check cache - | set -euo pipefail - # Get values from meta-data (set by input step) or use defaults - PYTHON_VERSION="$$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo '')" - export PYTHON_VERSION="$${PYTHON_VERSION:-3.12}" - - PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')" - export PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}" - - # Check for force rebuild flag - ROCM_FORCE_REBUILD="$${ROCM_FORCE_REBUILD:-}" - if [ -z "$${ROCM_FORCE_REBUILD}" ]; then - ROCM_FORCE_REBUILD="$$(buildkite-agent meta-data get rocm-force-rebuild 2>/dev/null || echo '')" - fi - - echo "========================================" - echo "ROCm Base Wheels Build Configuration" - echo "========================================" - echo " PYTHON_VERSION: $${PYTHON_VERSION}" - echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}" - echo " ROCM_FORCE_REBUILD: $${ROCM_FORCE_REBUILD:-false}" - echo "========================================" - - # Save resolved config for later jobs - buildkite-agent meta-data set "rocm-python-version" "$${PYTHON_VERSION}" - buildkite-agent meta-data set "rocm-pytorch-rocm-arch" "$${PYTORCH_ROCM_ARCH}" - - # Check S3 cache for pre-built wheels + # Generate cache key CACHE_KEY=$$(.buildkite/scripts/cache-rocm-base-wheels.sh key) - CACHE_PATH=$$(.buildkite/scripts/cache-rocm-base-wheels.sh path) - echo "" - echo "Cache key: $${CACHE_KEY}" - echo "Cache path: $${CACHE_PATH}" + ECR_CACHE_TAG="public.ecr.aws/q9t5s3a7/vllm-release-repo:$${CACHE_KEY}-rocm-base" - # Save cache key for downstream jobs - 
buildkite-agent meta-data set "rocm-cache-key" "$${CACHE_KEY}" + echo "========================================" + echo "ROCm Base Build Configuration" + echo "========================================" + echo " CACHE_KEY: $${CACHE_KEY}" + echo " ECR_CACHE_TAG: $${ECR_CACHE_TAG}" + echo "========================================" + + # Login to ECR + aws ecr-public get-login-password --region us-east-1 | \ + docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 + + IMAGE_EXISTS=false + WHEELS_EXIST=false + + # Check ECR for Docker image (manifest lookup only, nothing is pulled) - CACHE_STATUS="miss" - if [ "$${ROCM_FORCE_REBUILD}" != "true" ]; then - CACHE_STATUS=$$(.buildkite/scripts/cache-rocm-base-wheels.sh check) - else - echo "Force rebuild requested, skipping cache check" + if docker manifest inspect "$${ECR_CACHE_TAG}" > /dev/null 2>&1; then + IMAGE_EXISTS=true + echo "ECR image cache HIT" + fi + + # Check S3 for wheels ($$ keeps the substitution for the agent shell) + WHEEL_CACHE_STATUS=$$(.buildkite/scripts/cache-rocm-base-wheels.sh check) + if [ "$${WHEEL_CACHE_STATUS}" = "hit" ]; then + WHEELS_EXIST=true + echo "S3 wheels cache HIT" fi - if [ "$${CACHE_STATUS}" = "hit" ]; then + + # Scenario 1: Both cached (best case) + if [ "$${IMAGE_EXISTS}" = "true" ] && [ "$${WHEELS_EXIST}" = "true" ]; then echo "" - echo "CACHE HIT! Downloading pre-built wheels..." 
+ echo "FULL CACHE HIT - Reusing both image and wheels" echo "" + + # Download wheels .buildkite/scripts/cache-rocm-base-wheels.sh download - - # Set the S3 path for the cached Docker image (for Job 2 to download) - S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}" - buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz" - - # Mark that we used cache (for Docker image handling) - buildkite-agent meta-data set "rocm-used-cache" "true" - + + # Save ECR tag for downstream jobs + buildkite-agent meta-data set "rocm-base-image-tag" "$${ECR_CACHE_TAG}" + + # Scenario 2: Image cached but wheels missing + elif [ "$${IMAGE_EXISTS}" = "true" ] && [ "$${WHEELS_EXIST}" = "false" ]; then echo "" - echo "Cache download complete. Skipping Docker build." - echo "Docker image will be downloaded from: $${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz" - else + echo "PARTIAL CACHE - Image exists, extracting wheels..." echo "" - echo "CACHE MISS. Building from scratch..." - echo "" - - # Build full base image (for later vLLM build) - DOCKER_BUILDKIT=1 docker buildx build \ - --file docker/Dockerfile.rocm_base \ - --tag rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} \ - --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \ - --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \ - --build-arg USE_SCCACHE=1 \ - --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ - --build-arg SCCACHE_REGION_NAME=us-west-2 \ - --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \ - --load \ - . 
- - # Build debs_wheel_release stage for wheel extraction + + docker pull "$${ECR_CACHE_TAG}" + + # Rebuild wheel extraction stage on top of the cached base image DOCKER_BUILDKIT=1 docker buildx build \ --file docker/Dockerfile.rocm_base \ --tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \ + --build-arg BASE_IMAGE="$${ECR_CACHE_TAG}" \ --target debs_wheel_release \ - --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \ - --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \ --build-arg USE_SCCACHE=1 \ --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ --build-arg SCCACHE_REGION_NAME=us-west-2 \ --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \ --load \ . - - # Extract wheels from Docker image + + # Extract and upload wheels mkdir -p artifacts/rocm-base-wheels - container_id=$$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER}) - docker cp $${container_id}:/app/debs/. artifacts/rocm-base-wheels/ - docker rm $${container_id} - echo "Extracted base wheels:" - ls -lh artifacts/rocm-base-wheels/ - - # Upload wheels to S3 cache for future builds + cid=$$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER}) + docker cp $${cid}:/app/debs/. artifacts/rocm-base-wheels/ + docker rm $${cid} + + .buildkite/scripts/cache-rocm-base-wheels.sh upload + + buildkite-agent meta-data set "rocm-base-image-tag" "$${ECR_CACHE_TAG}" + + # Scenario 3: Full rebuild needed + else echo "" - echo "Uploading wheels to S3 cache..." + echo " CACHE MISS - Building from scratch..." + echo "" + + # Build full base image locally (pushed to ECR once wheels are cached) + DOCKER_BUILDKIT=1 docker buildx build \ + --file docker/Dockerfile.rocm_base \ + --tag "$${ECR_CACHE_TAG}" \ + --build-arg USE_SCCACHE=1 \ + --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ + --build-arg SCCACHE_REGION_NAME=us-west-2 \ + --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \ + --load \ + . 
+ + # Build wheel extraction stage (reuses the layers built above) + DOCKER_BUILDKIT=1 docker buildx build \ + --file docker/Dockerfile.rocm_base \ + --tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \ + --target debs_wheel_release \ + --build-arg USE_SCCACHE=1 \ + --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ + --build-arg SCCACHE_REGION_NAME=us-west-2 \ + --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \ + --load \ + . + + # Extract and upload wheels + mkdir -p artifacts/rocm-base-wheels + cid=$$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER}) + docker cp $${cid}:/app/debs/. artifacts/rocm-base-wheels/ + docker rm $${cid} + .buildkite/scripts/cache-rocm-base-wheels.sh upload - # Export base Docker image for reuse in vLLM build - mkdir -p artifacts/rocm-docker-image - docker save rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} | gzip > artifacts/rocm-docker-image/rocm-base-image.tar.gz - echo "Docker image size:" - ls -lh artifacts/rocm-docker-image/ - - # Upload large Docker image to S3 (also cached by cache key) - S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}" - echo "Uploading Docker image to $${S3_ARTIFACT_PATH}/" - aws s3 cp artifacts/rocm-docker-image/rocm-base-image.tar.gz "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz" - - # Save the S3 path for downstream jobs - buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz" - - # Mark that we did NOT use cache - buildkite-agent meta-data set "rocm-used-cache" "false" - + # Cache base docker image to ECR (image was loaded locally by the build above) + docker push "$${ECR_CACHE_TAG}" + + buildkite-agent meta-data set "rocm-base-image-tag" "$${ECR_CACHE_TAG}" + echo "" - echo "Build complete. Wheels cached for future builds." + echo " Build complete - Image and wheels cached" fi + artifact_paths: - "artifacts/rocm-base-wheels/*.whl" env: @@ -563,31 +522,25 @@ steps: echo "Downloading wheel artifacts from current build" buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" . 
- # Download Docker image from S3 (too large for Buildkite artifacts) - DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')" - if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then - echo "ERROR: rocm-docker-image-s3-path metadata not found" + # Get ECR image tag from metadata (set by build-rocm-base-wheels) + ECR_IMAGE_TAG="$$(buildkite-agent meta-data get rocm-base-image-tag 2>/dev/null || echo '')" + if [ -z "$${ECR_IMAGE_TAG}" ]; then + echo "ERROR: rocm-base-image-tag metadata not found" echo "This should have been set by the build-rocm-base-wheels job" exit 1 fi - echo "Downloading Docker image from $${DOCKER_IMAGE_S3_PATH}" - mkdir -p artifacts/rocm-docker-image - aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz - - # Load base Docker image and capture the tag - echo "Loading base Docker image..." - LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load) - echo "$${LOAD_OUTPUT}" - # Extract the actual loaded image tag from "Loaded image: " output - # This avoids picking up stale images (like rocm/vllm-dev:nightly) already on the agent - BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //') - if [ -z "$${BASE_IMAGE_TAG}" ]; then - echo "ERROR: Failed to extract image tag from docker load output" - echo "Load output was: $${LOAD_OUTPUT}" - exit 1 - fi - echo "Loaded base image: $${BASE_IMAGE_TAG}" - + + echo "Pulling base Docker image from ECR: $${ECR_IMAGE_TAG}" + + # Login to ECR + aws ecr-public get-login-password --region us-east-1 | \ + docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 + + # Pull base Docker image from ECR + docker pull "$${ECR_IMAGE_TAG}" + + echo "Loaded base image: $${ECR_IMAGE_TAG}" + # Prepare base wheels for Docker build context mkdir -p docker/context/base-wheels touch docker/context/base-wheels/.keep @@ -595,16 +548,11 @@ steps: echo "Base wheels for vLLM 
build:" ls -lh docker/context/base-wheels/ - # Get GPU architectures from meta-data - PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')" - PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}" - echo "========================================" echo "Building vLLM wheel with:" echo " BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}" echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}" - echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}" - echo " BASE_IMAGE: $${BASE_IMAGE_TAG}" + echo " BASE_IMAGE: $${ECR_IMAGE_TAG}" echo "========================================" # Build vLLM wheel using local checkout (REMOTE_VLLM=0) @@ -612,8 +560,7 @@ steps: --file docker/Dockerfile.rocm \ --target export_vllm_wheel_release \ --output type=local,dest=rocm-dist \ - --build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \ - --build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \ + --build-arg BASE_IMAGE="$${ECR_IMAGE_TAG}" \ --build-arg REMOTE_VLLM=0 \ --build-arg GIT_REPO_CHECK=1 \ --build-arg USE_SCCACHE=1 \ @@ -621,10 +568,8 @@ steps: --build-arg SCCACHE_REGION_NAME=us-west-2 \ --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \ . 
- echo "Built vLLM wheel:" ls -lh rocm-dist/*.whl - # Copy wheel to artifacts directory mkdir -p artifacts/rocm-vllm-wheel cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/ @@ -650,28 +595,6 @@ steps: - | set -euo pipefail - # Check if upload is enabled (from env var, meta-data, or release branch) - ROCM_UPLOAD_WHEELS="$${ROCM_UPLOAD_WHEELS:-}" - if [ -z "$${ROCM_UPLOAD_WHEELS}" ]; then - # Try to get from meta-data (input form) - ROCM_UPLOAD_WHEELS="$$(buildkite-agent meta-data get rocm-upload-wheels 2>/dev/null || echo '')" - fi - - echo "========================================" - echo "Upload check:" - echo " ROCM_UPLOAD_WHEELS: $${ROCM_UPLOAD_WHEELS}" - echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}" - echo "========================================" - - # Skip upload if not enabled - if [ "$${ROCM_UPLOAD_WHEELS}" != "true" ]; then - echo "Skipping S3 upload (ROCM_UPLOAD_WHEELS != true, NIGHTLY != 1, not a release branch)" - echo "To enable upload, set 'Upload Wheels to S3' to 'Yes' in the build configuration" - exit 0 - fi - - echo "Upload enabled, proceeding..." - # Download artifacts from current build echo "Downloading artifacts from current build" buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" . 
@@ -687,10 +610,7 @@ steps: - label: ":memo: Annotate ROCm wheel release" id: annotate-rocm-release depends_on: - - step: upload-rocm-wheels - allow_failure: true - - step: input-release-version - allow_failure: true + - upload-rocm-wheels agents: queue: cpu_queue_release commands: @@ -716,7 +636,7 @@ steps: S3_BUCKET: "vllm-wheels" VARIANT: "rocm700" - # ROCm Job 5: Build ROCm Release Docker Image + # ROCm Job 6: Build ROCm Release Docker Image - label: ":docker: Build release image - x86_64 - ROCm" id: build-rocm-release-image depends_on: @@ -728,42 +648,39 @@ steps: commands: - | set -euo pipefail - + # Login to ECR aws ecr-public get-login-password --region us-east-1 | \ docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 - - # Download Docker image from S3 (set by build-rocm-base-wheels) - DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')" - if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then - echo "ERROR: rocm-docker-image-s3-path metadata not found" + + # Get ECR image tag from metadata (set by build-rocm-base-wheels) + ECR_IMAGE_TAG="$$(buildkite-agent meta-data get rocm-base-image-tag 2>/dev/null || echo '')" + if [ -z "$${ECR_IMAGE_TAG}" ]; then + echo "ERROR: rocm-base-image-tag metadata not found" + echo "This should have been set by the build-rocm-base-wheels job" exit 1 fi - - echo "Downloading base image from $${DOCKER_IMAGE_S3_PATH}" - mkdir -p artifacts/rocm-docker-image - aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz - - # Load base Docker image - echo "Loading base Docker image..." 
- LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load) - BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //') - echo "Loaded base image: $${BASE_IMAGE_TAG}" - - # Tag and push the base image to ECR - docker tag "$${BASE_IMAGE_TAG}" public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base - docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base - echo "Pushed base image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base" - - # Get GPU architectures from meta-data - PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')" - PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}" - + + echo "Pulling base Docker image from ECR: $${ECR_IMAGE_TAG}" + + # Pull base Docker image from ECR + docker pull "$${ECR_IMAGE_TAG}" + + echo "Loaded base image: $${ECR_IMAGE_TAG}" + + # Pass the base image ECR tag to downstream steps (nightly publish) + buildkite-agent meta-data set "rocm-base-ecr-tag" "$${ECR_IMAGE_TAG}" + + echo "========================================" + echo "Building vLLM ROCm release image with:" + echo " BASE_IMAGE: $${ECR_IMAGE_TAG}" + echo " BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}" + echo "========================================" + # Build vLLM ROCm release image using cached base DOCKER_BUILDKIT=1 docker build \ --build-arg max_jobs=16 \ - --build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \ - --build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \ + --build-arg BASE_IMAGE="$${ECR_IMAGE_TAG}" \ --build-arg USE_SCCACHE=1 \ --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ --build-arg SCCACHE_REGION_NAME=us-west-2 \ @@ -772,10 +689,32 @@ steps: --target vllm-openai \ --progress plain \ -f docker/Dockerfile.rocm . 
- + # Push to ECR docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm - echo "Pushed: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm" + + echo "" + echo " Successfully built and pushed ROCm release image" + echo " Image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm" + echo "" env: DOCKER_BUILDKIT: "1" S3_BUCKET: "vllm-wheels" + + - label: "Publish nightly ROCm image to DockerHub" + depends_on: + - build-rocm-release-image + agents: + queue: small_cpu_queue_release + commands: + - "bash .buildkite/scripts/push-nightly-builds-rocm.sh" + # Clean up old nightly builds (keep only last 14) + - "bash .buildkite/scripts/cleanup-nightly-builds.sh nightly- vllm/vllm-openai-rocm" + - "bash .buildkite/scripts/cleanup-nightly-builds.sh base-nightly- vllm/vllm-openai-rocm" + plugins: + - docker-login#v3.0.0: + username: vllmbot + password-env: DOCKERHUB_TOKEN + env: + DOCKER_BUILDKIT: "1" + DOCKERHUB_USERNAME: "vllmbot" diff --git a/.buildkite/scripts/annotate-release.sh b/.buildkite/scripts/annotate-release.sh index fe73ea642..2da9db2f2 100755 --- a/.buildkite/scripts/annotate-release.sh +++ b/.buildkite/scripts/annotate-release.sh @@ -8,6 +8,8 @@ if [ -z "${RELEASE_VERSION}" ]; then RELEASE_VERSION="1.0.0.dev" fi +ROCM_BASE_CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key) + buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF To download the wheel (by commit): \`\`\` @@ -33,7 +35,7 @@ docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130 docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130 -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base +docker pull 
public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION} docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION} @@ -74,7 +76,7 @@ docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RE docker push vllm/vllm-openai-rocm:latest docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION} -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base docker push vllm/vllm-openai-rocm:latest-base diff --git a/.buildkite/scripts/annotate-rocm-release.sh b/.buildkite/scripts/annotate-rocm-release.sh index 8a5b34440..d66129722 100755 --- a/.buildkite/scripts/annotate-rocm-release.sh +++ b/.buildkite/scripts/annotate-rocm-release.sh @@ -5,20 +5,21 @@ # Generate Buildkite annotation for ROCm wheel release set -ex -# Get build configuration from meta-data +# Extract build configuration from Dockerfile.rocm_base (single source of truth) # Extract ROCm version dynamically from Dockerfile.rocm_base # BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0" ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown") -PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12") -PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151") 
+PYTHON_VERSION=$(grep '^ARG PYTHON_VERSION=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTHON_VERSION=//') +PYTORCH_ROCM_ARCH=$(grep '^ARG PYTORCH_ROCM_ARCH=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTORCH_ROCM_ARCH=//') -# TODO: Enable the nightly build for ROCm # Get release version, default to 1.0.0.dev for nightly/per-commit builds RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "") if [ -z "${RELEASE_VERSION}" ]; then RELEASE_VERSION="1.0.0.dev" fi +ROCM_BASE_CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key) + # S3 URLs S3_BUCKET="${S3_BUCKET:-vllm-wheels}" S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}" @@ -96,7 +97,7 @@ To download and upload the image: -docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm -docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base docker push vllm/vllm-openai-rocm:latest-base diff --git a/.buildkite/scripts/cache-rocm-base-wheels.sh b/.buildkite/scripts/cache-rocm-base-wheels.sh index 060d09db4..dc76544d3 100755 --- a/.buildkite/scripts/cache-rocm-base-wheels.sh +++ b/.buildkite/scripts/cache-rocm-base-wheels.sh @@ -15,8 +15,6 @@ # # Environment variables: # S3_BUCKET - S3 bucket name (default: vllm-wheels) -# PYTHON_VERSION - Python version (affects cache key) -# PYTORCH_ROCM_ARCH - GPU architectures (affects cache key) # # Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base, # so changes to ROCm version are captured by the Dockerfile hash. 
@@ -36,13 +34,7 @@ generate_cache_key() { fi local dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16) - # Include key build args that affect the output - # These should match the ARGs in Dockerfile.rocm_base that change the build output - # Note: ROCm version is determined by BASE_IMAGE in the Dockerfile, so it's captured by dockerfile_hash - local args_string="${PYTHON_VERSION:-}|${PYTORCH_ROCM_ARCH:-}" - local args_hash=$(echo "$args_string" | sha256sum | cut -c1-8) - - echo "${dockerfile_hash}-${args_hash}" + echo "${dockerfile_hash}" } CACHE_KEY=$(generate_cache_key) @@ -52,9 +44,6 @@ case "${1:-}" in check) echo "Checking cache for key: ${CACHE_KEY}" >&2 echo "Cache path: ${CACHE_PATH}" >&2 - echo "Variables used in cache key:" >&2 - echo " PYTHON_VERSION: ${PYTHON_VERSION:-}" >&2 - echo " PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH:-}" >&2 # Check if cache exists by listing objects # We look for at least one .whl file @@ -104,14 +93,16 @@ case "${1:-}" in echo "Cache key: ${CACHE_KEY}" echo "Cache path: ${CACHE_PATH}" echo "" - mkdir -p artifacts/rocm-base-wheels - aws s3 cp --recursive "${CACHE_PATH}" artifacts/rocm-base-wheels/ - + + # Use sync with include/exclude to only download .whl files + aws s3 sync "${CACHE_PATH}" artifacts/rocm-base-wheels/ \ + --exclude "*" \ + --include "*.whl" + echo "" echo "Downloaded wheels:" find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \; - WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l) echo "" echo "Total: $WHEEL_COUNT wheels" diff --git a/.buildkite/scripts/cleanup-nightly-builds.sh b/.buildkite/scripts/cleanup-nightly-builds.sh index 9e015e19f..85bd573a5 100755 --- a/.buildkite/scripts/cleanup-nightly-builds.sh +++ b/.buildkite/scripts/cleanup-nightly-builds.sh @@ -4,16 +4,19 @@ set -ex # Clean up old nightly builds from DockerHub, keeping only the last 14 builds # This script uses DockerHub API to list and delete old tags with specified prefix -# 
Usage: cleanup-nightly-builds.sh [TAG_PREFIX] -# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-" +# Usage: cleanup-nightly-builds.sh [TAG_PREFIX] [REPO] +# Example: cleanup-nightly-builds.sh "nightly-" +# Example: cleanup-nightly-builds.sh "cu130-nightly-" +# Example: cleanup-nightly-builds.sh "nightly-" "vllm/vllm-openai-rocm" -# Get tag prefix from argument, default to "nightly-" if not provided +# Get tag prefix and repo from arguments TAG_PREFIX="${1:-nightly-}" +REPO="${2:-vllm/vllm-openai}" -echo "Cleaning up tags with prefix: $TAG_PREFIX" +echo "Cleaning up tags with prefix: $TAG_PREFIX in repository: $REPO" -# DockerHub API endpoint for vllm/vllm-openai repository -REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags" +# DockerHub API endpoint for the repository +REPO_API_URL="https://hub.docker.com/v2/repositories/${REPO}/tags" # Get DockerHub credentials from environment if [ -z "$DOCKERHUB_TOKEN" ]; then @@ -70,7 +73,7 @@ delete_tag() { local tag_name="$1" echo "Deleting tag: $tag_name" - local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name" + local delete_url="https://hub.docker.com/v2/repositories/${REPO}/tags/$tag_name" set +x local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url") set -x diff --git a/.buildkite/scripts/push-nightly-builds-rocm.sh b/.buildkite/scripts/push-nightly-builds-rocm.sh new file mode 100644 index 000000000..07577f8fd --- /dev/null +++ b/.buildkite/scripts/push-nightly-builds-rocm.sh @@ -0,0 +1,62 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Push ROCm nightly base image and nightly image from ECR +# to Docker Hub as vllm/vllm-openai-rocm:base-nightly and vllm/vllm-openai-rocm:nightly +# and vllm/vllm-openai-rocm:base-nightly- and vllm/vllm-openai-rocm:nightly-. 
+# Run when NIGHTLY=1 after build-rocm-release-image has pushed to ECR. +# +# Local testing (no push to Docker Hub): +# BUILDKITE_COMMIT=<commit-sha> DRY_RUN=1 bash .buildkite/scripts/push-nightly-builds-rocm.sh +# Requires: AWS CLI configured (for ECR public login), Docker. For full run: Docker Hub login. + +set -ex + +# Use BUILDKITE_COMMIT from env (required; set to a commit that has ROCm image in ECR for local test) +BUILDKITE_COMMIT="${BUILDKITE_COMMIT:?Set BUILDKITE_COMMIT to the commit SHA that has the ROCm image in ECR (e.g. from a previous release pipeline run)}" +DRY_RUN="${DRY_RUN:-0}" + +# Get the base image ECR tag (set by build-rocm-release-image pipeline step) +BASE_ORIG_TAG="$(buildkite-agent meta-data get rocm-base-ecr-tag 2>/dev/null || echo "")" +if [ -z "$BASE_ORIG_TAG" ]; then + echo "WARNING: rocm-base-ecr-tag metadata not found, falling back to commit-based tag" + BASE_ORIG_TAG="public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base" +fi + +ORIG_TAG="${BUILDKITE_COMMIT}-rocm" +BASE_TAG_NAME="base-nightly" +TAG_NAME="nightly" +BASE_TAG_NAME_COMMIT="base-nightly-${BUILDKITE_COMMIT}" +TAG_NAME_COMMIT="nightly-${BUILDKITE_COMMIT}" + +echo "Pushing ROCm base image from ECR: $BASE_ORIG_TAG" +echo "Pushing ROCm release image from ECR tag: $ORIG_TAG to Docker Hub as $TAG_NAME and $TAG_NAME_COMMIT" +[[ "$DRY_RUN" == "1" ]] && echo "[DRY_RUN] Skipping push to Docker Hub" + +# Login to ECR and pull the image built by build-rocm-release-image +aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7 +docker pull "$BASE_ORIG_TAG" +docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG" + +# Tag for Docker Hub (base-nightly and base-nightly-<commit>, nightly and nightly-<commit>) +docker tag "$BASE_ORIG_TAG" vllm/vllm-openai-rocm:"$BASE_TAG_NAME" +docker tag "$BASE_ORIG_TAG" vllm/vllm-openai-rocm:"$BASE_TAG_NAME_COMMIT" +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG" 
vllm/vllm-openai-rocm:"$TAG_NAME" +docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG" vllm/vllm-openai-rocm:"$TAG_NAME_COMMIT" + +if [[ "$DRY_RUN" == "1" ]]; then + echo "[DRY_RUN] Would push vllm/vllm-openai-rocm:$BASE_TAG_NAME and vllm/vllm-openai-rocm:$BASE_TAG_NAME_COMMIT" + echo "[DRY_RUN] Would push vllm/vllm-openai-rocm:$TAG_NAME and vllm/vllm-openai-rocm:$TAG_NAME_COMMIT" + echo "[DRY_RUN] Local tags created. Exiting without push." + exit 0 +fi + +# Push to Docker Hub (docker-login plugin runs before this step in CI) +docker push vllm/vllm-openai-rocm:"$BASE_TAG_NAME" +docker push vllm/vllm-openai-rocm:"$BASE_TAG_NAME_COMMIT" +docker push vllm/vllm-openai-rocm:"$TAG_NAME" +docker push vllm/vllm-openai-rocm:"$TAG_NAME_COMMIT" + +echo "Pushed vllm/vllm-openai-rocm:$BASE_TAG_NAME and vllm/vllm-openai-rocm:$BASE_TAG_NAME_COMMIT" +echo "Pushed vllm/vllm-openai-rocm:$TAG_NAME and vllm/vllm-openai-rocm:$TAG_NAME_COMMIT"