[Releases] [ROCm] Enable Nightly Docker Image and Wheel Releases for ROCm (#37283)
Signed-off-by: tjtanaa <tunjian.tan@embeddedllm.com> Co-authored-by: Hongxia Yang <hongxiay.yang@amd.com>
This commit is contained in:
@@ -342,184 +342,143 @@ steps:
|
||||
# To build a specific version, trigger the build from that branch/tag.
|
||||
#
|
||||
# Environment variables for ROCm builds (set via Buildkite UI or schedule):
|
||||
# ROCM_PYTHON_VERSION: Python version (default: 3.12)
|
||||
# PYTORCH_ROCM_ARCH: GPU architectures (default: gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151)
|
||||
# ROCM_UPLOAD_WHEELS: Upload to S3 (default: false for nightly, true for releases)
|
||||
# ROCM_FORCE_REBUILD: Force rebuild base wheels, ignore S3 cache (default: false)
|
||||
#
|
||||
# Note: ROCm version is determined by BASE_IMAGE in docker/Dockerfile.rocm_base
|
||||
# (currently rocm/dev-ubuntu-22.04:7.1-complete)
|
||||
#
|
||||
# =============================================================================
|
||||
|
||||
# ROCm Input Step - Collect build configuration (manual trigger only)
|
||||
- input: "ROCm Wheel Release Build Configuration"
|
||||
key: input-rocm-config
|
||||
depends_on: ~
|
||||
if: build.source == "ui"
|
||||
fields:
|
||||
- text: "Python Version"
|
||||
key: "rocm-python-version"
|
||||
default: "3.12"
|
||||
hint: "Python version (e.g., 3.12)"
|
||||
- text: "GPU Architectures"
|
||||
key: "rocm-pytorch-rocm-arch"
|
||||
default: "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151"
|
||||
hint: "Semicolon-separated GPU architectures"
|
||||
- select: "Upload Wheels to S3"
|
||||
key: "rocm-upload-wheels"
|
||||
default: "true"
|
||||
options:
|
||||
- label: "No - Build only (nightly/dev)"
|
||||
value: "false"
|
||||
- label: "Yes - Upload to S3 (release)"
|
||||
value: "true"
|
||||
- select: "Force Rebuild Base Wheels"
|
||||
key: "rocm-force-rebuild"
|
||||
default: "false"
|
||||
hint: "Ignore S3 cache and rebuild base wheels from scratch"
|
||||
options:
|
||||
- label: "No - Use cached wheels if available"
|
||||
value: "false"
|
||||
- label: "Yes - Rebuild even if cache exists"
|
||||
value: "true"
|
||||
|
||||
# ROCm Job 1: Build ROCm Base Wheels (with S3 caching)
|
||||
- label: ":rocm: Build ROCm Base Wheels"
|
||||
- label: ":rocm: Build ROCm Base Image & Wheels"
|
||||
id: build-rocm-base-wheels
|
||||
depends_on:
|
||||
- step: input-rocm-config
|
||||
allow_failure: true # Allow failure so non-UI builds can proceed (input step is skipped)
|
||||
depends_on: ~
|
||||
agents:
|
||||
queue: cpu_queue_release
|
||||
commands:
|
||||
# Set configuration and check cache
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Get values from meta-data (set by input step) or use defaults
|
||||
PYTHON_VERSION="$$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo '')"
|
||||
export PYTHON_VERSION="$${PYTHON_VERSION:-3.12}"
|
||||
|
||||
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
||||
export PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
||||
|
||||
# Check for force rebuild flag
|
||||
ROCM_FORCE_REBUILD="$${ROCM_FORCE_REBUILD:-}"
|
||||
if [ -z "$${ROCM_FORCE_REBUILD}" ]; then
|
||||
ROCM_FORCE_REBUILD="$$(buildkite-agent meta-data get rocm-force-rebuild 2>/dev/null || echo '')"
|
||||
fi
|
||||
|
||||
echo "========================================"
|
||||
echo "ROCm Base Wheels Build Configuration"
|
||||
echo "========================================"
|
||||
echo " PYTHON_VERSION: $${PYTHON_VERSION}"
|
||||
echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
|
||||
echo " ROCM_FORCE_REBUILD: $${ROCM_FORCE_REBUILD:-false}"
|
||||
echo "========================================"
|
||||
|
||||
# Save resolved config for later jobs
|
||||
buildkite-agent meta-data set "rocm-python-version" "$${PYTHON_VERSION}"
|
||||
buildkite-agent meta-data set "rocm-pytorch-rocm-arch" "$${PYTORCH_ROCM_ARCH}"
|
||||
|
||||
# Check S3 cache for pre-built wheels
|
||||
# Generate cache key
|
||||
CACHE_KEY=$$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
|
||||
CACHE_PATH=$$(.buildkite/scripts/cache-rocm-base-wheels.sh path)
|
||||
echo ""
|
||||
echo "Cache key: $${CACHE_KEY}"
|
||||
echo "Cache path: $${CACHE_PATH}"
|
||||
ECR_CACHE_TAG="public.ecr.aws/q9t5s3a7/vllm-release-repo:$${CACHE_KEY}-rocm-base"
|
||||
|
||||
# Save cache key for downstream jobs
|
||||
buildkite-agent meta-data set "rocm-cache-key" "$${CACHE_KEY}"
|
||||
echo "========================================"
|
||||
echo "ROCm Base Build Configuration"
|
||||
echo "========================================"
|
||||
echo " CACHE_KEY: $${CACHE_KEY}"
|
||||
echo " ECR_CACHE_TAG: $${ECR_CACHE_TAG}"
|
||||
echo "========================================"
|
||||
|
||||
# Login to ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | \
|
||||
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||
|
||||
IMAGE_EXISTS=false
|
||||
WHEELS_EXIST=false
|
||||
|
||||
# Check ECR for Docker image
|
||||
|
||||
CACHE_STATUS="miss"
|
||||
if [ "$${ROCM_FORCE_REBUILD}" != "true" ]; then
|
||||
CACHE_STATUS=$$(.buildkite/scripts/cache-rocm-base-wheels.sh check)
|
||||
else
|
||||
echo "Force rebuild requested, skipping cache check"
|
||||
if docker manifest inspect "$${ECR_CACHE_TAG}" > /dev/null 2>&1; then
|
||||
IMAGE_EXISTS=true
|
||||
echo "ECR image cache HIT"
|
||||
fi
|
||||
|
||||
# Check S3 for wheels
|
||||
WHEEL_CACHE_STATUS=$(.buildkite/scripts/cache-rocm-base-wheels.sh check)
|
||||
if [ "$${WHEEL_CACHE_STATUS}" = "hit" ]; then
|
||||
WHEELS_EXIST=true
|
||||
echo "S3 wheels cache HIT"
|
||||
fi
|
||||
|
||||
if [ "$${CACHE_STATUS}" = "hit" ]; then
|
||||
|
||||
# Scenario 1: Both cached (best case)
|
||||
if [ "$${IMAGE_EXISTS}" = "true" ] && [ "$${WHEELS_EXIST}" = "true" ]; then
|
||||
echo ""
|
||||
echo "CACHE HIT! Downloading pre-built wheels..."
|
||||
echo "FULL CACHE HIT - Reusing both image and wheels"
|
||||
echo ""
|
||||
|
||||
# Download wheels
|
||||
.buildkite/scripts/cache-rocm-base-wheels.sh download
|
||||
|
||||
# Set the S3 path for the cached Docker image (for Job 2 to download)
|
||||
S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
|
||||
buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
||||
|
||||
# Mark that we used cache (for Docker image handling)
|
||||
buildkite-agent meta-data set "rocm-used-cache" "true"
|
||||
|
||||
|
||||
# Save ECR tag for downstream jobs
|
||||
buildkite-agent meta-data set "rocm-base-image-tag" "$${ECR_CACHE_TAG}"
|
||||
|
||||
# Scenario 2: Image cached but wheels missing
|
||||
elif [ "$${IMAGE_EXISTS}" = "true" ] && [ "$${WHEELS_EXIST}" = "false" ]; then
|
||||
echo ""
|
||||
echo "Cache download complete. Skipping Docker build."
|
||||
echo "Docker image will be downloaded from: $${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
||||
else
|
||||
echo "PARTIAL CACHE - Image exists, extracting wheels..."
|
||||
echo ""
|
||||
echo "CACHE MISS. Building from scratch..."
|
||||
echo ""
|
||||
|
||||
# Build full base image (for later vLLM build)
|
||||
DOCKER_BUILDKIT=1 docker buildx build \
|
||||
--file docker/Dockerfile.rocm_base \
|
||||
--tag rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} \
|
||||
--build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||
--build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
--load \
|
||||
.
|
||||
|
||||
# Build debs_wheel_release stage for wheel extraction
|
||||
|
||||
docker pull "$${ECR_CACHE_TAG}"
|
||||
|
||||
# Rebuild wheel extraction stage
|
||||
DOCKER_BUILDKIT=1 docker buildx build \
|
||||
--file docker/Dockerfile.rocm_base \
|
||||
--tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \
|
||||
--build-arg BASE_IMAGE="$${ECR_IMAGE_TAG}" \
|
||||
--target debs_wheel_release \
|
||||
--build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||
--build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
--load \
|
||||
.
|
||||
|
||||
# Extract wheels from Docker image
|
||||
|
||||
# Extract and upload wheels
|
||||
mkdir -p artifacts/rocm-base-wheels
|
||||
container_id=$$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER})
|
||||
docker cp $${container_id}:/app/debs/. artifacts/rocm-base-wheels/
|
||||
docker rm $${container_id}
|
||||
echo "Extracted base wheels:"
|
||||
ls -lh artifacts/rocm-base-wheels/
|
||||
|
||||
# Upload wheels to S3 cache for future builds
|
||||
cid=$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER})
|
||||
docker cp $${cid}:/app/debs/. artifacts/rocm-base-wheels/
|
||||
docker rm $${cid}
|
||||
|
||||
.buildkite/scripts/cache-rocm-base-wheels.sh upload
|
||||
|
||||
buildkite-agent meta-data set "rocm-base-image-tag" "$${ECR_CACHE_TAG}"
|
||||
|
||||
# Scenario 3: Full rebuild needed
|
||||
else
|
||||
echo ""
|
||||
echo "Uploading wheels to S3 cache..."
|
||||
echo " CACHE MISS - Building from scratch..."
|
||||
echo ""
|
||||
|
||||
# Build full base image and push to ECR
|
||||
DOCKER_BUILDKIT=1 docker buildx build \
|
||||
--file docker/Dockerfile.rocm_base \
|
||||
--tag "$${ECR_CACHE_TAG}" \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
--push \
|
||||
.
|
||||
|
||||
# Build wheel extraction stage
|
||||
DOCKER_BUILDKIT=1 docker buildx build \
|
||||
--file docker/Dockerfile.rocm_base \
|
||||
--tag rocm-base-debs:${BUILDKITE_BUILD_NUMBER} \
|
||||
--target debs_wheel_release \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
--load \
|
||||
.
|
||||
|
||||
# Extract and upload wheels
|
||||
mkdir -p artifacts/rocm-base-wheels
|
||||
cid=$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER})
|
||||
docker cp $${cid}:/app/debs/. artifacts/rocm-base-wheels/
|
||||
docker rm $${cid}
|
||||
|
||||
.buildkite/scripts/cache-rocm-base-wheels.sh upload
|
||||
|
||||
# Export base Docker image for reuse in vLLM build
|
||||
mkdir -p artifacts/rocm-docker-image
|
||||
docker save rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} | gzip > artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
||||
echo "Docker image size:"
|
||||
ls -lh artifacts/rocm-docker-image/
|
||||
|
||||
# Upload large Docker image to S3 (also cached by cache key)
|
||||
S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
|
||||
echo "Uploading Docker image to $${S3_ARTIFACT_PATH}/"
|
||||
aws s3 cp artifacts/rocm-docker-image/rocm-base-image.tar.gz "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
||||
|
||||
# Save the S3 path for downstream jobs
|
||||
buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
||||
|
||||
# Mark that we did NOT use cache
|
||||
buildkite-agent meta-data set "rocm-used-cache" "false"
|
||||
|
||||
# Cache base docker image to ECR
|
||||
docker push "$${ECR_CACHE_TAG}"
|
||||
|
||||
buildkite-agent meta-data set "rocm-base-image-tag" "$${ECR_CACHE_TAG}"
|
||||
|
||||
echo ""
|
||||
echo "Build complete. Wheels cached for future builds."
|
||||
echo " Build complete - Image and wheels cached"
|
||||
fi
|
||||
|
||||
artifact_paths:
|
||||
- "artifacts/rocm-base-wheels/*.whl"
|
||||
env:
|
||||
@@ -563,31 +522,25 @@ steps:
|
||||
echo "Downloading wheel artifacts from current build"
|
||||
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
|
||||
|
||||
# Download Docker image from S3 (too large for Buildkite artifacts)
|
||||
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
|
||||
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
|
||||
echo "ERROR: rocm-docker-image-s3-path metadata not found"
|
||||
# Get ECR image tag from metadata (set by build-rocm-base-wheels)
|
||||
ECR_IMAGE_TAG="$$(buildkite-agent meta-data get rocm-base-image-tag 2>/dev/null || echo '')"
|
||||
if [ -z "$${ECR_IMAGE_TAG}" ]; then
|
||||
echo "ERROR: rocm-base-image-tag metadata not found"
|
||||
echo "This should have been set by the build-rocm-base-wheels job"
|
||||
exit 1
|
||||
fi
|
||||
echo "Downloading Docker image from $${DOCKER_IMAGE_S3_PATH}"
|
||||
mkdir -p artifacts/rocm-docker-image
|
||||
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
||||
|
||||
# Load base Docker image and capture the tag
|
||||
echo "Loading base Docker image..."
|
||||
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
|
||||
echo "$${LOAD_OUTPUT}"
|
||||
# Extract the actual loaded image tag from "Loaded image: <tag>" output
|
||||
# This avoids picking up stale images (like rocm/vllm-dev:nightly) already on the agent
|
||||
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
|
||||
if [ -z "$${BASE_IMAGE_TAG}" ]; then
|
||||
echo "ERROR: Failed to extract image tag from docker load output"
|
||||
echo "Load output was: $${LOAD_OUTPUT}"
|
||||
exit 1
|
||||
fi
|
||||
echo "Loaded base image: $${BASE_IMAGE_TAG}"
|
||||
|
||||
|
||||
echo "Pulling base Docker image from ECR: $${ECR_IMAGE_TAG}"
|
||||
|
||||
# Login to ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | \
|
||||
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||
|
||||
# Pull base Docker image from ECR
|
||||
docker pull "$${ECR_IMAGE_TAG}"
|
||||
|
||||
echo "Loaded base image: $${ECR_IMAGE_TAG}"
|
||||
|
||||
# Prepare base wheels for Docker build context
|
||||
mkdir -p docker/context/base-wheels
|
||||
touch docker/context/base-wheels/.keep
|
||||
@@ -595,16 +548,11 @@ steps:
|
||||
echo "Base wheels for vLLM build:"
|
||||
ls -lh docker/context/base-wheels/
|
||||
|
||||
# Get GPU architectures from meta-data
|
||||
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
||||
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
||||
|
||||
echo "========================================"
|
||||
echo "Building vLLM wheel with:"
|
||||
echo " BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
|
||||
echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
|
||||
echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
|
||||
echo " BASE_IMAGE: $${BASE_IMAGE_TAG}"
|
||||
echo " BASE_IMAGE: $${ECR_IMAGE_TAG}"
|
||||
echo "========================================"
|
||||
|
||||
# Build vLLM wheel using local checkout (REMOTE_VLLM=0)
|
||||
@@ -612,8 +560,7 @@ steps:
|
||||
--file docker/Dockerfile.rocm \
|
||||
--target export_vllm_wheel_release \
|
||||
--output type=local,dest=rocm-dist \
|
||||
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
|
||||
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||
--build-arg BASE_IMAGE="$${ECR_IMAGE_TAG}" \
|
||||
--build-arg REMOTE_VLLM=0 \
|
||||
--build-arg GIT_REPO_CHECK=1 \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
@@ -621,10 +568,8 @@ steps:
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
.
|
||||
|
||||
echo "Built vLLM wheel:"
|
||||
ls -lh rocm-dist/*.whl
|
||||
|
||||
# Copy wheel to artifacts directory
|
||||
mkdir -p artifacts/rocm-vllm-wheel
|
||||
cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/
|
||||
@@ -650,28 +595,6 @@ steps:
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Check if upload is enabled (from env var, meta-data, or release branch)
|
||||
ROCM_UPLOAD_WHEELS="$${ROCM_UPLOAD_WHEELS:-}"
|
||||
if [ -z "$${ROCM_UPLOAD_WHEELS}" ]; then
|
||||
# Try to get from meta-data (input form)
|
||||
ROCM_UPLOAD_WHEELS="$$(buildkite-agent meta-data get rocm-upload-wheels 2>/dev/null || echo '')"
|
||||
fi
|
||||
|
||||
echo "========================================"
|
||||
echo "Upload check:"
|
||||
echo " ROCM_UPLOAD_WHEELS: $${ROCM_UPLOAD_WHEELS}"
|
||||
echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
|
||||
echo "========================================"
|
||||
|
||||
# Skip upload if not enabled
|
||||
if [ "$${ROCM_UPLOAD_WHEELS}" != "true" ]; then
|
||||
echo "Skipping S3 upload (ROCM_UPLOAD_WHEELS != true, NIGHTLY != 1, not a release branch)"
|
||||
echo "To enable upload, set 'Upload Wheels to S3' to 'Yes' in the build configuration"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Upload enabled, proceeding..."
|
||||
|
||||
# Download artifacts from current build
|
||||
echo "Downloading artifacts from current build"
|
||||
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
|
||||
@@ -687,10 +610,7 @@ steps:
|
||||
- label: ":memo: Annotate ROCm wheel release"
|
||||
id: annotate-rocm-release
|
||||
depends_on:
|
||||
- step: upload-rocm-wheels
|
||||
allow_failure: true
|
||||
- step: input-release-version
|
||||
allow_failure: true
|
||||
- upload-rocm-wheels
|
||||
agents:
|
||||
queue: cpu_queue_release
|
||||
commands:
|
||||
@@ -716,7 +636,7 @@ steps:
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
VARIANT: "rocm700"
|
||||
|
||||
# ROCm Job 5: Build ROCm Release Docker Image
|
||||
# ROCm Job 6: Build ROCm Release Docker Image
|
||||
- label: ":docker: Build release image - x86_64 - ROCm"
|
||||
id: build-rocm-release-image
|
||||
depends_on:
|
||||
@@ -728,42 +648,39 @@ steps:
|
||||
commands:
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
|
||||
# Login to ECR
|
||||
aws ecr-public get-login-password --region us-east-1 | \
|
||||
docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||
|
||||
# Download Docker image from S3 (set by build-rocm-base-wheels)
|
||||
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
|
||||
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
|
||||
echo "ERROR: rocm-docker-image-s3-path metadata not found"
|
||||
|
||||
# Get ECR image tag from metadata (set by build-rocm-base-wheels)
|
||||
ECR_IMAGE_TAG="$$(buildkite-agent meta-data get rocm-base-image-tag 2>/dev/null || echo '')"
|
||||
if [ -z "$${ECR_IMAGE_TAG}" ]; then
|
||||
echo "ERROR: rocm-base-image-tag metadata not found"
|
||||
echo "This should have been set by the build-rocm-base-wheels job"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Downloading base image from $${DOCKER_IMAGE_S3_PATH}"
|
||||
mkdir -p artifacts/rocm-docker-image
|
||||
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
||||
|
||||
# Load base Docker image
|
||||
echo "Loading base Docker image..."
|
||||
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
|
||||
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
|
||||
echo "Loaded base image: $${BASE_IMAGE_TAG}"
|
||||
|
||||
# Tag and push the base image to ECR
|
||||
docker tag "$${BASE_IMAGE_TAG}" public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
|
||||
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base
|
||||
echo "Pushed base image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm-base"
|
||||
|
||||
# Get GPU architectures from meta-data
|
||||
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
||||
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
||||
|
||||
|
||||
echo "Pulling base Docker image from ECR: $${ECR_IMAGE_TAG}"
|
||||
|
||||
# Pull base Docker image from ECR
|
||||
docker pull "$${ECR_IMAGE_TAG}"
|
||||
|
||||
echo "Loaded base image: $${ECR_IMAGE_TAG}"
|
||||
|
||||
# Pass the base image ECR tag to downstream steps (nightly publish)
|
||||
buildkite-agent meta-data set "rocm-base-ecr-tag" "$${ECR_IMAGE_TAG}"
|
||||
|
||||
echo "========================================"
|
||||
echo "Building vLLM ROCm release image with:"
|
||||
echo " BASE_IMAGE: $${ECR_IMAGE_TAG}"
|
||||
echo " BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
|
||||
echo "========================================"
|
||||
|
||||
# Build vLLM ROCm release image using cached base
|
||||
DOCKER_BUILDKIT=1 docker build \
|
||||
--build-arg max_jobs=16 \
|
||||
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
|
||||
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||
--build-arg BASE_IMAGE="$${ECR_IMAGE_TAG}" \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
@@ -772,10 +689,32 @@ steps:
|
||||
--target vllm-openai \
|
||||
--progress plain \
|
||||
-f docker/Dockerfile.rocm .
|
||||
|
||||
|
||||
# Push to ECR
|
||||
docker push public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm
|
||||
echo "Pushed: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
|
||||
|
||||
echo ""
|
||||
echo " Successfully built and pushed ROCm release image"
|
||||
echo " Image: public.ecr.aws/q9t5s3a7/vllm-release-repo:$${BUILDKITE_COMMIT}-rocm"
|
||||
echo ""
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
- label: "Publish nightly ROCm image to DockerHub"
|
||||
depends_on:
|
||||
- build-rocm-release-image
|
||||
agents:
|
||||
queue: small_cpu_queue_release
|
||||
commands:
|
||||
- "bash .buildkite/scripts/push-nightly-builds-rocm.sh"
|
||||
# Clean up old nightly builds (keep only last 14)
|
||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh nightly- vllm/vllm-openai-rocm"
|
||||
- "bash .buildkite/scripts/cleanup-nightly-builds.sh base-nightly- vllm/vllm-openai-rocm"
|
||||
plugins:
|
||||
- docker-login#v3.0.0:
|
||||
username: vllmbot
|
||||
password-env: DOCKERHUB_TOKEN
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
|
||||
@@ -8,6 +8,8 @@ if [ -z "${RELEASE_VERSION}" ]; then
|
||||
RELEASE_VERSION="1.0.0.dev"
|
||||
fi
|
||||
|
||||
ROCM_BASE_CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
|
||||
|
||||
buildkite-agent annotate --style 'info' --context 'release-workflow' << EOF
|
||||
To download the wheel (by commit):
|
||||
\`\`\`
|
||||
@@ -33,7 +35,7 @@ docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-x86_64-cu130
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-aarch64-cu130
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-cpu-release-repo:v${RELEASE_VERSION}
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-arm64-cpu-release-repo:v${RELEASE_VERSION}
|
||||
@@ -74,7 +76,7 @@ docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT} vllm/vllm-openai-rocm:v${RE
|
||||
docker push vllm/vllm-openai-rocm:latest
|
||||
docker push vllm/vllm-openai-rocm:v${RELEASE_VERSION}
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||
docker push vllm/vllm-openai-rocm:latest-base
|
||||
|
||||
@@ -5,20 +5,21 @@
|
||||
# Generate Buildkite annotation for ROCm wheel release
|
||||
set -ex
|
||||
|
||||
# Get build configuration from meta-data
|
||||
# Extract build configuration from Dockerfile.rocm_base (single source of truth)
|
||||
# Extract ROCm version dynamically from Dockerfile.rocm_base
|
||||
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.0-complete -> extracts "7.0"
|
||||
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
|
||||
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
|
||||
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
|
||||
PYTHON_VERSION=$(grep '^ARG PYTHON_VERSION=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTHON_VERSION=//')
|
||||
PYTORCH_ROCM_ARCH=$(grep '^ARG PYTORCH_ROCM_ARCH=' docker/Dockerfile.rocm_base | sed 's/^ARG PYTORCH_ROCM_ARCH=//')
|
||||
|
||||
# TODO: Enable the nightly build for ROCm
|
||||
# Get release version, default to 1.0.0.dev for nightly/per-commit builds
|
||||
RELEASE_VERSION=$(buildkite-agent meta-data get release-version 2>/dev/null || echo "")
|
||||
if [ -z "${RELEASE_VERSION}" ]; then
|
||||
RELEASE_VERSION="1.0.0.dev"
|
||||
fi
|
||||
|
||||
ROCM_BASE_CACHE_KEY=$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
|
||||
|
||||
# S3 URLs
|
||||
S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
|
||||
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
|
||||
@@ -96,7 +97,7 @@ To download and upload the image:
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm
|
||||
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:${ROCM_BASE_CACHE_KEY}-rocm-base vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:latest-base
|
||||
docker tag vllm/vllm-openai-rocm:${BUILDKITE_COMMIT}-base vllm/vllm-openai-rocm:v${RELEASE_VERSION}-base
|
||||
docker push vllm/vllm-openai-rocm:latest-base
|
||||
|
||||
@@ -15,8 +15,6 @@
|
||||
#
|
||||
# Environment variables:
|
||||
# S3_BUCKET - S3 bucket name (default: vllm-wheels)
|
||||
# PYTHON_VERSION - Python version (affects cache key)
|
||||
# PYTORCH_ROCM_ARCH - GPU architectures (affects cache key)
|
||||
#
|
||||
# Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base,
|
||||
# so changes to ROCm version are captured by the Dockerfile hash.
|
||||
@@ -36,13 +34,7 @@ generate_cache_key() {
|
||||
fi
|
||||
local dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16)
|
||||
|
||||
# Include key build args that affect the output
|
||||
# These should match the ARGs in Dockerfile.rocm_base that change the build output
|
||||
# Note: ROCm version is determined by BASE_IMAGE in the Dockerfile, so it's captured by dockerfile_hash
|
||||
local args_string="${PYTHON_VERSION:-}|${PYTORCH_ROCM_ARCH:-}"
|
||||
local args_hash=$(echo "$args_string" | sha256sum | cut -c1-8)
|
||||
|
||||
echo "${dockerfile_hash}-${args_hash}"
|
||||
echo "${dockerfile_hash}"
|
||||
}
|
||||
|
||||
CACHE_KEY=$(generate_cache_key)
|
||||
@@ -52,9 +44,6 @@ case "${1:-}" in
|
||||
check)
|
||||
echo "Checking cache for key: ${CACHE_KEY}" >&2
|
||||
echo "Cache path: ${CACHE_PATH}" >&2
|
||||
echo "Variables used in cache key:" >&2
|
||||
echo " PYTHON_VERSION: ${PYTHON_VERSION:-<not set>}" >&2
|
||||
echo " PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH:-<not set>}" >&2
|
||||
|
||||
# Check if cache exists by listing objects
|
||||
# We look for at least one .whl file
|
||||
@@ -104,14 +93,16 @@ case "${1:-}" in
|
||||
echo "Cache key: ${CACHE_KEY}"
|
||||
echo "Cache path: ${CACHE_PATH}"
|
||||
echo ""
|
||||
|
||||
mkdir -p artifacts/rocm-base-wheels
|
||||
aws s3 cp --recursive "${CACHE_PATH}" artifacts/rocm-base-wheels/
|
||||
|
||||
|
||||
# Use sync with include/exclude to only download .whl files
|
||||
aws s3 sync "${CACHE_PATH}" artifacts/rocm-base-wheels/ \
|
||||
--exclude "*" \
|
||||
--include "*.whl"
|
||||
|
||||
echo ""
|
||||
echo "Downloaded wheels:"
|
||||
find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' -exec ls -lh {} \;
|
||||
|
||||
WHEEL_COUNT=$(find artifacts/rocm-base-wheels -maxdepth 1 -name '*.whl' 2>/dev/null | wc -l)
|
||||
echo ""
|
||||
echo "Total: $WHEEL_COUNT wheels"
|
||||
|
||||
@@ -4,16 +4,19 @@ set -ex
|
||||
|
||||
# Clean up old nightly builds from DockerHub, keeping only the last 14 builds
|
||||
# This script uses DockerHub API to list and delete old tags with specified prefix
|
||||
# Usage: cleanup-nightly-builds.sh [TAG_PREFIX]
|
||||
# Example: cleanup-nightly-builds.sh "nightly-" or cleanup-nightly-builds.sh "cu130-nightly-"
|
||||
# Usage: cleanup-nightly-builds.sh [TAG_PREFIX] [REPO]
|
||||
# Example: cleanup-nightly-builds.sh "nightly-"
|
||||
# Example: cleanup-nightly-builds.sh "cu130-nightly-"
|
||||
# Example: cleanup-nightly-builds.sh "nightly-" "vllm/vllm-openai-rocm"
|
||||
|
||||
# Get tag prefix from argument, default to "nightly-" if not provided
|
||||
# Get tag prefix and repo from arguments
|
||||
TAG_PREFIX="${1:-nightly-}"
|
||||
REPO="${2:-vllm/vllm-openai}"
|
||||
|
||||
echo "Cleaning up tags with prefix: $TAG_PREFIX"
|
||||
echo "Cleaning up tags with prefix: $TAG_PREFIX in repository: $REPO"
|
||||
|
||||
# DockerHub API endpoint for vllm/vllm-openai repository
|
||||
REPO_API_URL="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags"
|
||||
# DockerHub API endpoint for the repository
|
||||
REPO_API_URL="https://hub.docker.com/v2/repositories/${REPO}/tags"
|
||||
|
||||
# Get DockerHub credentials from environment
|
||||
if [ -z "$DOCKERHUB_TOKEN" ]; then
|
||||
@@ -70,7 +73,7 @@ delete_tag() {
|
||||
local tag_name="$1"
|
||||
echo "Deleting tag: $tag_name"
|
||||
|
||||
local delete_url="https://hub.docker.com/v2/repositories/vllm/vllm-openai/tags/$tag_name"
|
||||
local delete_url="https://hub.docker.com/v2/repositories/${REPO}/tags/$tag_name"
|
||||
set +x
|
||||
local response=$(curl -s -X DELETE -H "Authorization: Bearer $BEARER_TOKEN" "$delete_url")
|
||||
set -x
|
||||
|
||||
62
.buildkite/scripts/push-nightly-builds-rocm.sh
Normal file
62
.buildkite/scripts/push-nightly-builds-rocm.sh
Normal file
@@ -0,0 +1,62 @@
|
||||
#!/bin/bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Push ROCm nightly base image and nightly image from ECR
|
||||
# to Docker Hub as vllm/vllm-openai-rocm:base-nightly and vllm/vllm-openai-rocm:nightly
|
||||
# and vllm/vllm-openai-rocm:base-nightly-<commit> and vllm/vllm-openai-rocm:nightly-<commit>.
|
||||
# Run when NIGHTLY=1 after build-rocm-release-image has pushed to ECR.
|
||||
#
|
||||
# Local testing (no push to Docker Hub):
|
||||
# BUILDKITE_COMMIT=<commit-with-rocm-image-in-ecr> DRY_RUN=1 bash .buildkite/scripts/push-nightly-builds-rocm.sh
|
||||
# Requires: AWS CLI configured (for ECR public login), Docker. For full run: Docker Hub login.
|
||||
|
||||
set -ex
|
||||
|
||||
# Use BUILDKITE_COMMIT from env (required; set to a commit that has ROCm image in ECR for local test)
|
||||
BUILDKITE_COMMIT="${BUILDKITE_COMMIT:?Set BUILDKITE_COMMIT to the commit SHA that has the ROCm image in ECR (e.g. from a previous release pipeline run)}"
|
||||
DRY_RUN="${DRY_RUN:-0}"
|
||||
|
||||
# Get the base image ECR tag (set by build-rocm-release-image pipeline step)
|
||||
BASE_ORIG_TAG="$(buildkite-agent meta-data get rocm-base-ecr-tag 2>/dev/null || echo "")"
|
||||
if [ -z "$BASE_ORIG_TAG" ]; then
|
||||
echo "WARNING: rocm-base-ecr-tag metadata not found, falling back to commit-based tag"
|
||||
BASE_ORIG_TAG="public.ecr.aws/q9t5s3a7/vllm-release-repo:${BUILDKITE_COMMIT}-rocm-base"
|
||||
fi
|
||||
|
||||
ORIG_TAG="${BUILDKITE_COMMIT}-rocm"
|
||||
BASE_TAG_NAME="base-nightly"
|
||||
TAG_NAME="nightly"
|
||||
BASE_TAG_NAME_COMMIT="base-nightly-${BUILDKITE_COMMIT}"
|
||||
TAG_NAME_COMMIT="nightly-${BUILDKITE_COMMIT}"
|
||||
|
||||
echo "Pushing ROCm base image from ECR: $BASE_ORIG_TAG"
|
||||
echo "Pushing ROCm release image from ECR tag: $ORIG_TAG to Docker Hub as $TAG_NAME and $TAG_NAME_COMMIT"
|
||||
[[ "$DRY_RUN" == "1" ]] && echo "[DRY_RUN] Skipping push to Docker Hub"
|
||||
|
||||
# Login to ECR and pull the image built by build-rocm-release-image
|
||||
aws ecr-public get-login-password --region us-east-1 | docker login --username AWS --password-stdin public.ecr.aws/q9t5s3a7
|
||||
docker pull "$BASE_ORIG_TAG"
|
||||
docker pull public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG"
|
||||
|
||||
# Tag for Docker Hub (base-nightly and base-nightly-<commit>, nightly and nightly-<commit>)
|
||||
docker tag "$BASE_ORIG_TAG" vllm/vllm-openai-rocm:"$BASE_TAG_NAME"
|
||||
docker tag "$BASE_ORIG_TAG" vllm/vllm-openai-rocm:"$BASE_TAG_NAME_COMMIT"
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG" vllm/vllm-openai-rocm:"$TAG_NAME"
|
||||
docker tag public.ecr.aws/q9t5s3a7/vllm-release-repo:"$ORIG_TAG" vllm/vllm-openai-rocm:"$TAG_NAME_COMMIT"
|
||||
|
||||
if [[ "$DRY_RUN" == "1" ]]; then
|
||||
echo "[DRY_RUN] Would push vllm/vllm-openai-rocm:$BASE_TAG_NAME and vllm/vllm-openai-rocm:$BASE_TAG_NAME_COMMIT"
|
||||
echo "[DRY_RUN] Would push vllm/vllm-openai-rocm:$TAG_NAME and vllm/vllm-openai-rocm:$TAG_NAME_COMMIT"
|
||||
echo "[DRY_RUN] Local tags created. Exiting without push."
|
||||
exit 0
|
||||
fi
|
||||
|
||||
# Push to Docker Hub (docker-login plugin runs before this step in CI)
|
||||
docker push vllm/vllm-openai-rocm:"$BASE_TAG_NAME"
|
||||
docker push vllm/vllm-openai-rocm:"$BASE_TAG_NAME_COMMIT"
|
||||
docker push vllm/vllm-openai-rocm:"$TAG_NAME"
|
||||
docker push vllm/vllm-openai-rocm:"$TAG_NAME_COMMIT"
|
||||
|
||||
echo "Pushed vllm/vllm-openai-rocm:$BASE_TAG_NAME and vllm/vllm-openai-rocm:$BASE_TAG_NAME_COMMIT"
|
||||
echo "Pushed vllm/vllm-openai-rocm:$TAG_NAME and vllm/vllm-openai-rocm:$TAG_NAME_COMMIT"
|
||||
Reference in New Issue
Block a user