Cherry pick [ROCm] [CI] [Release] Rocm wheel pipeline with sccache #32264
Signed-off-by: Kevin H. Luu <khluu000@gmail.com>
This commit is contained in:
@@ -214,3 +214,365 @@ steps:
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
DOCKERHUB_USERNAME: "vllmbot"
|
||||
|
||||
# =============================================================================
|
||||
# ROCm Release Pipeline (x86_64 only)
|
||||
# =============================================================================
|
||||
#
|
||||
# vLLM version is determined by the Buildkite checkout (like CUDA pipeline).
|
||||
# To build a specific version, trigger the build from that branch/tag.
|
||||
#
|
||||
# Environment variables for ROCm builds (set via Buildkite UI or schedule):
|
||||
# ROCM_PYTHON_VERSION: Python version (default: 3.12)
|
||||
# PYTORCH_ROCM_ARCH: GPU architectures (default: gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151)
|
||||
# ROCM_UPLOAD_WHEELS: Upload to S3 (default: false for nightly, true for releases)
|
||||
# ROCM_FORCE_REBUILD: Force rebuild base wheels, ignore S3 cache (default: false)
|
||||
#
|
||||
# Note: ROCm version is determined by BASE_IMAGE in docker/Dockerfile.rocm_base
|
||||
# (currently rocm/dev-ubuntu-22.04:7.1-complete)
|
||||
#
|
||||
# =============================================================================
|
||||
|
||||
# ROCm Input Step - Collect build configuration (manual trigger only)
|
||||
- input: "ROCm Wheel Release Build Configuration"
|
||||
key: input-rocm-config
|
||||
depends_on: ~
|
||||
if: build.source == "ui"
|
||||
fields:
|
||||
- text: "Python Version"
|
||||
key: "rocm-python-version"
|
||||
default: "3.12"
|
||||
hint: "Python version (e.g., 3.12)"
|
||||
- text: "GPU Architectures"
|
||||
key: "rocm-pytorch-rocm-arch"
|
||||
default: "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151"
|
||||
hint: "Semicolon-separated GPU architectures"
|
||||
- select: "Upload Wheels to S3"
|
||||
key: "rocm-upload-wheels"
|
||||
default: "true"
|
||||
options:
|
||||
- label: "No - Build only (nightly/dev)"
|
||||
value: "false"
|
||||
- label: "Yes - Upload to S3 (release)"
|
||||
value: "true"
|
||||
- select: "Force Rebuild Base Wheels"
|
||||
key: "rocm-force-rebuild"
|
||||
default: "false"
|
||||
hint: "Ignore S3 cache and rebuild base wheels from scratch"
|
||||
options:
|
||||
- label: "No - Use cached wheels if available"
|
||||
value: "false"
|
||||
- label: "Yes - Rebuild even if cache exists"
|
||||
value: "true"
|
||||
|
||||
# ROCm Job 1: Build ROCm Base Wheels (with S3 caching)
|
||||
- label: ":rocm: Build ROCm Base Wheels"
|
||||
id: build-rocm-base-wheels
|
||||
depends_on:
|
||||
- step: input-rocm-config
|
||||
allow_failure: true # Allow failure so non-UI builds can proceed (input step is skipped)
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
# Set configuration and check cache
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Get values from meta-data (set by input step) or use defaults
|
||||
PYTHON_VERSION="$$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo '')"
|
||||
export PYTHON_VERSION="$${PYTHON_VERSION:-3.12}"
|
||||
|
||||
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
||||
export PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
||||
|
||||
# Check for force rebuild flag
|
||||
ROCM_FORCE_REBUILD="$${ROCM_FORCE_REBUILD:-}"
|
||||
if [ -z "$${ROCM_FORCE_REBUILD}" ]; then
|
||||
ROCM_FORCE_REBUILD="$$(buildkite-agent meta-data get rocm-force-rebuild 2>/dev/null || echo '')"
|
||||
fi
|
||||
|
||||
echo "========================================"
|
||||
echo "ROCm Base Wheels Build Configuration"
|
||||
echo "========================================"
|
||||
echo " PYTHON_VERSION: $${PYTHON_VERSION}"
|
||||
echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
|
||||
echo " ROCM_FORCE_REBUILD: $${ROCM_FORCE_REBUILD:-false}"
|
||||
echo "========================================"
|
||||
|
||||
# Save resolved config for later jobs
|
||||
buildkite-agent meta-data set "rocm-python-version" "$${PYTHON_VERSION}"
|
||||
buildkite-agent meta-data set "rocm-pytorch-rocm-arch" "$${PYTORCH_ROCM_ARCH}"
|
||||
|
||||
# Check S3 cache for pre-built wheels
|
||||
CACHE_KEY=$$(.buildkite/scripts/cache-rocm-base-wheels.sh key)
|
||||
CACHE_PATH=$$(.buildkite/scripts/cache-rocm-base-wheels.sh path)
|
||||
echo ""
|
||||
echo "Cache key: $${CACHE_KEY}"
|
||||
echo "Cache path: $${CACHE_PATH}"
|
||||
|
||||
# Save cache key for downstream jobs
|
||||
buildkite-agent meta-data set "rocm-cache-key" "$${CACHE_KEY}"
|
||||
|
||||
CACHE_STATUS="miss"
|
||||
if [ "$${ROCM_FORCE_REBUILD}" != "true" ]; then
|
||||
CACHE_STATUS=$$(.buildkite/scripts/cache-rocm-base-wheels.sh check)
|
||||
else
|
||||
echo "Force rebuild requested, skipping cache check"
|
||||
fi
|
||||
|
||||
if [ "$${CACHE_STATUS}" = "hit" ]; then
|
||||
echo ""
|
||||
echo "CACHE HIT! Downloading pre-built wheels..."
|
||||
echo ""
|
||||
.buildkite/scripts/cache-rocm-base-wheels.sh download
|
||||
|
||||
# Set the S3 path for the cached Docker image (for Job 2 to download)
|
||||
S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
|
||||
buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
||||
|
||||
# Mark that we used cache (for Docker image handling)
|
||||
buildkite-agent meta-data set "rocm-used-cache" "true"
|
||||
|
||||
echo ""
|
||||
echo "Cache download complete. Skipping Docker build."
|
||||
echo "Docker image will be downloaded from: $${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
||||
else
|
||||
echo ""
|
||||
echo "CACHE MISS. Building from scratch..."
|
||||
echo ""
|
||||
|
||||
# Build full base image (for later vLLM build)
|
||||
DOCKER_BUILDKIT=1 docker buildx build \
|
||||
--file docker/Dockerfile.rocm_base \
|
||||
--tag rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} \
|
||||
--build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||
--build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
--load \
|
||||
.
|
||||
|
||||
# Build debs_wheel_release stage for wheel extraction
|
||||
DOCKER_BUILDKIT=1 docker buildx build \
|
||||
--file docker/Dockerfile.rocm_base \
|
||||
--tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \
|
||||
--target debs_wheel_release \
|
||||
--build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||
--build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
--load \
|
||||
.
|
||||
|
||||
# Extract wheels from Docker image
|
||||
mkdir -p artifacts/rocm-base-wheels
|
||||
container_id=$$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER})
|
||||
docker cp $${container_id}:/app/debs/. artifacts/rocm-base-wheels/
|
||||
docker rm $${container_id}
|
||||
echo "Extracted base wheels:"
|
||||
ls -lh artifacts/rocm-base-wheels/
|
||||
|
||||
# Upload wheels to S3 cache for future builds
|
||||
echo ""
|
||||
echo "Uploading wheels to S3 cache..."
|
||||
.buildkite/scripts/cache-rocm-base-wheels.sh upload
|
||||
|
||||
# Export base Docker image for reuse in vLLM build
|
||||
mkdir -p artifacts/rocm-docker-image
|
||||
docker save rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} | gzip > artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
||||
echo "Docker image size:"
|
||||
ls -lh artifacts/rocm-docker-image/
|
||||
|
||||
# Upload large Docker image to S3 (also cached by cache key)
|
||||
S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}"
|
||||
echo "Uploading Docker image to $${S3_ARTIFACT_PATH}/"
|
||||
aws s3 cp artifacts/rocm-docker-image/rocm-base-image.tar.gz "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
||||
|
||||
# Save the S3 path for downstream jobs
|
||||
buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz"
|
||||
|
||||
# Mark that we did NOT use cache
|
||||
buildkite-agent meta-data set "rocm-used-cache" "false"
|
||||
|
||||
echo ""
|
||||
echo "Build complete. Wheels cached for future builds."
|
||||
fi
|
||||
artifact_paths:
|
||||
- "artifacts/rocm-base-wheels/*.whl"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
# ROCm Job 2: Build vLLM ROCm Wheel
|
||||
- label: ":python: Build vLLM ROCm Wheel"
|
||||
id: build-rocm-vllm-wheel
|
||||
depends_on:
|
||||
- step: build-rocm-base-wheels
|
||||
allow_failure: false
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
timeout_in_minutes: 180
|
||||
commands:
|
||||
# Download artifacts and prepare Docker image
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Ensure git tags are up-to-date (Buildkite's default fetch doesn't update tags)
|
||||
# This fixes version detection when tags are moved/force-pushed
|
||||
echo "Fetching latest tags from origin..."
|
||||
git fetch --tags --force origin
|
||||
|
||||
# Log tag information for debugging version detection
|
||||
echo "========================================"
|
||||
echo "Git Tag Verification"
|
||||
echo "========================================"
|
||||
echo "Current HEAD: $(git rev-parse HEAD)"
|
||||
echo "git describe --tags: $(git describe --tags 2>/dev/null || echo 'No tags found')"
|
||||
echo ""
|
||||
echo "Recent tags (pointing to commits near HEAD):"
|
||||
git tag -l --sort=-creatordate | head -5
|
||||
echo "setuptools_scm version detection:"
|
||||
pip install -q setuptools_scm 2>/dev/null || true
|
||||
python3 -c "import setuptools_scm; print(' Detected version:', setuptools_scm.get_version())" 2>/dev/null || echo " (setuptools_scm not available in this environment)"
|
||||
echo "========================================"
|
||||
|
||||
# Download wheel artifacts from current build
|
||||
echo "Downloading wheel artifacts from current build"
|
||||
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
|
||||
|
||||
# Download Docker image from S3 (too large for Buildkite artifacts)
|
||||
DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')"
|
||||
if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then
|
||||
echo "ERROR: rocm-docker-image-s3-path metadata not found"
|
||||
echo "This should have been set by the build-rocm-base-wheels job"
|
||||
exit 1
|
||||
fi
|
||||
echo "Downloading Docker image from $${DOCKER_IMAGE_S3_PATH}"
|
||||
mkdir -p artifacts/rocm-docker-image
|
||||
aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz
|
||||
|
||||
# Load base Docker image and capture the tag
|
||||
echo "Loading base Docker image..."
|
||||
LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load)
|
||||
echo "$${LOAD_OUTPUT}"
|
||||
# Extract the actual loaded image tag from "Loaded image: <tag>" output
|
||||
# This avoids picking up stale images (like rocm/vllm-dev:nightly) already on the agent
|
||||
BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //')
|
||||
if [ -z "$${BASE_IMAGE_TAG}" ]; then
|
||||
echo "ERROR: Failed to extract image tag from docker load output"
|
||||
echo "Load output was: $${LOAD_OUTPUT}"
|
||||
exit 1
|
||||
fi
|
||||
echo "Loaded base image: $${BASE_IMAGE_TAG}"
|
||||
|
||||
# Prepare base wheels for Docker build context
|
||||
mkdir -p docker/context/base-wheels
|
||||
touch docker/context/base-wheels/.keep
|
||||
cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/
|
||||
echo "Base wheels for vLLM build:"
|
||||
ls -lh docker/context/base-wheels/
|
||||
|
||||
# Get GPU architectures from meta-data
|
||||
PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')"
|
||||
PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}"
|
||||
|
||||
echo "========================================"
|
||||
echo "Building vLLM wheel with:"
|
||||
echo " BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}"
|
||||
echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
|
||||
echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}"
|
||||
echo " BASE_IMAGE: $${BASE_IMAGE_TAG}"
|
||||
echo "========================================"
|
||||
|
||||
# Build vLLM wheel using local checkout (REMOTE_VLLM=0)
|
||||
DOCKER_BUILDKIT=1 docker build \
|
||||
--file docker/Dockerfile.rocm \
|
||||
--target export_vllm_wheel_release \
|
||||
--output type=local,dest=rocm-dist \
|
||||
--build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \
|
||||
--build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \
|
||||
--build-arg REMOTE_VLLM=0 \
|
||||
--build-arg GIT_REPO_CHECK=1 \
|
||||
--build-arg USE_SCCACHE=1 \
|
||||
--build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \
|
||||
--build-arg SCCACHE_REGION_NAME=us-west-2 \
|
||||
--build-arg SCCACHE_S3_NO_CREDENTIALS=0 \
|
||||
.
|
||||
|
||||
echo "Built vLLM wheel:"
|
||||
ls -lh rocm-dist/*.whl
|
||||
|
||||
# Copy wheel to artifacts directory
|
||||
mkdir -p artifacts/rocm-vllm-wheel
|
||||
cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/
|
||||
echo "Final vLLM wheel:"
|
||||
ls -lh artifacts/rocm-vllm-wheel/
|
||||
artifact_paths:
|
||||
- "artifacts/rocm-vllm-wheel/*.whl"
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
# ROCm Job 3: Upload Wheels to S3
|
||||
- label: ":s3: Upload ROCm Wheels to S3"
|
||||
id: upload-rocm-wheels
|
||||
depends_on:
|
||||
- step: build-rocm-vllm-wheel
|
||||
allow_failure: false
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
timeout_in_minutes: 60
|
||||
commands:
|
||||
# Download all wheel artifacts and run upload
|
||||
- |
|
||||
set -euo pipefail
|
||||
|
||||
# Check if upload is enabled (from env var, meta-data, or release branch)
|
||||
ROCM_UPLOAD_WHEELS="$${ROCM_UPLOAD_WHEELS:-}"
|
||||
if [ -z "$${ROCM_UPLOAD_WHEELS}" ]; then
|
||||
# Try to get from meta-data (input form)
|
||||
ROCM_UPLOAD_WHEELS="$$(buildkite-agent meta-data get rocm-upload-wheels 2>/dev/null || echo '')"
|
||||
fi
|
||||
|
||||
echo "========================================"
|
||||
echo "Upload check:"
|
||||
echo " ROCM_UPLOAD_WHEELS: $${ROCM_UPLOAD_WHEELS}"
|
||||
echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}"
|
||||
echo "========================================"
|
||||
|
||||
# Skip upload if not enabled
|
||||
if [ "$${ROCM_UPLOAD_WHEELS}" != "true" ]; then
|
||||
echo "Skipping S3 upload (ROCM_UPLOAD_WHEELS != true, NIGHTLY != 1, not a release branch)"
|
||||
echo "To enable upload, set 'Upload Wheels to S3' to 'Yes' in the build configuration"
|
||||
exit 0
|
||||
fi
|
||||
|
||||
echo "Upload enabled, proceeding..."
|
||||
|
||||
# Download artifacts from current build
|
||||
echo "Downloading artifacts from current build"
|
||||
buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" .
|
||||
buildkite-agent artifact download "artifacts/rocm-vllm-wheel/*.whl" .
|
||||
|
||||
# Run upload script
|
||||
bash .buildkite/scripts/upload-rocm-wheels.sh
|
||||
env:
|
||||
DOCKER_BUILDKIT: "1"
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
# ROCm Job 4: Annotate ROCm Wheel Release
|
||||
- label: ":memo: Annotate ROCm wheel release"
|
||||
id: annotate-rocm-release
|
||||
depends_on:
|
||||
- step: upload-rocm-wheels
|
||||
allow_failure: true
|
||||
agents:
|
||||
queue: cpu_queue_postmerge
|
||||
commands:
|
||||
- "bash .buildkite/scripts/annotate-rocm-release.sh"
|
||||
env:
|
||||
S3_BUCKET: "vllm-wheels"
|
||||
|
||||
74
.buildkite/scripts/annotate-rocm-release.sh
Executable file
74
.buildkite/scripts/annotate-rocm-release.sh
Executable file
@@ -0,0 +1,74 @@
|
||||
#!/bin/bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Generate Buildkite annotation for ROCm wheel release
|
||||
|
||||
set -ex
|
||||
|
||||
# Get build configuration from meta-data
|
||||
# Extract ROCm version dynamically from Dockerfile.rocm_base
|
||||
# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.1-complete -> extracts "7.1"
|
||||
ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown")
|
||||
PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12")
|
||||
PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151")
|
||||
|
||||
# S3 URLs
|
||||
S3_BUCKET="${S3_BUCKET:-vllm-wheels}"
|
||||
S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}"
|
||||
S3_URL="https://${S3_BUCKET}.s3.${S3_REGION}.amazonaws.com"
|
||||
ROCM_PATH="rocm/${BUILDKITE_COMMIT}"
|
||||
|
||||
buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF
|
||||
## :rocm: ROCm Wheel Release
|
||||
|
||||
### Build Configuration
|
||||
| Setting | Value |
|
||||
|---------|-------|
|
||||
| **ROCm Version** | ${ROCM_VERSION} |
|
||||
| **Python Version** | ${PYTHON_VERSION} |
|
||||
| **GPU Architectures** | ${PYTORCH_ROCM_ARCH} |
|
||||
| **Branch** | \`${BUILDKITE_BRANCH}\` |
|
||||
| **Commit** | \`${BUILDKITE_COMMIT}\` |
|
||||
|
||||
### :package: Installation
|
||||
|
||||
**Install from this build (by commit):**
|
||||
\`\`\`bash
|
||||
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/{rocm_variant}/
|
||||
|
||||
# Example:
|
||||
uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/rocm700/
|
||||
\`\`\`
|
||||
|
||||
**Install from nightly (if published):**
|
||||
\`\`\`bash
|
||||
uv pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/
|
||||
\`\`\`
|
||||
|
||||
### :floppy_disk: Download Wheels Directly
|
||||
|
||||
\`\`\`bash
|
||||
# List all ROCm wheels
|
||||
aws s3 ls s3://${S3_BUCKET}/${ROCM_PATH}/
|
||||
|
||||
# Download specific wheels
|
||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/vllm-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torch-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/triton_rocm-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torchvision-*.whl .
|
||||
aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/amdsmi-*.whl .
|
||||
\`\`\`
|
||||
|
||||
### :gear: Included Packages
|
||||
- **vllm**: vLLM with ROCm support
|
||||
- **torch**: PyTorch built for ROCm ${ROCM_VERSION}
|
||||
- **triton_rocm**: Triton built for ROCm
|
||||
- **torchvision**: TorchVision for ROCm PyTorch
|
||||
- **amdsmi**: AMD SMI Python bindings
|
||||
|
||||
### :warning: Notes
|
||||
- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs
|
||||
- Supported GPU architectures: ${PYTORCH_ROCM_ARCH}
|
||||
- Platform: Linux x86_64 only
|
||||
EOF
|
||||
140
.buildkite/scripts/cache-rocm-base-wheels.sh
Executable file
140
.buildkite/scripts/cache-rocm-base-wheels.sh
Executable file
@@ -0,0 +1,140 @@
|
||||
#!/usr/bin/env bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Cache helper for ROCm base wheels
|
||||
#
|
||||
# This script manages caching of pre-built ROCm base wheels (torch, triton, etc.)
|
||||
# to avoid rebuilding them when Dockerfile.rocm_base hasn't changed.
|
||||
#
|
||||
# Usage:
|
||||
# cache-rocm-base-wheels.sh check - Check if cache exists, outputs "hit" or "miss"
|
||||
# cache-rocm-base-wheels.sh upload - Upload wheels to cache
|
||||
# cache-rocm-base-wheels.sh download - Download wheels from cache
|
||||
# cache-rocm-base-wheels.sh key - Output the cache key
|
||||
#
|
||||
# Environment variables:
|
||||
# S3_BUCKET - S3 bucket name (default: vllm-wheels)
|
||||
# PYTHON_VERSION - Python version (affects cache key)
|
||||
# PYTORCH_ROCM_ARCH - GPU architectures (affects cache key)
|
||||
#
|
||||
# Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base,
|
||||
# so changes to ROCm version are captured by the Dockerfile hash.
|
||||
|
||||
set -euo pipefail
|
||||
|
||||
BUCKET="${S3_BUCKET:-vllm-wheels}"
|
||||
DOCKERFILE="docker/Dockerfile.rocm_base"
|
||||
CACHE_PREFIX="rocm/cache"
|
||||
|
||||
# Generate hash from Dockerfile content + build args
|
||||
generate_cache_key() {
|
||||
# Include Dockerfile content
|
||||
if [[ ! -f "$DOCKERFILE" ]]; then
|
||||
echo "ERROR: Dockerfile not found: $DOCKERFILE" >&2
|
||||
exit 1
|
||||
fi
|
||||
local dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16)
|
||||
|
||||
# Include key build args that affect the output
|
||||
# These should match the ARGs in Dockerfile.rocm_base that change the build output
|
||||
# Note: ROCm version is determined by BASE_IMAGE in the Dockerfile, so it's captured by dockerfile_hash
|
||||
local args_string="${PYTHON_VERSION:-}|${PYTORCH_ROCM_ARCH:-}"
|
||||
local args_hash=$(echo "$args_string" | sha256sum | cut -c1-8)
|
||||
|
||||
echo "${dockerfile_hash}-${args_hash}"
|
||||
}
|
||||
|
||||
CACHE_KEY=$(generate_cache_key)
|
||||
CACHE_PATH="s3://${BUCKET}/${CACHE_PREFIX}/${CACHE_KEY}/"
|
||||
|
||||
case "${1:-}" in
|
||||
check)
|
||||
echo "Checking cache for key: ${CACHE_KEY}" >&2
|
||||
echo "Cache path: ${CACHE_PATH}" >&2
|
||||
echo "Variables used in cache key:" >&2
|
||||
echo " PYTHON_VERSION: ${PYTHON_VERSION:-<not set>}" >&2
|
||||
echo " PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH:-<not set>}" >&2
|
||||
|
||||
# Check if cache exists by listing objects
|
||||
# We look for at least one .whl file
|
||||
echo "Running: aws s3 ls ${CACHE_PATH}" >&2
|
||||
S3_OUTPUT=$(aws s3 ls "${CACHE_PATH}" 2>&1) || true
|
||||
echo "S3 ls output:" >&2
|
||||
echo "$S3_OUTPUT" | head -5 >&2
|
||||
|
||||
if echo "$S3_OUTPUT" | grep -q "\.whl"; then
|
||||
echo "hit"
|
||||
else
|
||||
echo "miss"
|
||||
fi
|
||||
;;
|
||||
|
||||
upload)
|
||||
echo "========================================"
|
||||
echo "Uploading wheels to cache"
|
||||
echo "========================================"
|
||||
echo "Cache key: ${CACHE_KEY}"
|
||||
echo "Cache path: ${CACHE_PATH}"
|
||||
echo ""
|
||||
|
||||
if [[ ! -d "artifacts/rocm-base-wheels" ]]; then
|
||||
echo "ERROR: artifacts/rocm-base-wheels directory not found" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
|
||||
if [[ "$WHEEL_COUNT" -eq 0 ]]; then
|
||||
echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2
|
||||
exit 1
|
||||
fi
|
||||
|
||||
echo "Uploading $WHEEL_COUNT wheels..."
|
||||
aws s3 cp --recursive artifacts/rocm-base-wheels/ "${CACHE_PATH}"
|
||||
|
||||
echo ""
|
||||
echo "Cache upload complete!"
|
||||
echo "========================================"
|
||||
;;
|
||||
|
||||
download)
|
||||
echo "========================================"
|
||||
echo "Downloading wheels from cache"
|
||||
echo "========================================"
|
||||
echo "Cache key: ${CACHE_KEY}"
|
||||
echo "Cache path: ${CACHE_PATH}"
|
||||
echo ""
|
||||
|
||||
mkdir -p artifacts/rocm-base-wheels
|
||||
aws s3 cp --recursive "${CACHE_PATH}" artifacts/rocm-base-wheels/
|
||||
|
||||
echo ""
|
||||
echo "Downloaded wheels:"
|
||||
ls -lh artifacts/rocm-base-wheels/
|
||||
|
||||
WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l)
|
||||
echo ""
|
||||
echo "Total: $WHEEL_COUNT wheels"
|
||||
echo "========================================"
|
||||
;;
|
||||
|
||||
key)
|
||||
echo "${CACHE_KEY}"
|
||||
;;
|
||||
|
||||
path)
|
||||
echo "${CACHE_PATH}"
|
||||
;;
|
||||
|
||||
*)
|
||||
echo "Usage: $0 {check|upload|download|key|path}" >&2
|
||||
echo "" >&2
|
||||
echo "Commands:" >&2
|
||||
echo " check - Check if cache exists, outputs 'hit' or 'miss'" >&2
|
||||
echo " upload - Upload wheels from artifacts/rocm-base-wheels/ to cache" >&2
|
||||
echo " download - Download wheels from cache to artifacts/rocm-base-wheels/" >&2
|
||||
echo " key - Output the cache key" >&2
|
||||
echo " path - Output the full S3 cache path" >&2
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
@@ -16,6 +16,18 @@ from urllib.parse import quote
|
||||
|
||||
import regex as re
|
||||
|
||||
|
||||
def normalize_package_name(name: str) -> str:
|
||||
"""
|
||||
Normalize package name according to PEP 503.
|
||||
https://peps.python.org/pep-0503/#normalized-names
|
||||
|
||||
Replace runs of underscores, hyphens, and periods with a single hyphen,
|
||||
and lowercase the result.
|
||||
"""
|
||||
return re.sub(r"[-_.]+", "-", name).lower()
|
||||
|
||||
|
||||
if not sys.version_info >= (3, 12):
|
||||
raise RuntimeError("This script requires Python 3.12 or higher.")
|
||||
|
||||
@@ -78,7 +90,13 @@ def parse_from_filename(file: str) -> WheelFileInfo:
|
||||
version = version.removesuffix("." + variant)
|
||||
else:
|
||||
if "+" in version:
|
||||
version, variant = version.split("+")
|
||||
version_part, suffix = version.split("+", 1)
|
||||
# Only treat known patterns as variants (rocmXXX, cuXXX, cpu)
|
||||
# Git hashes and other suffixes are NOT variants
|
||||
if suffix.startswith(("rocm", "cu", "cpu")):
|
||||
variant = suffix
|
||||
version = version_part
|
||||
# Otherwise keep the full version string (variant stays None)
|
||||
|
||||
return WheelFileInfo(
|
||||
package_name=package_name,
|
||||
@@ -206,6 +224,26 @@ def generate_index_and_metadata(
|
||||
print("No wheel files found, skipping index generation.")
|
||||
return
|
||||
|
||||
# For ROCm builds: inherit variant from vllm wheel
|
||||
# All ROCm wheels should share the same variant as vllm
|
||||
rocm_variant = None
|
||||
for file in parsed_files:
|
||||
if (
|
||||
file.package_name == "vllm"
|
||||
and file.variant
|
||||
and file.variant.startswith("rocm")
|
||||
):
|
||||
rocm_variant = file.variant
|
||||
print(f"Detected ROCm variant from vllm: {rocm_variant}")
|
||||
break
|
||||
|
||||
# Apply ROCm variant to all wheels without a variant
|
||||
if rocm_variant:
|
||||
for file in parsed_files:
|
||||
if file.variant is None:
|
||||
file.variant = rocm_variant
|
||||
print(f"Inherited variant '{rocm_variant}' for {file.filename}")
|
||||
|
||||
# Group by variant
|
||||
variant_to_files: dict[str, list[WheelFileInfo]] = {}
|
||||
for file in parsed_files:
|
||||
@@ -256,8 +294,8 @@ def generate_index_and_metadata(
|
||||
|
||||
variant_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# gather all package names in this variant
|
||||
packages = set(f.package_name for f in files)
|
||||
# gather all package names in this variant (normalized per PEP 503)
|
||||
packages = set(normalize_package_name(f.package_name) for f in files)
|
||||
if variant == "default":
|
||||
# these packages should also appear in the "project list"
|
||||
# generate after all variants are processed
|
||||
@@ -269,8 +307,10 @@ def generate_index_and_metadata(
|
||||
f.write(project_list_str)
|
||||
|
||||
for package in packages:
|
||||
# filter files belonging to this package only
|
||||
package_files = [f for f in files if f.package_name == package]
|
||||
# filter files belonging to this package only (compare normalized names)
|
||||
package_files = [
|
||||
f for f in files if normalize_package_name(f.package_name) == package
|
||||
]
|
||||
package_dir = variant_dir / package
|
||||
package_dir.mkdir(parents=True, exist_ok=True)
|
||||
index_str, metadata_str = generate_package_index_and_metadata(
|
||||
@@ -341,8 +381,13 @@ if __name__ == "__main__":
|
||||
args = parser.parse_args()
|
||||
|
||||
version = args.version
|
||||
if "/" in version or "\\" in version:
|
||||
raise ValueError("Version string must not contain slashes.")
|
||||
# Allow rocm/ prefix, reject other slashes and all backslashes
|
||||
if "\\" in version:
|
||||
raise ValueError("Version string must not contain backslashes.")
|
||||
if "/" in version and not version.startswith("rocm/"):
|
||||
raise ValueError(
|
||||
"Version string must not contain slashes (except for 'rocm/' prefix)."
|
||||
)
|
||||
current_objects_path = Path(args.current_objects)
|
||||
output_dir = Path(args.output_dir)
|
||||
if not output_dir.exists():
|
||||
@@ -393,8 +438,23 @@ if __name__ == "__main__":
|
||||
# Generate index and metadata, assuming wheels and indices are stored as:
|
||||
# s3://vllm-wheels/{wheel_dir}/<wheel files>
|
||||
# s3://vllm-wheels/<anything>/<index files>
|
||||
wheel_dir = args.wheel_dir or version
|
||||
wheel_base_dir = Path(output_dir).parent / wheel_dir.strip().rstrip("/")
|
||||
#
|
||||
# For ROCm builds, version is "rocm/{commit}" and indices are uploaded to:
|
||||
# - rocm/{commit}/ (same as wheels)
|
||||
# - rocm/nightly/
|
||||
# - rocm/{version}/
|
||||
# All these are under the "rocm/" prefix, so relative paths should be
|
||||
# relative to "rocm/", not the bucket root.
|
||||
if args.wheel_dir:
|
||||
# Explicit wheel-dir provided (e.g., for version-specific indices pointing to commit dir)
|
||||
wheel_dir = args.wheel_dir.strip().rstrip("/")
|
||||
elif version.startswith("rocm/"):
|
||||
# For rocm/commit, wheel_base_dir should be just the commit part
|
||||
# so relative path from rocm/0.12.0/rocm710/vllm/ -> ../../../{commit}/
|
||||
wheel_dir = version.split("/", 1)[1]
|
||||
else:
|
||||
wheel_dir = version
|
||||
wheel_base_dir = Path(output_dir).parent / wheel_dir
|
||||
index_base_dir = Path(output_dir)
|
||||
|
||||
generate_index_and_metadata(
|
||||
|
||||
151
.buildkite/scripts/upload-rocm-wheels.sh
Executable file
151
.buildkite/scripts/upload-rocm-wheels.sh
Executable file
@@ -0,0 +1,151 @@
|
||||
#!/usr/bin/env bash
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
#
|
||||
# Upload ROCm wheels to S3 with proper index generation
|
||||
#
|
||||
# Required environment variables:
|
||||
# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY (or IAM role)
|
||||
# S3_BUCKET (default: vllm-wheels)
|
||||
#
|
||||
# S3 path structure:
|
||||
# s3://vllm-wheels/rocm/{commit}/ - All wheels for this commit
|
||||
# s3://vllm-wheels/rocm/nightly/ - Index pointing to latest nightly
|
||||
# s3://vllm-wheels/rocm/{version}/ - Index for release versions
|
||||
|
||||
set -ex
|
||||
|
||||
# ======== Configuration ========
|
||||
BUCKET="${S3_BUCKET:-vllm-wheels}"
|
||||
ROCM_SUBPATH="rocm/${BUILDKITE_COMMIT}"
|
||||
S3_COMMIT_PREFIX="s3://$BUCKET/$ROCM_SUBPATH/"
|
||||
INDICES_OUTPUT_DIR="rocm-indices"
|
||||
PYTHON="${PYTHON_PROG:-python3}"
|
||||
|
||||
# ROCm uses manylinux_2_35 (Ubuntu 22.04 based)
|
||||
MANYLINUX_VERSION="manylinux_2_35"
|
||||
|
||||
echo "========================================"
|
||||
echo "ROCm Wheel Upload Configuration"
|
||||
echo "========================================"
|
||||
echo "S3 Bucket: $BUCKET"
|
||||
echo "S3 Path: $ROCM_SUBPATH"
|
||||
echo "Commit: $BUILDKITE_COMMIT"
|
||||
echo "Branch: $BUILDKITE_BRANCH"
|
||||
echo "========================================"
|
||||
|
||||
# ======== Part 0: Setup Python ========
|
||||
|
||||
# Detect if python3.12+ is available
|
||||
has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)" 2>/dev/null || echo 0)
|
||||
if [[ "$has_new_python" -eq 0 ]]; then
|
||||
# Use new python from docker
|
||||
# Use --user to ensure files are created with correct ownership (not root)
|
||||
docker pull python:3-slim
|
||||
PYTHON="docker run --rm --user $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3"
|
||||
fi
|
||||
|
||||
echo "Using python interpreter: $PYTHON"
|
||||
echo "Python version: $($PYTHON --version)"
|
||||
|
||||
# ======== Part 1: Collect and prepare wheels ========
|
||||
|
||||
# Collect all wheels
|
||||
mkdir -p all-rocm-wheels
|
||||
cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true
|
||||
cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true
|
||||
|
||||
WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l)
|
||||
echo "Total wheels to upload: $WHEEL_COUNT"
|
||||
|
||||
if [ "$WHEEL_COUNT" -eq 0 ]; then
|
||||
echo "ERROR: No wheels found to upload!"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Rename linux to manylinux in wheel filenames
|
||||
for wheel in all-rocm-wheels/*.whl; do
|
||||
if [[ "$wheel" == *"linux"* ]] && [[ "$wheel" != *"manylinux"* ]]; then
|
||||
new_wheel="${wheel/linux/$MANYLINUX_VERSION}"
|
||||
mv -- "$wheel" "$new_wheel"
|
||||
echo "Renamed: $(basename "$wheel") -> $(basename "$new_wheel")"
|
||||
fi
|
||||
done
|
||||
|
||||
echo ""
|
||||
echo "Wheels to upload:"
|
||||
ls -lh all-rocm-wheels/
|
||||
|
||||
# ======== Part 2: Upload wheels to S3 ========
|
||||
|
||||
echo ""
|
||||
echo "Uploading wheels to $S3_COMMIT_PREFIX"
|
||||
for wheel in all-rocm-wheels/*.whl; do
|
||||
aws s3 cp "$wheel" "$S3_COMMIT_PREFIX"
|
||||
done
|
||||
|
||||
# ======== Part 3: Generate and upload indices ========
|
||||
|
||||
# List existing wheels in commit directory
|
||||
echo ""
|
||||
echo "Generating indices..."
|
||||
obj_json="rocm-objects.json"
|
||||
aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ROCM_SUBPATH/" --delimiter / --output json > "$obj_json"
|
||||
|
||||
mkdir -p "$INDICES_OUTPUT_DIR"
|
||||
|
||||
# Use the existing generate-nightly-index.py
|
||||
# HACK: Replace regex module with stdlib re (same as CUDA script)
|
||||
sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py
|
||||
|
||||
$PYTHON .buildkite/scripts/generate-nightly-index.py \
|
||||
--version "$ROCM_SUBPATH" \
|
||||
--current-objects "$obj_json" \
|
||||
--output-dir "$INDICES_OUTPUT_DIR" \
|
||||
--comment "ROCm commit $BUILDKITE_COMMIT"
|
||||
|
||||
# Upload indices to commit directory
|
||||
echo "Uploading indices to $S3_COMMIT_PREFIX"
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX"
|
||||
|
||||
# Update rocm/nightly/ if on main branch and not a PR
|
||||
if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] || [[ "$NIGHTLY" == "1" ]]; then
|
||||
echo "Updating rocm/nightly/ index..."
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/nightly/"
|
||||
fi
|
||||
|
||||
# Extract version from vLLM wheel and update version-specific index
|
||||
VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1)
|
||||
if [ -n "$VLLM_WHEEL" ]; then
|
||||
VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2)
|
||||
echo "Version in wheel: $VERSION"
|
||||
PURE_VERSION="${VERSION%%+*}"
|
||||
PURE_VERSION="${PURE_VERSION%%.rocm}"
|
||||
echo "Pure version: $PURE_VERSION"
|
||||
|
||||
if [[ "$VERSION" != *"dev"* ]]; then
|
||||
echo "Updating rocm/$PURE_VERSION/ index..."
|
||||
aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/$PURE_VERSION/"
|
||||
fi
|
||||
fi
|
||||
|
||||
# ======== Part 4: Summary ========
|
||||
|
||||
echo ""
|
||||
echo "========================================"
|
||||
echo "ROCm Wheel Upload Complete!"
|
||||
echo "========================================"
|
||||
echo ""
|
||||
echo "Wheels available at:"
|
||||
echo " s3://$BUCKET/$ROCM_SUBPATH/"
|
||||
echo ""
|
||||
echo "Install command (by commit):"
|
||||
echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/$ROCM_SUBPATH/"
|
||||
echo ""
|
||||
if [[ "$BUILDKITE_BRANCH" == "main" ]] || [[ "$NIGHTLY" == "1" ]]; then
|
||||
echo "Install command (nightly):"
|
||||
echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/rocm/nightly/"
|
||||
fi
|
||||
echo ""
|
||||
echo "Wheel count: $WHEEL_COUNT"
|
||||
echo "========================================"
|
||||
Reference in New Issue
Block a user