diff --git a/.buildkite/release-pipeline.yaml b/.buildkite/release-pipeline.yaml index cc1ca1807..36a91bb14 100644 --- a/.buildkite/release-pipeline.yaml +++ b/.buildkite/release-pipeline.yaml @@ -214,3 +214,365 @@ steps: env: DOCKER_BUILDKIT: "1" DOCKERHUB_USERNAME: "vllmbot" + + # ============================================================================= + # ROCm Release Pipeline (x86_64 only) + # ============================================================================= + # + # vLLM version is determined by the Buildkite checkout (like CUDA pipeline). + # To build a specific version, trigger the build from that branch/tag. + # + # Environment variables for ROCm builds (set via Buildkite UI or schedule): + # ROCM_PYTHON_VERSION: Python version (default: 3.12) + # PYTORCH_ROCM_ARCH: GPU architectures (default: gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151) + # ROCM_UPLOAD_WHEELS: Upload to S3 (default: false for nightly, true for releases) + # ROCM_FORCE_REBUILD: Force rebuild base wheels, ignore S3 cache (default: false) + # + # Note: ROCm version is determined by BASE_IMAGE in docker/Dockerfile.rocm_base + # (currently rocm/dev-ubuntu-22.04:7.1-complete) + # + # ============================================================================= + + # ROCm Input Step - Collect build configuration (manual trigger only) + - input: "ROCm Wheel Release Build Configuration" + key: input-rocm-config + depends_on: ~ + if: build.source == "ui" + fields: + - text: "Python Version" + key: "rocm-python-version" + default: "3.12" + hint: "Python version (e.g., 3.12)" + - text: "GPU Architectures" + key: "rocm-pytorch-rocm-arch" + default: "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151" + hint: "Semicolon-separated GPU architectures" + - select: "Upload Wheels to S3" + key: "rocm-upload-wheels" + default: "true" + options: + - label: "No - Build only (nightly/dev)" + value: "false" + - label: "Yes - Upload to S3 (release)" + 
value: "true" + - select: "Force Rebuild Base Wheels" + key: "rocm-force-rebuild" + default: "false" + hint: "Ignore S3 cache and rebuild base wheels from scratch" + options: + - label: "No - Use cached wheels if available" + value: "false" + - label: "Yes - Rebuild even if cache exists" + value: "true" + + # ROCm Job 1: Build ROCm Base Wheels (with S3 caching) + - label: ":rocm: Build ROCm Base Wheels" + id: build-rocm-base-wheels + depends_on: + - step: input-rocm-config + allow_failure: true # Allow failure so non-UI builds can proceed (input step is skipped) + agents: + queue: cpu_queue_postmerge + commands: + # Set configuration and check cache + - | + set -euo pipefail + + # Get values from meta-data (set by input step) or use defaults + PYTHON_VERSION="$$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo '')" + export PYTHON_VERSION="$${PYTHON_VERSION:-3.12}" + + PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')" + export PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}" + + # Check for force rebuild flag + ROCM_FORCE_REBUILD="$${ROCM_FORCE_REBUILD:-}" + if [ -z "$${ROCM_FORCE_REBUILD}" ]; then + ROCM_FORCE_REBUILD="$$(buildkite-agent meta-data get rocm-force-rebuild 2>/dev/null || echo '')" + fi + + echo "========================================" + echo "ROCm Base Wheels Build Configuration" + echo "========================================" + echo " PYTHON_VERSION: $${PYTHON_VERSION}" + echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}" + echo " ROCM_FORCE_REBUILD: $${ROCM_FORCE_REBUILD:-false}" + echo "========================================" + + # Save resolved config for later jobs + buildkite-agent meta-data set "rocm-python-version" "$${PYTHON_VERSION}" + buildkite-agent meta-data set "rocm-pytorch-rocm-arch" "$${PYTORCH_ROCM_ARCH}" + + # Check S3 cache for pre-built wheels + 
CACHE_KEY=$$(.buildkite/scripts/cache-rocm-base-wheels.sh key) + CACHE_PATH=$$(.buildkite/scripts/cache-rocm-base-wheels.sh path) + echo "" + echo "Cache key: $${CACHE_KEY}" + echo "Cache path: $${CACHE_PATH}" + + # Save cache key for downstream jobs + buildkite-agent meta-data set "rocm-cache-key" "$${CACHE_KEY}" + + CACHE_STATUS="miss" + if [ "$${ROCM_FORCE_REBUILD}" != "true" ]; then + CACHE_STATUS=$$(.buildkite/scripts/cache-rocm-base-wheels.sh check) + else + echo "Force rebuild requested, skipping cache check" + fi + + if [ "$${CACHE_STATUS}" = "hit" ]; then + echo "" + echo "CACHE HIT! Downloading pre-built wheels..." + echo "" + .buildkite/scripts/cache-rocm-base-wheels.sh download + + # Set the S3 path for the cached Docker image (for Job 2 to download) + S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}" + buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz" + + # Mark that we used cache (for Docker image handling) + buildkite-agent meta-data set "rocm-used-cache" "true" + + echo "" + echo "Cache download complete. Skipping Docker build." + echo "Docker image will be downloaded from: $${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz" + else + echo "" + echo "CACHE MISS. Building from scratch..." + echo "" + + # Build full base image (for later vLLM build) + DOCKER_BUILDKIT=1 docker buildx build \ + --file docker/Dockerfile.rocm_base \ + --tag rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} \ + --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \ + --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \ + --build-arg USE_SCCACHE=1 \ + --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ + --build-arg SCCACHE_REGION_NAME=us-west-2 \ + --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \ + --load \ + . 
+ + # Build debs_wheel_release stage for wheel extraction + DOCKER_BUILDKIT=1 docker buildx build \ + --file docker/Dockerfile.rocm_base \ + --tag rocm-base-debs:$${BUILDKITE_BUILD_NUMBER} \ + --target debs_wheel_release \ + --build-arg PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \ + --build-arg PYTHON_VERSION="$${PYTHON_VERSION}" \ + --build-arg USE_SCCACHE=1 \ + --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ + --build-arg SCCACHE_REGION_NAME=us-west-2 \ + --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \ + --load \ + . + + # Extract wheels from Docker image + mkdir -p artifacts/rocm-base-wheels + container_id=$$(docker create rocm-base-debs:$${BUILDKITE_BUILD_NUMBER}) + docker cp $${container_id}:/app/debs/. artifacts/rocm-base-wheels/ + docker rm $${container_id} + echo "Extracted base wheels:" + ls -lh artifacts/rocm-base-wheels/ + + # Upload wheels to S3 cache for future builds + echo "" + echo "Uploading wheels to S3 cache..." + .buildkite/scripts/cache-rocm-base-wheels.sh upload + + # Export base Docker image for reuse in vLLM build + mkdir -p artifacts/rocm-docker-image + docker save rocm/vllm-dev:base-$${BUILDKITE_BUILD_NUMBER} | gzip > artifacts/rocm-docker-image/rocm-base-image.tar.gz + echo "Docker image size:" + ls -lh artifacts/rocm-docker-image/ + + # Upload large Docker image to S3 (also cached by cache key) + S3_ARTIFACT_PATH="s3://$${S3_BUCKET}/rocm/cache/$${CACHE_KEY}" + echo "Uploading Docker image to $${S3_ARTIFACT_PATH}/" + aws s3 cp artifacts/rocm-docker-image/rocm-base-image.tar.gz "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz" + + # Save the S3 path for downstream jobs + buildkite-agent meta-data set "rocm-docker-image-s3-path" "$${S3_ARTIFACT_PATH}/rocm-base-image.tar.gz" + + # Mark that we did NOT use cache + buildkite-agent meta-data set "rocm-used-cache" "false" + + echo "" + echo "Build complete. Wheels cached for future builds." 
+ fi + artifact_paths: + - "artifacts/rocm-base-wheels/*.whl" + env: + DOCKER_BUILDKIT: "1" + S3_BUCKET: "vllm-wheels" + + # ROCm Job 2: Build vLLM ROCm Wheel + - label: ":python: Build vLLM ROCm Wheel" + id: build-rocm-vllm-wheel + depends_on: + - step: build-rocm-base-wheels + allow_failure: false + agents: + queue: cpu_queue_postmerge + timeout_in_minutes: 180 + commands: + # Download artifacts and prepare Docker image + - | + set -euo pipefail + + # Ensure git tags are up-to-date (Buildkite's default fetch doesn't update tags) + # This fixes version detection when tags are moved/force-pushed + echo "Fetching latest tags from origin..." + git fetch --tags --force origin + + # Log tag information for debugging version detection + echo "========================================" + echo "Git Tag Verification" + echo "========================================" + echo "Current HEAD: $(git rev-parse HEAD)" + echo "git describe --tags: $(git describe --tags 2>/dev/null || echo 'No tags found')" + echo "" + echo "Recent tags (pointing to commits near HEAD):" + git tag -l --sort=-creatordate | head -5 + echo "setuptools_scm version detection:" + pip install -q setuptools_scm 2>/dev/null || true + python3 -c "import setuptools_scm; print(' Detected version:', setuptools_scm.get_version())" 2>/dev/null || echo " (setuptools_scm not available in this environment)" + echo "========================================" + + # Download wheel artifacts from current build + echo "Downloading wheel artifacts from current build" + buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" . 
+ + # Download Docker image from S3 (too large for Buildkite artifacts) + DOCKER_IMAGE_S3_PATH="$$(buildkite-agent meta-data get rocm-docker-image-s3-path 2>/dev/null || echo '')" + if [ -z "$${DOCKER_IMAGE_S3_PATH}" ]; then + echo "ERROR: rocm-docker-image-s3-path metadata not found" + echo "This should have been set by the build-rocm-base-wheels job" + exit 1 + fi + echo "Downloading Docker image from $${DOCKER_IMAGE_S3_PATH}" + mkdir -p artifacts/rocm-docker-image + aws s3 cp "$${DOCKER_IMAGE_S3_PATH}" artifacts/rocm-docker-image/rocm-base-image.tar.gz + + # Load base Docker image and capture the tag + echo "Loading base Docker image..." + LOAD_OUTPUT=$$(gunzip -c artifacts/rocm-docker-image/rocm-base-image.tar.gz | docker load) + echo "$${LOAD_OUTPUT}" + # Extract the actual loaded image tag from "Loaded image: " output + # This avoids picking up stale images (like rocm/vllm-dev:nightly) already on the agent + BASE_IMAGE_TAG=$$(echo "$${LOAD_OUTPUT}" | grep "Loaded image:" | sed 's/Loaded image: //') + if [ -z "$${BASE_IMAGE_TAG}" ]; then + echo "ERROR: Failed to extract image tag from docker load output" + echo "Load output was: $${LOAD_OUTPUT}" + exit 1 + fi + echo "Loaded base image: $${BASE_IMAGE_TAG}" + + # Prepare base wheels for Docker build context + mkdir -p docker/context/base-wheels + touch docker/context/base-wheels/.keep + cp artifacts/rocm-base-wheels/*.whl docker/context/base-wheels/ + echo "Base wheels for vLLM build:" + ls -lh docker/context/base-wheels/ + + # Get GPU architectures from meta-data + PYTORCH_ROCM_ARCH="$$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo '')" + PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH:-gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151}" + + echo "========================================" + echo "Building vLLM wheel with:" + echo " BUILDKITE_COMMIT: $${BUILDKITE_COMMIT}" + echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}" + echo " PYTORCH_ROCM_ARCH: $${PYTORCH_ROCM_ARCH}" + 
echo " BASE_IMAGE: $${BASE_IMAGE_TAG}" + echo "========================================" + + # Build vLLM wheel using local checkout (REMOTE_VLLM=0) + DOCKER_BUILDKIT=1 docker build \ + --file docker/Dockerfile.rocm \ + --target export_vllm_wheel_release \ + --output type=local,dest=rocm-dist \ + --build-arg BASE_IMAGE="$${BASE_IMAGE_TAG}" \ + --build-arg ARG_PYTORCH_ROCM_ARCH="$${PYTORCH_ROCM_ARCH}" \ + --build-arg REMOTE_VLLM=0 \ + --build-arg GIT_REPO_CHECK=1 \ + --build-arg USE_SCCACHE=1 \ + --build-arg SCCACHE_BUCKET_NAME=vllm-build-sccache \ + --build-arg SCCACHE_REGION_NAME=us-west-2 \ + --build-arg SCCACHE_S3_NO_CREDENTIALS=0 \ + . + + echo "Built vLLM wheel:" + ls -lh rocm-dist/*.whl + + # Copy wheel to artifacts directory + mkdir -p artifacts/rocm-vllm-wheel + cp rocm-dist/*.whl artifacts/rocm-vllm-wheel/ + echo "Final vLLM wheel:" + ls -lh artifacts/rocm-vllm-wheel/ + artifact_paths: + - "artifacts/rocm-vllm-wheel/*.whl" + env: + DOCKER_BUILDKIT: "1" + S3_BUCKET: "vllm-wheels" + + # ROCm Job 3: Upload Wheels to S3 + - label: ":s3: Upload ROCm Wheels to S3" + id: upload-rocm-wheels + depends_on: + - step: build-rocm-vllm-wheel + allow_failure: false + agents: + queue: cpu_queue_postmerge + timeout_in_minutes: 60 + commands: + # Download all wheel artifacts and run upload + - | + set -euo pipefail + + # Check if upload is enabled (from env var, meta-data, or release branch) + ROCM_UPLOAD_WHEELS="$${ROCM_UPLOAD_WHEELS:-}" + if [ -z "$${ROCM_UPLOAD_WHEELS}" ]; then + # Try to get from meta-data (input form) + ROCM_UPLOAD_WHEELS="$$(buildkite-agent meta-data get rocm-upload-wheels 2>/dev/null || echo '')" + fi + + echo "========================================" + echo "Upload check:" + echo " ROCM_UPLOAD_WHEELS: $${ROCM_UPLOAD_WHEELS}" + echo " BUILDKITE_BRANCH: $${BUILDKITE_BRANCH}" + echo "========================================" + + # Skip upload if not enabled + if [ "$${ROCM_UPLOAD_WHEELS}" != "true" ]; then + echo "Skipping S3 upload 
(ROCM_UPLOAD_WHEELS != true, NIGHTLY != 1, not a release branch)" + echo "To enable upload, set 'Upload Wheels to S3' to 'Yes' in the build configuration" + exit 0 + fi + + echo "Upload enabled, proceeding..." + + # Download artifacts from current build + echo "Downloading artifacts from current build" + buildkite-agent artifact download "artifacts/rocm-base-wheels/*.whl" . + buildkite-agent artifact download "artifacts/rocm-vllm-wheel/*.whl" . + + # Run upload script + bash .buildkite/scripts/upload-rocm-wheels.sh + env: + DOCKER_BUILDKIT: "1" + S3_BUCKET: "vllm-wheels" + + # ROCm Job 4: Annotate ROCm Wheel Release + - label: ":memo: Annotate ROCm wheel release" + id: annotate-rocm-release + depends_on: + - step: upload-rocm-wheels + allow_failure: true + agents: + queue: cpu_queue_postmerge + commands: + - "bash .buildkite/scripts/annotate-rocm-release.sh" + env: + S3_BUCKET: "vllm-wheels" diff --git a/.buildkite/scripts/annotate-rocm-release.sh b/.buildkite/scripts/annotate-rocm-release.sh new file mode 100755 index 000000000..fcc7c290e --- /dev/null +++ b/.buildkite/scripts/annotate-rocm-release.sh @@ -0,0 +1,74 @@ +#!/bin/bash +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Generate Buildkite annotation for ROCm wheel release + +set -ex + +# Get build configuration from meta-data +# Extract ROCm version dynamically from Dockerfile.rocm_base +# BASE_IMAGE format: rocm/dev-ubuntu-22.04:7.1-complete -> extracts "7.1" +ROCM_VERSION=$(grep -E '^ARG BASE_IMAGE=' docker/Dockerfile.rocm_base | sed -E 's/.*:([0-9]+\.[0-9]+).*/\1/' || echo "unknown") +PYTHON_VERSION=$(buildkite-agent meta-data get rocm-python-version 2>/dev/null || echo "3.12") +PYTORCH_ROCM_ARCH=$(buildkite-agent meta-data get rocm-pytorch-rocm-arch 2>/dev/null || echo "gfx90a;gfx942;gfx950;gfx1100;gfx1101;gfx1200;gfx1201;gfx1150;gfx1151") + +# S3 URLs +S3_BUCKET="${S3_BUCKET:-vllm-wheels}" +S3_REGION="${AWS_DEFAULT_REGION:-us-west-2}" 
+S3_URL="https://${S3_BUCKET}.s3.${S3_REGION}.amazonaws.com" +ROCM_PATH="rocm/${BUILDKITE_COMMIT}" + +buildkite-agent annotate --style 'success' --context 'rocm-release-workflow' << EOF +## :rocm: ROCm Wheel Release + +### Build Configuration +| Setting | Value | +|---------|-------| +| **ROCm Version** | ${ROCM_VERSION} | +| **Python Version** | ${PYTHON_VERSION} | +| **GPU Architectures** | ${PYTORCH_ROCM_ARCH} | +| **Branch** | \`${BUILDKITE_BRANCH}\` | +| **Commit** | \`${BUILDKITE_COMMIT}\` | + +### :package: Installation + +**Install from this build (by commit):** +\`\`\`bash +uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/{rocm_variant}/ + +# Example: +uv pip install vllm --extra-index-url ${S3_URL}/${ROCM_PATH}/rocm700/ +\`\`\` + +**Install from nightly (if published):** +\`\`\`bash +uv pip install vllm --extra-index-url ${S3_URL}/rocm/nightly/ +\`\`\` + +### :floppy_disk: Download Wheels Directly + +\`\`\`bash +# List all ROCm wheels +aws s3 ls s3://${S3_BUCKET}/${ROCM_PATH}/ + +# Download specific wheels +aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/vllm-*.whl . +aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torch-*.whl . +aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/triton_rocm-*.whl . +aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/torchvision-*.whl . +aws s3 cp s3://${S3_BUCKET}/${ROCM_PATH}/amdsmi-*.whl . 
+\`\`\` + +### :gear: Included Packages +- **vllm**: vLLM with ROCm support +- **torch**: PyTorch built for ROCm ${ROCM_VERSION} +- **triton_rocm**: Triton built for ROCm +- **torchvision**: TorchVision for ROCm PyTorch +- **amdsmi**: AMD SMI Python bindings + +### :warning: Notes +- These wheels are built for **ROCm ${ROCM_VERSION}** and will NOT work with CUDA GPUs +- Supported GPU architectures: ${PYTORCH_ROCM_ARCH} +- Platform: Linux x86_64 only +EOF diff --git a/.buildkite/scripts/cache-rocm-base-wheels.sh b/.buildkite/scripts/cache-rocm-base-wheels.sh new file mode 100755 index 000000000..be2447250 --- /dev/null +++ b/.buildkite/scripts/cache-rocm-base-wheels.sh @@ -0,0 +1,140 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Cache helper for ROCm base wheels +# +# This script manages caching of pre-built ROCm base wheels (torch, triton, etc.) +# to avoid rebuilding them when Dockerfile.rocm_base hasn't changed. +# +# Usage: +# cache-rocm-base-wheels.sh check - Check if cache exists, outputs "hit" or "miss" +# cache-rocm-base-wheels.sh upload - Upload wheels to cache +# cache-rocm-base-wheels.sh download - Download wheels from cache +# cache-rocm-base-wheels.sh key - Output the cache key +# +# Environment variables: +# S3_BUCKET - S3 bucket name (default: vllm-wheels) +# PYTHON_VERSION - Python version (affects cache key) +# PYTORCH_ROCM_ARCH - GPU architectures (affects cache key) +# +# Note: ROCm version is determined by BASE_IMAGE in Dockerfile.rocm_base, +# so changes to ROCm version are captured by the Dockerfile hash. + +set -euo pipefail + +BUCKET="${S3_BUCKET:-vllm-wheels}" +DOCKERFILE="docker/Dockerfile.rocm_base" +CACHE_PREFIX="rocm/cache" + +# Generate hash from Dockerfile content + build args +generate_cache_key() { + # Include Dockerfile content + if [[ ! 
-f "$DOCKERFILE" ]]; then + echo "ERROR: Dockerfile not found: $DOCKERFILE" >&2 + exit 1 + fi + local dockerfile_hash=$(sha256sum "$DOCKERFILE" | cut -c1-16) + + # Include key build args that affect the output + # These should match the ARGs in Dockerfile.rocm_base that change the build output + # Note: ROCm version is determined by BASE_IMAGE in the Dockerfile, so it's captured by dockerfile_hash + local args_string="${PYTHON_VERSION:-}|${PYTORCH_ROCM_ARCH:-}" + local args_hash=$(echo "$args_string" | sha256sum | cut -c1-8) + + echo "${dockerfile_hash}-${args_hash}" +} + +CACHE_KEY=$(generate_cache_key) +CACHE_PATH="s3://${BUCKET}/${CACHE_PREFIX}/${CACHE_KEY}/" + +case "${1:-}" in + check) + echo "Checking cache for key: ${CACHE_KEY}" >&2 + echo "Cache path: ${CACHE_PATH}" >&2 + echo "Variables used in cache key:" >&2 + echo " PYTHON_VERSION: ${PYTHON_VERSION:-}" >&2 + echo " PYTORCH_ROCM_ARCH: ${PYTORCH_ROCM_ARCH:-}" >&2 + + # Check if cache exists by listing objects + # We look for at least one .whl file + echo "Running: aws s3 ls ${CACHE_PATH}" >&2 + S3_OUTPUT=$(aws s3 ls "${CACHE_PATH}" 2>&1) || true + echo "S3 ls output:" >&2 + echo "$S3_OUTPUT" | head -5 >&2 + + if echo "$S3_OUTPUT" | grep -q "\.whl"; then + echo "hit" + else + echo "miss" + fi + ;; + + upload) + echo "========================================" + echo "Uploading wheels to cache" + echo "========================================" + echo "Cache key: ${CACHE_KEY}" + echo "Cache path: ${CACHE_PATH}" + echo "" + + if [[ ! -d "artifacts/rocm-base-wheels" ]]; then + echo "ERROR: artifacts/rocm-base-wheels directory not found" >&2 + exit 1 + fi + + WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l) + if [[ "$WHEEL_COUNT" -eq 0 ]]; then + echo "ERROR: No wheels found in artifacts/rocm-base-wheels/" >&2 + exit 1 + fi + + echo "Uploading $WHEEL_COUNT wheels..." + aws s3 cp --recursive artifacts/rocm-base-wheels/ "${CACHE_PATH}" + + echo "" + echo "Cache upload complete!" 
+ echo "========================================" + ;; + + download) + echo "========================================" + echo "Downloading wheels from cache" + echo "========================================" + echo "Cache key: ${CACHE_KEY}" + echo "Cache path: ${CACHE_PATH}" + echo "" + + mkdir -p artifacts/rocm-base-wheels + aws s3 cp --recursive "${CACHE_PATH}" artifacts/rocm-base-wheels/ + + echo "" + echo "Downloaded wheels:" + ls -lh artifacts/rocm-base-wheels/ + + WHEEL_COUNT=$(ls artifacts/rocm-base-wheels/*.whl 2>/dev/null | wc -l) + echo "" + echo "Total: $WHEEL_COUNT wheels" + echo "========================================" + ;; + + key) + echo "${CACHE_KEY}" + ;; + + path) + echo "${CACHE_PATH}" + ;; + + *) + echo "Usage: $0 {check|upload|download|key|path}" >&2 + echo "" >&2 + echo "Commands:" >&2 + echo " check - Check if cache exists, outputs 'hit' or 'miss'" >&2 + echo " upload - Upload wheels from artifacts/rocm-base-wheels/ to cache" >&2 + echo " download - Download wheels from cache to artifacts/rocm-base-wheels/" >&2 + echo " key - Output the cache key" >&2 + echo " path - Output the full S3 cache path" >&2 + exit 1 + ;; +esac diff --git a/.buildkite/scripts/generate-nightly-index.py b/.buildkite/scripts/generate-nightly-index.py index 1794df947..2eb421140 100644 --- a/.buildkite/scripts/generate-nightly-index.py +++ b/.buildkite/scripts/generate-nightly-index.py @@ -16,6 +16,18 @@ from urllib.parse import quote import regex as re + +def normalize_package_name(name: str) -> str: + """ + Normalize package name according to PEP 503. + https://peps.python.org/pep-0503/#normalized-names + + Replace runs of underscores, hyphens, and periods with a single hyphen, + and lowercase the result. + """ + return re.sub(r"[-_.]+", "-", name).lower() + + if not sys.version_info >= (3, 12): raise RuntimeError("This script requires Python 3.12 or higher.") @@ -78,7 +90,13 @@ def parse_from_filename(file: str) -> WheelFileInfo: version = version.removesuffix("." 
+ variant) else: if "+" in version: - version, variant = version.split("+") + version_part, suffix = version.split("+", 1) + # Only treat known patterns as variants (rocmXXX, cuXXX, cpu) + # Git hashes and other suffixes are NOT variants + if suffix.startswith(("rocm", "cu", "cpu")): + variant = suffix + version = version_part + # Otherwise keep the full version string (variant stays None) return WheelFileInfo( package_name=package_name, @@ -206,6 +224,26 @@ def generate_index_and_metadata( print("No wheel files found, skipping index generation.") return + # For ROCm builds: inherit variant from vllm wheel + # All ROCm wheels should share the same variant as vllm + rocm_variant = None + for file in parsed_files: + if ( + file.package_name == "vllm" + and file.variant + and file.variant.startswith("rocm") + ): + rocm_variant = file.variant + print(f"Detected ROCm variant from vllm: {rocm_variant}") + break + + # Apply ROCm variant to all wheels without a variant + if rocm_variant: + for file in parsed_files: + if file.variant is None: + file.variant = rocm_variant + print(f"Inherited variant '{rocm_variant}' for {file.filename}") + # Group by variant variant_to_files: dict[str, list[WheelFileInfo]] = {} for file in parsed_files: @@ -256,8 +294,8 @@ def generate_index_and_metadata( variant_dir.mkdir(parents=True, exist_ok=True) - # gather all package names in this variant - packages = set(f.package_name for f in files) + # gather all package names in this variant (normalized per PEP 503) + packages = set(normalize_package_name(f.package_name) for f in files) if variant == "default": # these packages should also appear in the "project list" # generate after all variants are processed @@ -269,8 +307,10 @@ def generate_index_and_metadata( f.write(project_list_str) for package in packages: - # filter files belonging to this package only - package_files = [f for f in files if f.package_name == package] + # filter files belonging to this package only (compare normalized 
names) + package_files = [ + f for f in files if normalize_package_name(f.package_name) == package + ] package_dir = variant_dir / package package_dir.mkdir(parents=True, exist_ok=True) index_str, metadata_str = generate_package_index_and_metadata( @@ -341,8 +381,13 @@ if __name__ == "__main__": args = parser.parse_args() version = args.version - if "/" in version or "\\" in version: - raise ValueError("Version string must not contain slashes.") + # Allow rocm/ prefix, reject other slashes and all backslashes + if "\\" in version: + raise ValueError("Version string must not contain backslashes.") + if "/" in version and not version.startswith("rocm/"): + raise ValueError( + "Version string must not contain slashes (except for 'rocm/' prefix)." + ) current_objects_path = Path(args.current_objects) output_dir = Path(args.output_dir) if not output_dir.exists(): @@ -393,8 +438,23 @@ if __name__ == "__main__": # Generate index and metadata, assuming wheels and indices are stored as: # s3://vllm-wheels/{wheel_dir}/ # s3://vllm-wheels// - wheel_dir = args.wheel_dir or version - wheel_base_dir = Path(output_dir).parent / wheel_dir.strip().rstrip("/") + # + # For ROCm builds, version is "rocm/{commit}" and indices are uploaded to: + # - rocm/{commit}/ (same as wheels) + # - rocm/nightly/ + # - rocm/{version}/ + # All these are under the "rocm/" prefix, so relative paths should be + # relative to "rocm/", not the bucket root. 
+ if args.wheel_dir: + # Explicit wheel-dir provided (e.g., for version-specific indices pointing to commit dir) + wheel_dir = args.wheel_dir.strip().rstrip("/") + elif version.startswith("rocm/"): + # For rocm/commit, wheel_base_dir should be just the commit part + # so relative path from rocm/0.12.0/rocm710/vllm/ -> ../../../{commit}/ + wheel_dir = version.split("/", 1)[1] + else: + wheel_dir = version + wheel_base_dir = Path(output_dir).parent / wheel_dir index_base_dir = Path(output_dir) generate_index_and_metadata( diff --git a/.buildkite/scripts/upload-rocm-wheels.sh b/.buildkite/scripts/upload-rocm-wheels.sh new file mode 100755 index 000000000..bb555bc84 --- /dev/null +++ b/.buildkite/scripts/upload-rocm-wheels.sh @@ -0,0 +1,151 @@ +#!/usr/bin/env bash +# SPDX-License-Identifier: Apache-2.0 +# SPDX-FileCopyrightText: Copyright contributors to the vLLM project +# +# Upload ROCm wheels to S3 with proper index generation +# +# Required environment variables: +# AWS_ACCESS_KEY_ID, AWS_SECRET_ACCESS_KEY (or IAM role) +# S3_BUCKET (default: vllm-wheels) +# +# S3 path structure: +# s3://vllm-wheels/rocm/{commit}/ - All wheels for this commit +# s3://vllm-wheels/rocm/nightly/ - Index pointing to latest nightly +# s3://vllm-wheels/rocm/{version}/ - Index for release versions + +set -ex + +# ======== Configuration ======== +BUCKET="${S3_BUCKET:-vllm-wheels}" +ROCM_SUBPATH="rocm/${BUILDKITE_COMMIT}" +S3_COMMIT_PREFIX="s3://$BUCKET/$ROCM_SUBPATH/" +INDICES_OUTPUT_DIR="rocm-indices" +PYTHON="${PYTHON_PROG:-python3}" + +# ROCm uses manylinux_2_35 (Ubuntu 22.04 based) +MANYLINUX_VERSION="manylinux_2_35" + +echo "========================================" +echo "ROCm Wheel Upload Configuration" +echo "========================================" +echo "S3 Bucket: $BUCKET" +echo "S3 Path: $ROCM_SUBPATH" +echo "Commit: $BUILDKITE_COMMIT" +echo "Branch: $BUILDKITE_BRANCH" +echo "========================================" + +# ======== Part 0: Setup Python ======== + +# Detect if 
python3.12+ is available +has_new_python=$($PYTHON -c "print(1 if __import__('sys').version_info >= (3,12) else 0)" 2>/dev/null || echo 0) +if [[ "$has_new_python" -eq 0 ]]; then + # Use new python from docker + # Use --user to ensure files are created with correct ownership (not root) + docker pull python:3-slim + PYTHON="docker run --rm --user $(id -u):$(id -g) -v $(pwd):/app -w /app python:3-slim python3" +fi + +echo "Using python interpreter: $PYTHON" +echo "Python version: $($PYTHON --version)" + +# ======== Part 1: Collect and prepare wheels ======== + +# Collect all wheels +mkdir -p all-rocm-wheels +cp artifacts/rocm-base-wheels/*.whl all-rocm-wheels/ 2>/dev/null || true +cp artifacts/rocm-vllm-wheel/*.whl all-rocm-wheels/ 2>/dev/null || true + +WHEEL_COUNT=$(ls all-rocm-wheels/*.whl 2>/dev/null | wc -l) +echo "Total wheels to upload: $WHEEL_COUNT" + +if [ "$WHEEL_COUNT" -eq 0 ]; then + echo "ERROR: No wheels found to upload!" + exit 1 +fi + +# Rename linux to manylinux in wheel filenames +for wheel in all-rocm-wheels/*.whl; do + if [[ "$wheel" == *"linux"* ]] && [[ "$wheel" != *"manylinux"* ]]; then + new_wheel="${wheel/linux/$MANYLINUX_VERSION}" + mv -- "$wheel" "$new_wheel" + echo "Renamed: $(basename "$wheel") -> $(basename "$new_wheel")" + fi +done + +echo "" +echo "Wheels to upload:" +ls -lh all-rocm-wheels/ + +# ======== Part 2: Upload wheels to S3 ======== + +echo "" +echo "Uploading wheels to $S3_COMMIT_PREFIX" +for wheel in all-rocm-wheels/*.whl; do + aws s3 cp "$wheel" "$S3_COMMIT_PREFIX" +done + +# ======== Part 3: Generate and upload indices ======== + +# List existing wheels in commit directory +echo "" +echo "Generating indices..." 
+obj_json="rocm-objects.json" +aws s3api list-objects-v2 --bucket "$BUCKET" --prefix "$ROCM_SUBPATH/" --delimiter / --output json > "$obj_json" + +mkdir -p "$INDICES_OUTPUT_DIR" + +# Use the existing generate-nightly-index.py +# HACK: Replace regex module with stdlib re (same as CUDA script) +sed -i 's/import regex as re/import re/g' .buildkite/scripts/generate-nightly-index.py + +$PYTHON .buildkite/scripts/generate-nightly-index.py \ + --version "$ROCM_SUBPATH" \ + --current-objects "$obj_json" \ + --output-dir "$INDICES_OUTPUT_DIR" \ + --comment "ROCm commit $BUILDKITE_COMMIT" + +# Upload indices to commit directory +echo "Uploading indices to $S3_COMMIT_PREFIX" +aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "$S3_COMMIT_PREFIX" + +# Update rocm/nightly/ if on main branch and not a PR +if [[ "$BUILDKITE_BRANCH" == "main" && "$BUILDKITE_PULL_REQUEST" == "false" ]] || [[ "$NIGHTLY" == "1" ]]; then + echo "Updating rocm/nightly/ index..." + aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/nightly/" +fi + +# Extract version from vLLM wheel and update version-specific index +VLLM_WHEEL=$(ls all-rocm-wheels/vllm*.whl 2>/dev/null | head -1) +if [ -n "$VLLM_WHEEL" ]; then + VERSION=$(unzip -p "$VLLM_WHEEL" '**/METADATA' | grep '^Version: ' | cut -d' ' -f2) + echo "Version in wheel: $VERSION" + PURE_VERSION="${VERSION%%+*}" + PURE_VERSION="${PURE_VERSION%%.rocm}" + echo "Pure version: $PURE_VERSION" + + if [[ "$VERSION" != *"dev"* ]]; then + echo "Updating rocm/$PURE_VERSION/ index..." + aws s3 cp --recursive "$INDICES_OUTPUT_DIR/" "s3://$BUCKET/rocm/$PURE_VERSION/" + fi +fi + +# ======== Part 4: Summary ======== + +echo "" +echo "========================================" +echo "ROCm Wheel Upload Complete!" 
+echo "========================================" +echo "" +echo "Wheels available at:" +echo " s3://$BUCKET/$ROCM_SUBPATH/" +echo "" +echo "Install command (by commit):" +echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/$ROCM_SUBPATH/" +echo "" +if [[ "$BUILDKITE_BRANCH" == "main" ]] || [[ "$NIGHTLY" == "1" ]]; then + echo "Install command (nightly):" + echo " pip install vllm --extra-index-url https://${BUCKET}.s3.amazonaws.com/rocm/nightly/" +fi +echo "" +echo "Wheel count: $WHEEL_COUNT" +echo "========================================" diff --git a/docker/Dockerfile.rocm b/docker/Dockerfile.rocm index 9adcf8c3a..475ae8282 100644 --- a/docker/Dockerfile.rocm +++ b/docker/Dockerfile.rocm @@ -3,6 +3,14 @@ ARG REMOTE_VLLM="0" ARG COMMON_WORKDIR=/app ARG BASE_IMAGE=rocm/vllm-dev:base +# Sccache configuration (only used in release pipeline) +ARG USE_SCCACHE +ARG SCCACHE_DOWNLOAD_URL +ARG SCCACHE_ENDPOINT +ARG SCCACHE_BUCKET_NAME=vllm-build-sccache +ARG SCCACHE_REGION_NAME=us-west-2 +ARG SCCACHE_S3_NO_CREDENTIALS=0 + FROM ${BASE_IMAGE} AS base ARG ARG_PYTORCH_ROCM_ARCH @@ -14,9 +22,14 @@ ENV RAY_EXPERIMENTAL_NOSET_HIP_VISIBLE_DEVICES=1 RUN apt-get update -q -y && apt-get install -q -y \ sqlite3 libsqlite3-dev libfmt-dev libmsgpack-dev libsuitesparse-dev \ apt-transport-https ca-certificates wget curl -# Remove sccache RUN python3 -m pip install --upgrade pip -RUN apt-get purge -y sccache; python3 -m pip uninstall -y sccache; rm -f "$(which sccache)" +# Remove sccache only if not using sccache (it exists in base image from Dockerfile.rocm_base) +ARG USE_SCCACHE +RUN if [ "$USE_SCCACHE" != "1" ]; then \ + apt-get purge -y sccache || true; \ + python3 -m pip uninstall -y sccache || true; \ + rm -f "$(which sccache)" || true; \ + fi # Install UV RUN curl -LsSf https://astral.sh/uv/install.sh | env UV_INSTALL_DIR="/usr/local/bin" sh @@ -28,6 +41,39 @@ ENV UV_INDEX_STRATEGY="unsafe-best-match" # Use copy mode to avoid hardlink failures with Docker 
cache mounts ENV UV_LINK_MODE=copy +# Install sccache if USE_SCCACHE is enabled (for release builds) +ARG USE_SCCACHE +ARG SCCACHE_DOWNLOAD_URL +ARG SCCACHE_ENDPOINT +ARG SCCACHE_BUCKET_NAME +ARG SCCACHE_REGION_NAME +ARG SCCACHE_S3_NO_CREDENTIALS +RUN if [ "$USE_SCCACHE" = "1" ]; then \ + if command -v sccache >/dev/null 2>&1; then \ + echo "sccache already installed, skipping installation"; \ + sccache --version; \ + else \ + echo "Installing sccache..." \ + && SCCACHE_ARCH="x86_64" \ + && SCCACHE_VERSION="v0.8.1" \ + && SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \ + && curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \ + && tar -xzf /tmp/sccache.tar.gz -C /tmp \ + && mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \ + && chmod +x /usr/bin/sccache \ + && rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \ + && sccache --version; \ + fi; \ + fi + +# Set sccache environment variables only when USE_SCCACHE=1 +# This prevents S3 config from leaking into images when sccache is not used +ARG USE_SCCACHE +ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}} +ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}} +ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}} +ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0} + ARG COMMON_WORKDIR WORKDIR ${COMMON_WORKDIR} @@ -51,7 +97,7 @@ FROM fetch_vllm_${REMOTE_VLLM} AS fetch_vllm # ----------------------- # vLLM build stages FROM fetch_vllm AS build_vllm -# Build vLLM +# Build vLLM (setup.py auto-detects sccache in PATH) RUN cd vllm \ && python3 -m pip install -r requirements/rocm.txt \ && python3 setup.py clean --all \ @@ -67,6 +113,178 @@ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/ COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/.buildkite 
/.buildkite COPY --from=build_vllm ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1 +# RIXL/UCX build stages +FROM base AS build_rixl +ARG RIXL_BRANCH="f33a5599" +ARG RIXL_REPO="https://github.com/ROCm/RIXL.git" +ARG UCX_BRANCH="da3fac2a" +ARG UCX_REPO="https://github.com/ROCm/ucx.git" +ENV ROCM_PATH=/opt/rocm +ENV UCX_HOME=/usr/local/ucx +ENV RIXL_HOME=/usr/local/rixl +ENV RIXL_BENCH_HOME=/usr/local/rixl_bench + +# RIXL build system dependences and RDMA support +RUN apt-get -y update && apt-get -y install autoconf libtool pkg-config \ + libgrpc-dev \ + libgrpc++-dev \ + libprotobuf-dev \ + protobuf-compiler-grpc \ + libcpprest-dev \ + libaio-dev \ + librdmacm1 \ + librdmacm-dev \ + libibverbs1 \ + libibverbs-dev \ + ibverbs-utils \ + rdmacm-utils \ + ibverbs-providers \ + && rm -rf /var/lib/apt/lists/* + +RUN uv pip install --system meson auditwheel patchelf tomlkit + +RUN cd /usr/local/src && \ + git clone ${UCX_REPO} && \ + cd ucx && \ + git checkout ${UCX_BRANCH} && \ + ./autogen.sh && \ + mkdir build && cd build && \ + ../configure \ + --prefix=/usr/local/ucx \ + --enable-shared \ + --disable-static \ + --disable-doxygen-doc \ + --enable-optimizations \ + --enable-devel-headers \ + --with-rocm=/opt/rocm \ + --with-verbs \ + --with-dm \ + --enable-mt && \ + make -j && \ + make install + +ENV PATH=/usr/local/ucx/bin:$PATH +ENV LD_LIBRARY_PATH=${UCX_HOME}/lib:${LD_LIBRARY_PATH} + +RUN git clone ${RIXL_REPO} /opt/rixl && \ + cd /opt/rixl && \ + git checkout ${RIXL_BRANCH} && \ + meson setup build --prefix=${RIXL_HOME} \ + -Ducx_path=${UCX_HOME} \ + -Drocm_path=${ROCM_PATH} && \ + cd build && \ + ninja && \ + ninja install + +# Generate RIXL wheel +RUN cd /opt/rixl && mkdir -p /app/install && \ + ./contrib/build-wheel.sh \ + --output-dir /app/install \ + --rocm-dir ${ROCM_PATH} \ + --ucx-plugins-dir ${UCX_HOME}/lib/ucx \ + --nixl-plugins-dir ${RIXL_HOME}/lib/x86_64-linux-gnu/plugins + + +# ----------------------- +# vLLM wheel release build stage (for building distributable 
wheels) +# This stage pins dependencies to custom ROCm wheel versions and handles version detection +FROM fetch_vllm AS build_vllm_wheel_release + +ARG COMMON_WORKDIR + +# Create /install directory for custom wheels +RUN mkdir -p /install + +# Copy custom ROCm wheels from docker/context if they exist +# COPY ensures Docker cache is invalidated when wheels change +# .keep file ensures directory always exists for COPY to work +COPY docker/context/base-wheels/ /tmp/base-wheels/ +# This is how we know if we are building for a wheel release or not. +# If there are not wheels found there, we are not building for a wheel release. +# So we exit with an error. To skip this stage. +RUN if [ -n "$(ls /tmp/base-wheels/*.whl 2>/dev/null)" ]; then \ + echo "Found custom wheels - copying to /install"; \ + cp /tmp/base-wheels/*.whl /install/ && \ + echo "Copied custom wheels:"; \ + ls -lh /install/; \ + else \ + echo "ERROR: No custom wheels found in docker/context/base-wheels/"; \ + echo "Wheel releases require pre-built ROCm wheels."; \ + exit 1; \ + fi + +# GIT_REPO_CHECK: Verify repo is clean and tags are available (for release builds) +# This matches CUDA's Dockerfile behavior for proper version detection via setuptools_scm +ARG GIT_REPO_CHECK=0 +RUN if [ "$GIT_REPO_CHECK" != "0" ]; then \ + echo "Running repository checks..."; \ + cd vllm && bash tools/check_repo.sh; \ + fi + +# Extract version from git BEFORE any modifications (pin_rocm_dependencies.py modifies requirements/rocm.txt) +# This ensures setuptools_scm sees clean repo state for version detection +RUN --mount=type=bind,source=.git,target=vllm/.git \ + cd vllm \ + && pip install setuptools_scm \ + && VLLM_VERSION=$(python3 -c "import setuptools_scm; print(setuptools_scm.get_version())") \ + && echo "Detected vLLM version: ${VLLM_VERSION}" \ + && echo "${VLLM_VERSION}" > /tmp/vllm_version.txt + +# Fail if git-based package dependencies are found in requirements files +# (uv doesn't handle git+ URLs well, and 
packages should be distributed on PyPI) +# Extra notes: pip install is able to handle git+ URLs, but uv doesn't. +RUN echo "Checking for git-based packages in requirements files..." \ + && echo "Checking common.txt for git-based packages:" \ + && if grep -q 'git+' ${COMMON_WORKDIR}/vllm/requirements/common.txt; then \ + echo "ERROR: Git-based packages found in common.txt:"; \ + grep 'git+' ${COMMON_WORKDIR}/vllm/requirements/common.txt; \ + echo "Please publish these packages to PyPI instead of using git dependencies."; \ + exit 1; \ + else \ + echo " ✓ No git-based packages found in common.txt"; \ + fi \ + && echo "Checking rocm.txt for git-based packages:" \ + && if grep -q 'git+' ${COMMON_WORKDIR}/vllm/requirements/rocm.txt; then \ + echo "ERROR: Git-based packages found in rocm.txt:"; \ + grep 'git+' ${COMMON_WORKDIR}/vllm/requirements/rocm.txt; \ + echo "Please publish these packages to PyPI instead of using git dependencies."; \ + exit 1; \ + else \ + echo " ✓ No git-based packages found in rocm.txt"; \ + fi \ + && echo "All requirements files are clean - no git-based packages found" + +# Pin vLLM dependencies to exact versions of custom ROCm wheels +# This ensures 'pip install vllm' automatically installs correct torch/triton/torchvision/amdsmi +COPY tools/vllm-rocm/pin_rocm_dependencies.py /tmp/pin_rocm_dependencies.py +RUN echo "Pinning vLLM dependencies to custom wheel versions..." 
\ + && python3 /tmp/pin_rocm_dependencies.py /install ${COMMON_WORKDIR}/vllm/requirements/rocm.txt + +# Install dependencies using custom wheels from /install +RUN cd vllm \ + && echo "Building vLLM with custom wheels from /install" \ + && python3 -m pip install --find-links /install -r requirements/rocm.txt \ + && python3 setup.py clean --all + +# Build wheel using pre-extracted version to avoid dirty state from modified requirements/rocm.txt +# (setup.py auto-detects sccache in PATH) +RUN --mount=type=bind,source=.git,target=vllm/.git \ + cd vllm \ + && export SETUPTOOLS_SCM_PRETEND_VERSION=$(cat /tmp/vllm_version.txt) \ + && echo "Building wheel with version: ${SETUPTOOLS_SCM_PRETEND_VERSION}" \ + && python3 setup.py bdist_wheel --dist-dir=dist + +FROM scratch AS export_vllm_wheel_release +ARG COMMON_WORKDIR +COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/dist/*.whl / +COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/requirements /requirements +COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/benchmarks /benchmarks +COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/tests /tests +COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/examples /examples +COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/docker/Dockerfile.rocm /docker/ +COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/.buildkite /.buildkite +COPY --from=build_vllm_wheel_release ${COMMON_WORKDIR}/vllm/vllm/v1 /vllm_v1 + # ----------------------- # Test vLLM image FROM base AS test diff --git a/docker/Dockerfile.rocm_base b/docker/Dockerfile.rocm_base index b1b244db4..44eda4337 100644 --- a/docker/Dockerfile.rocm_base +++ b/docker/Dockerfile.rocm_base @@ -14,16 +14,13 @@ ARG AITER_REPO="https://github.com/ROCm/aiter.git" ARG MORI_BRANCH="2d02c6a9" ARG MORI_REPO="https://github.com/ROCm/mori.git" -#TODO: When patch has been upstreamed, switch to the main repo/branch -# ARG RIXL_BRANCH="" -# ARG 
RIXL_REPO="https://github.com/ROCm/RIXL.git" -ARG RIXL_BRANCH="50d63d94" -ARG RIXL_REPO="https://github.com/vcave/RIXL.git" -# Needed by RIXL -ARG ETCD_BRANCH="7c6e714f" -ARG ETCD_REPO="https://github.com/etcd-cpp-apiv3/etcd-cpp-apiv3.git" -ARG UCX_BRANCH="da3fac2a" -ARG UCX_REPO="https://github.com/ROCm/ucx.git" +# Sccache configuration (only used in release pipeline) +ARG USE_SCCACHE +ARG SCCACHE_DOWNLOAD_URL +ARG SCCACHE_ENDPOINT +ARG SCCACHE_BUCKET_NAME=vllm-build-sccache +ARG SCCACHE_REGION_NAME=us-west-2 +ARG SCCACHE_S3_NO_CREDENTIALS=0 FROM ${BASE_IMAGE} AS base @@ -64,6 +61,49 @@ RUN apt-get update -y \ RUN pip install -U packaging 'cmake<4' ninja wheel 'setuptools<80' pybind11 Cython RUN apt-get update && apt-get install -y libjpeg-dev libsox-dev libsox-fmt-all sox && rm -rf /var/lib/apt/lists/* +# Install sccache if USE_SCCACHE is enabled (for release builds) +ARG USE_SCCACHE +ARG SCCACHE_DOWNLOAD_URL +ARG SCCACHE_ENDPOINT +ARG SCCACHE_BUCKET_NAME +ARG SCCACHE_REGION_NAME +ARG SCCACHE_S3_NO_CREDENTIALS +RUN if [ "$USE_SCCACHE" = "1" ]; then \ + echo "Installing sccache..." 
\ + && SCCACHE_ARCH="x86_64" \ + && SCCACHE_VERSION="v0.8.1" \ + && SCCACHE_DL_URL="${SCCACHE_DOWNLOAD_URL:-https://github.com/mozilla/sccache/releases/download/${SCCACHE_VERSION}/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl.tar.gz}" \ + && curl -L -o /tmp/sccache.tar.gz ${SCCACHE_DL_URL} \ + && tar -xzf /tmp/sccache.tar.gz -C /tmp \ + && mv /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl/sccache /usr/bin/sccache \ + && chmod +x /usr/bin/sccache \ + && rm -rf /tmp/sccache.tar.gz /tmp/sccache-${SCCACHE_VERSION}-${SCCACHE_ARCH}-unknown-linux-musl \ + && sccache --version; \ + fi + +# Setup sccache for HIP compilation via HIP_CLANG_PATH +# This creates wrapper scripts in a separate directory and points HIP to use them +# This avoids modifying the original ROCm binaries which can break detection +# NOTE: HIP_CLANG_PATH is NOT set as ENV to avoid affecting downstream images (Dockerfile.rocm) +# Instead, each build stage should export HIP_CLANG_PATH=/opt/sccache-wrappers if USE_SCCACHE=1 +RUN if [ "$USE_SCCACHE" = "1" ]; then \ + echo "Setting up sccache wrappers for HIP compilation..." 
\ + && mkdir -p /opt/sccache-wrappers \ + && printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/clang++ "$@"\n' > /opt/sccache-wrappers/clang++ \ + && chmod +x /opt/sccache-wrappers/clang++ \ + && printf '#!/bin/bash\nexec sccache /opt/rocm/lib/llvm/bin/clang "$@"\n' > /opt/sccache-wrappers/clang \ + && chmod +x /opt/sccache-wrappers/clang \ + && echo "sccache wrappers created in /opt/sccache-wrappers"; \ + fi + +# Set sccache environment variables only when USE_SCCACHE=1 +# This prevents S3 config from leaking into images when sccache is not used +ARG USE_SCCACHE +ENV SCCACHE_BUCKET=${USE_SCCACHE:+${SCCACHE_BUCKET_NAME}} +ENV SCCACHE_REGION=${USE_SCCACHE:+${SCCACHE_REGION_NAME}} +ENV SCCACHE_S3_NO_CREDENTIALS=${USE_SCCACHE:+${SCCACHE_S3_NO_CREDENTIALS}} +ENV SCCACHE_IDLE_TIMEOUT=${USE_SCCACHE:+0} + ### ### Triton Build @@ -100,22 +140,42 @@ ARG PYTORCH_AUDIO_BRANCH ARG PYTORCH_REPO ARG PYTORCH_VISION_REPO ARG PYTORCH_AUDIO_REPO +ARG USE_SCCACHE RUN git clone ${PYTORCH_REPO} pytorch RUN cd pytorch && git checkout ${PYTORCH_BRANCH} \ && pip install -r requirements.txt && git submodule update --init --recursive \ && python3 tools/amd_build/build_amd.py \ + && if [ "$USE_SCCACHE" = "1" ]; then \ + export HIP_CLANG_PATH=/opt/sccache-wrappers \ + && export CMAKE_C_COMPILER_LAUNCHER=sccache \ + && export CMAKE_CXX_COMPILER_LAUNCHER=sccache \ + && sccache --show-stats; \ + fi \ && CMAKE_PREFIX_PATH=$(python3 -c 'import sys; print(sys.prefix)') python3 setup.py bdist_wheel --dist-dir=dist \ + && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \ && pip install dist/*.whl RUN git clone ${PYTORCH_VISION_REPO} vision RUN cd vision && git checkout ${PYTORCH_VISION_BRANCH} \ + && if [ "$USE_SCCACHE" = "1" ]; then \ + export HIP_CLANG_PATH=/opt/sccache-wrappers \ + && export CMAKE_C_COMPILER_LAUNCHER=sccache \ + && export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \ + fi \ && python3 setup.py bdist_wheel --dist-dir=dist \ + && if [ "$USE_SCCACHE" = "1" ]; then sccache 
--show-stats; fi \ && pip install dist/*.whl RUN git clone ${PYTORCH_AUDIO_REPO} audio RUN cd audio && git checkout ${PYTORCH_AUDIO_BRANCH} \ && git submodule update --init --recursive \ && pip install -r requirements.txt \ + && if [ "$USE_SCCACHE" = "1" ]; then \ + export HIP_CLANG_PATH=/opt/sccache-wrappers \ + && export CMAKE_C_COMPILER_LAUNCHER=sccache \ + && export CMAKE_CXX_COMPILER_LAUNCHER=sccache; \ + fi \ && python3 setup.py bdist_wheel --dist-dir=dist \ + && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \ && pip install dist/*.whl RUN mkdir -p /app/install && cp /app/pytorch/dist/*.whl /app/install \ && cp /app/vision/dist/*.whl /app/install \ @@ -230,13 +290,19 @@ RUN cd /opt/rixl && mkdir -p /app/install && \ FROM base AS build_fa ARG FA_BRANCH ARG FA_REPO +ARG USE_SCCACHE RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ pip install /install/*.whl RUN git clone ${FA_REPO} RUN cd flash-attention \ && git checkout ${FA_BRANCH} \ && git submodule update --init \ - && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist + && if [ "$USE_SCCACHE" = "1" ]; then \ + export HIP_CLANG_PATH=/opt/sccache-wrappers \ + && sccache --show-stats; \ + fi \ + && GPU_ARCHS=$(echo ${PYTORCH_ROCM_ARCH} | sed -e 's/;gfx1[0-9]\{3\}//g') python3 setup.py bdist_wheel --dist-dir=dist \ + && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install @@ -246,6 +312,7 @@ RUN mkdir -p /app/install && cp /app/flash-attention/dist/*.whl /app/install FROM base AS build_aiter ARG AITER_BRANCH ARG AITER_REPO +ARG USE_SCCACHE RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ pip install /install/*.whl RUN git clone --recursive ${AITER_REPO} @@ -253,13 +320,37 @@ RUN cd aiter \ && git checkout ${AITER_BRANCH} \ && git submodule update --init --recursive \ && pip install -r 
requirements.txt -RUN pip install pyyaml && cd aiter && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist && ls /app/aiter/dist/*.whl +RUN pip install pyyaml && cd aiter \ + && if [ "$USE_SCCACHE" = "1" ]; then \ + export HIP_CLANG_PATH=/opt/sccache-wrappers \ + && sccache --show-stats; \ + fi \ + && PREBUILD_KERNELS=1 GPU_ARCHS=${AITER_ROCM_ARCH} python3 setup.py bdist_wheel --dist-dir=dist \ + && if [ "$USE_SCCACHE" = "1" ]; then sccache --show-stats; fi \ + && ls /app/aiter/dist/*.whl RUN mkdir -p /app/install && cp /app/aiter/dist/*.whl /app/install ### ### Final Build ### + +# Wheel release stage - +# only includes dependencies used by wheel release pipeline +FROM base AS debs_wheel_release +RUN mkdir /app/debs +RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ + cp /install/*.whl /app/debs +RUN --mount=type=bind,from=build_fa,src=/app/install/,target=/install \ + cp /install/*.whl /app/debs +RUN --mount=type=bind,from=build_amdsmi,src=/app/install/,target=/install \ + cp /install/*.whl /app/debs +RUN --mount=type=bind,from=build_pytorch,src=/app/install/,target=/install \ + cp /install/*.whl /app/debs +RUN --mount=type=bind,from=build_aiter,src=/app/install/,target=/install \ + cp /install/*.whl /app/debs + +# Full debs stage - includes Mori (used by Docker releases) FROM base AS debs RUN mkdir /app/debs RUN --mount=type=bind,from=build_triton,src=/app/install/,target=/install \ diff --git a/requirements/rocm-test.txt b/requirements/rocm-test.txt index feaf21f3d..be8622065 100644 --- a/requirements/rocm-test.txt +++ b/requirements/rocm-test.txt @@ -80,6 +80,8 @@ num2words==0.5.14 pqdm==0.2.0 # via lm-eval +# Required for fastsafetensors test +fastsafetensors @ git+https://github.com/foundation-model-stack/fastsafetensors.git@d6f998a03432b2452f8de2bb5cefb5af9795d459 # Required for suffix decoding test arctic-inference == 0.1.1 # Required for Nemotron test diff --git a/requirements/rocm.txt 
#!/usr/bin/env python3
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
"""
Pin vLLM dependencies to exact versions of custom ROCm wheels.

This script modifies vLLM's requirements files to replace version constraints
with exact versions of custom-built ROCm wheels (torch, triton, torchvision, amdsmi).

This ensures that 'pip install vllm' automatically installs the correct custom wheels
instead of allowing pip to download different versions from PyPI.
"""

import re
import sys
from pathlib import Path


def extract_version_from_wheel(wheel_name: str) -> str:
    """
    Extract the version component from a wheel filename.

    Wheel filenames follow PEP 427:
        {distribution}-{version}(-{build tag})?-{python}-{abi}-{platform}.whl
    Dashes inside a distribution name are escaped to underscores, so the
    version is always the second dash-separated field.

    Example:
        torch-2.9.0a0+git1c57644-cp312-cp312-linux_x86_64.whl -> 2.9.0a0+git1c57644
        triton-3.4.0-cp312-cp312-linux_x86_64.whl -> 3.4.0

    Raises:
        ValueError: if the filename has fewer than the 5 mandatory fields.
    """
    parts = wheel_name.replace(".whl", "").split("-")

    # A valid wheel name has at least 5 fields (the build tag is optional).
    if len(parts) < 5:
        raise ValueError(f"Invalid wheel filename format: {wheel_name}")

    # Version is the second part
    return parts[1]


def get_custom_wheel_versions(install_dir: str) -> dict[str, str]:
    """
    Scan *install_dir* for custom-built wheels and extract their versions.

    Args:
        install_dir: Directory containing the pre-built ROCm wheels
            (e.g. /install inside the Docker build).

    Returns:
        Dict mapping package names to exact versions, in the fixed order
        defined by ``package_mapping`` below (torch first, etc.).

    Exits with status 1 if the directory does not exist.
    """
    install_path = Path(install_dir)
    if not install_path.exists():
        print(f"ERROR: Install directory not found: {install_dir}", file=sys.stderr)
        sys.exit(1)

    versions = {}

    # Map wheel filename prefixes to canonical package names.
    # IMPORTANT: the trailing dash avoids substring matches
    # (e.g. 'torch' alone would also match 'torchvision').
    # ORDER MATTERS: This order is preserved when pinning dependencies
    # in requirements files.
    package_mapping = [
        ("torch-", "torch"),  # Match torch- (not torchvision)
        ("triton-", "triton"),  # Match triton- (not triton_kernels)
        ("triton_kernels-", "triton-kernels"),  # Match triton_kernels-
        ("torchvision-", "torchvision"),  # Match torchvision-
        ("torchaudio-", "torchaudio"),  # Match torchaudio-
        ("amdsmi-", "amdsmi"),  # Match amdsmi-
        ("flash_attn-", "flash-attn"),  # Match flash_attn-
        ("aiter-", "aiter"),  # Match aiter-
    ]

    for wheel_file in install_path.glob("*.whl"):
        wheel_name = wheel_file.name

        for prefix, package_name in package_mapping:
            if wheel_name.startswith(prefix):
                try:
                    version = extract_version_from_wheel(wheel_name)
                    versions[package_name] = version
                    print(f"Found {package_name}=={version}", file=sys.stderr)
                except Exception as e:
                    print(
                        f"WARNING: Could not extract version from {wheel_name}: {e}",
                        file=sys.stderr,
                    )
                break

    # Return versions in the order defined by package_mapping
    ordered_versions = {}
    for _, package_name in package_mapping:
        if package_name in versions:
            ordered_versions[package_name] = versions[package_name]
    return ordered_versions


def pin_dependencies_in_requirements(requirements_path: str, versions: dict[str, str]):
    """
    Insert custom wheel pins at the TOP of requirements file.

    This ensures that when setup.py processes the file line-by-line,
    custom wheels (torch, triton, etc.) are encountered FIRST, before
    any `-r common.txt` includes that might pull in other dependencies.

    Any pre-existing entry for a pinned package (version specifier, extras,
    direct reference, or bare name) is removed so it cannot conflict with
    the inserted pin. A ``.bak`` backup of the original file is written
    next to it.

    Creates:
        # Custom ROCm wheel pins (auto-generated)
        torch==2.9.0a0+git1c57644
        triton==3.4.0
        torchvision==0.23.0a0+824e8c8
        amdsmi==26.1.0+5df6c765

        -r common.txt
        ... rest of file ...
    """
    requirements_file = Path(requirements_path)

    if not requirements_file.exists():
        print(
            f"ERROR: Requirements file not found: {requirements_path}", file=sys.stderr
        )
        sys.exit(1)

    # Backup original file
    backup_file = requirements_file.with_suffix(requirements_file.suffix + ".bak")
    with open(requirements_file) as f:
        original_lines = f.readlines()

    # Write backup
    with open(backup_file, "w") as f:
        f.writelines(original_lines)

    # Build header with pinned custom wheels
    header_lines = [
        "# Custom ROCm wheel pins (auto-generated by pin_rocm_dependencies.py)\n",
        "# These must come FIRST to ensure correct dependency resolution\n",
    ]

    for package_name, exact_version in versions.items():
        header_lines.append(f"{package_name}=={exact_version}\n")

    header_lines.append("\n")  # Blank line separator

    # Filter out any existing entries for custom packages from original file
    filtered_lines = []
    removed_packages = []

    for line in original_lines:
        stripped = line.strip()
        should_keep = True

        # Check if this line is for one of our custom packages
        # (skip blanks, comments, and pip options such as '-r common.txt')
        if stripped and not stripped.startswith("#") and not stripped.startswith("-"):
            for package_name in versions:
                # Handle both hyphen and underscore variations
                pattern_name = package_name.replace("-", "[-_]")
                # Match any existing requirement form for this package:
                # a bare name, extras ("pkg[extra]…"), any PEP 440 specifier
                # (==, >=, <=, ~=, !=, ===), a direct reference ("pkg @ url"),
                # or an environment marker ("pkg ; marker"). Anchored so that
                # e.g. 'torch' never matches 'torchvision'.
                pattern = rf"^{pattern_name}\s*(?:\[[^\]]*\])?\s*(?:$|[=<>!~@;])"

                if re.match(pattern, stripped, re.IGNORECASE):
                    removed_packages.append(f"{package_name}: {stripped}")
                    should_keep = False
                    break

        if should_keep:
            filtered_lines.append(line)

    # Combine: header + filtered original content
    final_lines = header_lines + filtered_lines

    # Write modified content
    with open(requirements_file, "w") as f:
        f.writelines(final_lines)

    # Print summary
    print("\n✓ Inserted custom wheel pins at TOP of requirements:", file=sys.stderr)
    for package_name, exact_version in versions.items():
        print(f"  - {package_name}=={exact_version}", file=sys.stderr)

    if removed_packages:
        print("\n✓ Removed old package entries:", file=sys.stderr)
        for pkg in removed_packages:
            print(f"  - {pkg}", file=sys.stderr)

    print(f"\n✓ Patched requirements file: {requirements_path}", file=sys.stderr)
    print(f"  Backup saved: {backup_file}", file=sys.stderr)


def main():
    """CLI entry point: pin <requirements_file> to the wheels in <install_dir>."""
    if len(sys.argv) != 3:
        print(
            f"Usage: {sys.argv[0]} <install_dir> <requirements_file>", file=sys.stderr
        )
        print(
            f"Example: {sys.argv[0]} /install /app/vllm/requirements/rocm.txt",
            file=sys.stderr,
        )
        sys.exit(1)

    install_dir = sys.argv[1]
    requirements_path = sys.argv[2]

    print("=" * 70, file=sys.stderr)
    print("Pinning vLLM dependencies to custom ROCm wheel versions", file=sys.stderr)
    print("=" * 70, file=sys.stderr)

    # Get versions from custom wheels
    print(f"\nScanning {install_dir} for custom wheels...", file=sys.stderr)
    versions = get_custom_wheel_versions(install_dir)

    if not versions:
        print("\nERROR: No custom wheels found in /install!", file=sys.stderr)
        sys.exit(1)

    # Pin dependencies in requirements file
    print(f"\nPatching {requirements_path}...", file=sys.stderr)
    pin_dependencies_in_requirements(requirements_path, versions)

    print("\n" + "=" * 70, file=sys.stderr)
    print("✓ Dependency pinning complete!", file=sys.stderr)
    print("=" * 70, file=sys.stderr)

    sys.exit(0)


if __name__ == "__main__":
    main()